diff --git a/.gitignore b/.gitignore index 65f111587..ac3b931eb 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,5 @@ dump/ lib/ out/ /atlassian-ide-plugin.xml +maven-metadata-local.xml +dependency-reduced-pom.xml diff --git a/ant-bridge.sh b/ant-bridge.sh new file mode 100755 index 000000000..9f4713d7c --- /dev/null +++ b/ant-bridge.sh @@ -0,0 +1,173 @@ +#!/bin/sh + +mvn_args="verify" +mvn_properties= +mvn_clean= +unknown_args= +property_regex='-D(.*)=(.*)' +unit_test_regex='.*UnitTest' +post_script= +run_type="run" + +for arg in "${@}" ; do + if [[ "${arg}" == "dry" ]] ; then + run_type="dry" + + elif [[ "${arg}" == "clean" ]] ; then + mvn_clean="clean" + mvn_args= + + elif [[ "${arg}" =~ ${property_regex} ]] ; then + property_name=${BASH_REMATCH[1]} + property_value=${BASH_REMATCH[2]} + + if [[ "${property_name}" == "single" ]] ; then + test_property="test" + test_disabled="it.test" + if [[ ! "${property_value}" =~ ${unit_test_regex} ]] ; then + test_property="it.test" + test_disabled="test" + fi + + mvn_properties="${mvn_properties} -D${test_disabled}=disabled -D${test_property}=${property_value}" + + elif [[ "${property_name}" == "test.debug.port" ]] ; then + mvn_properties="${mvn_properties} -Dmaven.surefire.debug=\"-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=${property_value}\"" + mvn_properties="${mvn_properties} -Dmaven.failsafe.debug=\"-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=${property_value}\"" + + elif [[ "${property_name}" == "test.default.maxmemory" ]] ; then + mvn_properties="${mvn_properties} -Dtest.maxmemory=${property_value}" + + else + unknown_args="${unknown_args} \"${arg}\"" + + fi + + else + if [[ "${arg}" != "dist" && "${mvn_args}" != "" && "${mvn_args}" != "verify" ]] ; then + echo "Sorry, this script does not currently support mixing targets." 
>&2 + exit 1 + + elif [[ "${arg}" == "dist" ]] ; then + mvn_args="verify" + + elif [[ "${arg}" == "gatk" ]] ; then + mvn_args="verify '-P!queue'" + + elif [[ "${arg}" == "test.compile" ]] ; then + mvn_args="test-compile" + + elif [[ "${arg}" == "gatkdocs" ]] ; then + local_repo="sitetemprepo" + mvn_args="install -Dmaven.repo.local=${local_repo} -Ddisable.queue && mvn site -Dmaven.repo.local=${local_repo} -Ddisable.queue" + + elif [[ "${arg}" == "package.gatk.full" ]] ; then + mvn_args="package '-P!private,!queue'" + + elif [[ "${arg}" == "package.gatk.all" ]] ; then + mvn_args="package '-P!queue'" + + elif [[ "${arg}" == "package.queue.full" ]] ; then + mvn_args="package '-P!private'" + + elif [[ "${arg}" == "package.queue.all" ]] ; then + mvn_args="package" + +# elif [[ "${arg}" == "release.gatk.full" ]] ; then +# mvn_args="package '-P!private,!queue'" +# post_script=" && private/src/main/scripts/shell/copy_release.sh public/gatk-package/target/GenomeAnalysisTK-*.tar.bz2" + +# elif [[ "${arg}" == "release.queue.full" ]] ; then +# mvn_args="package '-P!private'" +# post_script=" && private/src/main/scripts/shell/copy_release.sh public/queue-package/target/Queue-*.tar.bz2" + + elif [[ "${arg}" == "build-picard-private" ]] ; then + mvn_args="mvn install -f private/picard-maven/pom.xml" + + # TODO: clover support + # see ant and maven docs for clover: + # https://confluence.atlassian.com/display/CLOVER/1.+QuickStart+Guide + # https://confluence.atlassian.com/display/CLOVER/Clover-for-Maven+2+and+3+User%27s+Guide + # + #elif [[ "${arg}" == "clover.report" ]] ; then + # mvn_args=... + # + #elif [[ "${arg}" == "with.clover" ]] ; then + # mvn_args=... + + # TODO: This runs *all* commit tests, including the few on Queue. 
+ elif [[ "${arg}" == "gatkfull.binary.release.tests" ]] ; then + local_repo="sitetemprepo" + mvn_args="install -Dmaven.repo.local=${local_repo} && mvn verify" + mvn_args="${mvn_args} -Dmaven.repo.local=${local_repo}" + mvn_args="${mvn_args} -Dsting.packagetests.enabled=true" + mvn_args="${mvn_args} -Dsting.packagecommittests.skipped=false" + + # TODO: This runs only the pipeline tests (full, non-dry run), but not the commit tests for Queue. + elif [[ "${arg}" == "queuefull.binary.release.tests" ]] ; then + local_repo="sitetemprepo" + mvn_args="install -Dmaven.repo.local=${local_repo} && mvn verify" + mvn_args="${mvn_args} -Dmaven.repo.local=${local_repo}" + mvn_args="${mvn_args} -Dsting.packagetests.enabled=true" + mvn_args="${mvn_args} -Dsting.packagepipelinetests.skipped=false" + mvn_args="${mvn_args} -Dsting.pipelinetests.run=true" + + elif [[ "${arg}" == "committests" ]] ; then + mvn_args="verify -Dsting.committests.skipped=false" + + elif [[ "${arg}" == "test" ]] ; then + mvn_args="test -Dsting.unittests.skipped=false" + + elif [[ "${arg}" == "unittest" ]] ; then + mvn_args="test -Dsting.unittests.skipped=false" + + elif [[ "${arg}" == "integrationtest" ]] ; then + mvn_args="verify -Dsting.integrationtests.skipped=false" + + elif [[ "${arg}" == "largescaletest" ]] ; then + mvn_args="verify -Dsting.largescaletests.skipped=false" + + elif [[ "${arg}" == "knowledgebasetest" ]] ; then + mvn_args="verify -Dsting.knowledgebasetests.skipped=false" + + elif [[ "${arg}" == "pipelinetest" ]] ; then + mvn_args="verify -Dsting.pipelinetests.skipped=false" + + elif [[ "${arg}" == "pipelinetestrun" ]] ; then + mvn_args="verify -Dsting.pipelinetests.skipped=false -Dsting.pipelinetests.run=true" + + elif [[ "${arg}" == "fasttest" ]] ; then + mvn_args="verify -Dsting.committests.skipped=false -pl private/gatk-private -am -Dresource.bundle.skip=true" + + else + unknown_args="${unknown_args} \"${arg}\"" + + fi + + fi + +done + +mvn_cmd= +if [[ "${mvn_clean}" != "" ]] ; then + 
if [[ "${mvn_args}" != "" ]] ; then + mvn_cmd="mvn ${mvn_clean} && mvn ${mvn_args}" + else + mvn_cmd="mvn ${mvn_clean}" + fi +else + mvn_cmd="mvn ${mvn_args}" +fi + +if [[ "${unknown_args}" != "" ]] ; then + echo "Unrecognized arguments:${unknown_args}" >&2 + +else + echo "Equivalent maven command" + echo "${mvn_cmd}${mvn_properties}${post_script}" + + if [[ "${run_type}" != "dry" ]] ; then + sh -c "${mvn_cmd}${mvn_properties}${post_script}" + fi + +fi diff --git a/build.xml b/build.xml deleted file mode 100644 index fd0801bfb..000000000 --- a/build.xml +++ /dev/null @@ -1,1518 +0,0 @@ - - - - - Compile and distribute the Sting toolkit - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Generating Queue GATK extensions... - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Building Scala... 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/ivy.xml b/ivy.xml deleted file mode 100644 index 2e45247ab..000000000 --- 
a/ivy.xml +++ /dev/null @@ -1,117 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/pom.xml b/pom.xml new file mode 100644 index 000000000..d899506b5 --- /dev/null +++ b/pom.xml @@ -0,0 +1,858 @@ + + + 4.0.0 + + + + + org.broadinstitute.sting + sting-root + 3.0 + public/sting-root + + + sting-aggregator + pom + Sting Aggregator + + + public + + + + + ${project.basedir} + StingText.properties + false + + -build-timestamp "${maven.build.timestamp}" + + + package + generate-resources + process-resources + process-test-resources + + + true + ${sting.packagecommittests.skipped} + ${sting.packagecommittests.skipped} + ${sting.packagecommittests.skipped} + true + true + + + true + ${sting.serialcommittests.skipped} + ${sting.serialcommittests.skipped} + ${sting.serialcommittests.skipped} + true + true + + + + + com.sun + tools + + + + + + + + + org.apache.maven.plugins + maven-clean-plugin + + + + + gatkdocs + + + ${basedir} + + javadoc.sh + options + packages + + + + + ${basedir} + + dependency-reduced-pom.xml + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + unpack-direct-dependencies + + unpack-dependencies + + none + + true + ${project.build.outputDirectory} + jar + system + + + + + + + org.apache.maven.plugins + maven-resources-plugin + + + default-resources + + resources + + ${sting.process-resources.phase} + + + default-testResources + + testResources + + ${sting.process-test-resources.phase} + + + copy-resource-bundle-log4j + + copy-resources + + none + + ${project.reporting.outputDirectory}/apidocs + + + ${sting.basedir}/sting-utils/src/main/config/org/broadinstitute/sting/utils/help + + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + extract-resource-bundle + + javadoc + + none + + + ${resource.bundle.skip} + 
org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet + + ${project.build.outputDirectory} + + ${project.groupId} + + gatk-framework + ${project.version} + + 2g + false + true + -build-timestamp "${maven.build.timestamp}" -absolute-version ${build.version} -out ${project.build.outputDirectory}/${resource.bundle.path} + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + none + + com.google.java.contract.core.apt.AnnotationProcessor + + + + + default-compile + none + + + default-testCompile + none + + + + compile-package-info + + compile + + compile + + + -Xpkginfo:always + + + **/package-info.java + + + + + + compile-java + + compile + + compile + + + + **/package-info.java + + + + + + testCompile-java + + testCompile + + test-compile + + + + + + org.scala-tools + maven-scala-plugin + + + + compile + testCompile + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + default-jar + ${sting.jar.phase} + + + test-jar + + test-jar + + ${sting.jar.phase} + + true + + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + sting-executable + + shade + + none + + true + + + org.broadinstitute.sting:gsalib:tar.gz:* + org.broadinstitute.sting:*:tar.bz2:example-resources + + + + + + ${app.main.class} + + + + ${resource.bundle.path} + + + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + example-resources + + single + + none + + + src/main/assembly/example-resources.xml + + + + + binary-dist + + single + + none + + + src/main/assembly/binary-dist.xml + + + + + + + + + com.pyx4j + maven-junction-plugin + + + link-public-testdata + + link + + none + + + + ${basedir}/public/testdata + ${sting.basedir}/public/gatk-framework/src/test/resources + + + + + + unlink-public-testdata + + unlink + + none + + + + ${basedir}/public/testdata + ${sting.basedir}/public/gatk-framework/src/test/resources + + + + + + link-private-testdata + + link + + none + + + + ${basedir}/private/testdata + 
${sting.basedir}/private/gatk-private/src/test/resources + + + + + + unlink-private-testdata + + unlink + + none + + + + ${basedir}/private/testdata + ${sting.basedir}/private/gatk-private/src/test/resources + + + + + + link-public-qscript + + link + + none + + + + ${basedir}/public/scala/qscript + ${sting.basedir}/public/queue-framework/src/main/qscripts + + + + + + unlink-public-qscript + + unlink + + none + + + + ${basedir}/public/scala/qscript + ${sting.basedir}/public/queue-framework/src/main/qscripts + + + + + + link-private-qscript + + link + + none + + + + ${basedir}/private/scala/qscript + ${sting.basedir}/private/queue-private/src/main/qscripts + + + + + + unlink-private-qscript + + unlink + + none + + + + ${basedir}/private/scala/qscript + ${sting.basedir}/private/queue-private/src/main/qscripts + + + + + + link-binary-jar + + link + + none + + + + ${sting.basedir}/target/${sting.binary-dist.name}.${project.packaging} + ${project.build.directory}/${project.build.finalName}.${project.packaging} + + + + + + link-git-release + + link + + none + + + + ${project.build.directory}/${sting.binary-dist.name}-${build.version}.tar.bz2 + ${project.build.directory}/${project.build.finalName}-binary-dist.tar.bz2 + + + + + + + + org.apache.maven.plugins + maven-invoker-plugin + + true + false + ${sting.basedir}/public/package-tests/pom.xml + true + true + ${sting.basedir}/${maven.repo.local} + + ${test} + ${it.test} + false + false + ${sting.packagetests.artifactId} + ${project.build.testOutputDirectory} + ${project.basedir} + ${sting.pipelinetests.run} + ${maven.surefire.debug} + ${maven.failsafe.debug} + + + + + + package-unittests + + run + + + + test + + ${project.build.directory}/invoker-reports/unit/${test} + ${sting.packageunittests.skipped} + + true + ${sting.packageunittests.skipped} + + + + + package-integrationtests + + integration-test + verify + + + + verify + + ${project.build.directory}/invoker-reports/integration/${it.test} + 
${sting.packageintegrationtests.skipped} + + true + ${sting.packageintegrationtests.skipped} + + + + + package-pipelinetests + + integration-test + verify + + + + verify + + ${project.build.directory}/invoker-reports/pipeline/${it.test} + ${sting.packagepipelinetests.skipped} + + true + ${sting.packagepipelinetests.skipped} + + + + + package-largescaletests + + integration-test + verify + + + + verify + + ${project.build.directory}/invoker-reports/largescale/${it.test} + ${sting.packagelargescaletests.skipped} + + true + ${sting.packagelargescaletests.skipped} + + + + + package-knowledgebasetests + + integration-test + verify + + + + verify + + ${project.build.directory}/invoker-reports/knowledgebase/${it.test} + ${sting.packageknowledgebasetests.skipped} + + true + ${sting.packageknowledgebasetests.skipped} + + + + + + + org.apache.maven.plugins + maven-install-plugin + 2.5 + + + install-package + + install-file + + none + + true + ${project.groupId} + ${project.artifactId} + ${project.version} + ${project.packaging} + ${project.build.directory}/${project.build.finalName}.${project.packaging} + + + + + + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + + + com.pyx4j + maven-junction-plugin + + + link-public-testdata + process-test-resources + + + unlink-public-testdata + clean + + + link-public-qscript + process-test-resources + + + unlink-public-qscript + clean + + + + + org.apache.maven.plugins + maven-clean-plugin + + + com.google.code.sortpom + maven-sortpom-plugin + + + package-tests + + sort + + verify + false + + public/package-tests/pom.xml + + + + + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.9.1 + + + + + + + generate-gatk-docs + + aggregate + + + false + + org.broadinstitute.sting.utils.help.GATKDoclet + + ${project.groupId} + gatk-package + ${project.version} + + false + true + private + -build-timestamp "${maven.build.timestamp}" -absolute-version ${build.version} ${gatkdocs.include.hidden} -settings-dir 
${sting.basedir}/settings/helpTemplates -destination-dir ${project.build.directory}/gatkdocs + + + + + + + + + + + protected + + + ${basedir}/protected/pom.xml + + + + protected + + + + + + private + + + ${basedir}/private/pom.xml + + + + private + + + + + + com.pyx4j + maven-junction-plugin + + + link-private-testdata + process-test-resources + + + unlink-private-testdata + clean + + + link-private-qscript + process-test-resources + + + unlink-private-qscript + clean + + + + + + + + + + packagetests-enabled + + + sting.packagetests.enabled + true + + + + true + true + none + none + none + none + + + + + diff --git a/protected/gatk-protected/pom.xml b/protected/gatk-protected/pom.xml new file mode 100644 index 000000000..26aabd187 --- /dev/null +++ b/protected/gatk-protected/pom.xml @@ -0,0 +1,139 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-aggregator + 3.0 + ../.. + + + gatk-protected + jar + GATK Protected + + + ${project.basedir}/../.. + gatk-package + + + + + ${project.groupId} + gatk-framework + ${project.version} + + + + net.sf.jgrapht + jgrapht + + + + gov.nist.math + jama + + + + it.unimi.dsi + fastutil + + + + ${project.groupId} + gatk-framework + ${project.version} + test-jar + test + + + + org.testng + testng + test + + + com.google.caliper + caliper + test + + + + + + + org.apache.maven.plugins + maven-resources-plugin + + + copy-resource-bundle-log4j + prepare-package + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + extract-resource-bundle + prepare-package + + + + + org.apache.maven.plugins + maven-invoker-plugin + + + package-unittests + + + package-integrationtests + + + package-largescaletests + + + package-knowledgebasetests + + + package-pipelinetests + + + + + + + + + private + + + ${basedir}/../../private/gatk-private/pom.xml + + + + + + + com.pyx4j + maven-junction-plugin + + + link-private-testdata + process-test-resources + + + unlink-private-testdata + clean + + + + + + + + + diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java new file mode 100644 index 000000000..29cee9e15 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java @@ -0,0 +1,114 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; +import org.broadinstitute.variant.vcf.VCFStandardHeaderLines; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Total (unfiltered) depth over all samples. + * + *

While the sample-level (FORMAT) DP field describes the total depth of reads that passed the caller's + * internal quality control metrics (like MAPQ > 17, for example), the INFO field DP represents the unfiltered depth + * over all samples. Note though that the DP is affected by downsampling (-dcov), so the max value one can obtain for + * N samples with -dcov D is N * D + *

+ */ +public class Coverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { + + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { + + int depth = 0; + if (stratifiedContexts != null) { + if ( stratifiedContexts.size() == 0 ) + return null; + + for ( Map.Entry sample : stratifiedContexts.entrySet() ) + depth += sample.getValue().getBasePileup().depthOfCoverage(); + } + else if (perReadAlleleLikelihoodMap != null) { + if ( perReadAlleleLikelihoodMap.size() == 0 ) + return null; + + for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) { + depth += maps.getLikelihoodReadMap().size(); + } + } + else + return null; + + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%d", depth)); + return map; + } + + public List getKeyNames() { return Arrays.asList(VCFConstants.DEPTH_KEY); } + + public List getDescriptions() { + return Arrays.asList(VCFStandardHeaderLines.getInfoLine(getKeyNames().get(0))); + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java new file mode 100644 index 000000000..52b09d251 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -0,0 +1,164 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; +import org.broadinstitute.variant.vcf.VCFStandardHeaderLines; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypeBuilder; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.*; + 
+ +/** + * The depth of coverage of each allele per sample + * + *

The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this + * sample at this site. While the sample-level (FORMAT) DP field describes the total depth of reads that passed the + * caller's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of + * REF and ALT fields) are the unfiltered counts of all reads that carried with them the + * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the + * power I have to determine the genotype of the sample at this site, while the AD tells me how many times + * I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering + * the reads. If, for example, I believe there really is an A/T polymorphism at a site, then I would like + * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would + * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that + * the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted. + * Because of this fact, the sum of AD may be different than the individual sample depth, especially when there are + * many non-informative reads.

+ * + *

Because the AD includes reads and bases that were filtered by the caller and in case of indels is based on a statistical computation, + * one should not base assumptions about the underlying genotype based on it; + * instead, the genotype likelihoods (PLs) are what determine the genotype calls.

+ * + */ +public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation { + + public void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { + if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) + return; + + if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty()) + annotateWithLikelihoods(alleleLikelihoodMap, vc, gb); + else if ( stratifiedContext != null && (vc.isSNP())) + annotateWithPileup(stratifiedContext, vc, gb); + } + + private void annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) { + + final HashMap alleleCounts = new HashMap<>(); + for ( final Allele allele : vc.getAlleles() ) + alleleCounts.put(allele.getBases()[0], 0); + + final ReadBackedPileup pileup = stratifiedContext.getBasePileup(); + for ( final PileupElement p : pileup ) { + if ( alleleCounts.containsKey(p.getBase()) ) + alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1); + } + + // we need to add counts in the correct order + final int[] counts = new int[alleleCounts.size()]; + counts[0] = alleleCounts.get(vc.getReference().getBases()[0]); + for (int i = 0; i < vc.getAlternateAlleles().size(); i++) + counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]); + + gb.AD(counts); + } + + private void annotateWithLikelihoods(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc, final GenotypeBuilder gb) { + final Set alleles = new HashSet<>(vc.getAlleles()); + + // make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext + if ( ! 
perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) ) + throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet()); + + final HashMap alleleCounts = new HashMap<>(); + for ( final Allele allele : vc.getAlleles() ) { alleleCounts.put(allele, 0); } + + for ( final Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); + if (! a.isInformative() ) continue; // read is non-informative + final GATKSAMRecord read = el.getKey(); + final int prevCount = alleleCounts.get(a.getMostLikelyAllele()); + alleleCounts.put(a.getMostLikelyAllele(), prevCount + 1); + } + + final int[] counts = new int[alleleCounts.size()]; + counts[0] = alleleCounts.get(vc.getReference()); + for (int i = 0; i < vc.getAlternateAlleles().size(); i++) + counts[i+1] = alleleCounts.get( vc.getAlternateAllele(i) ); + + gb.AD(counts); + } + + public List getKeyNames() { return Arrays.asList(VCFConstants.GENOTYPE_ALLELE_DEPTHS); } + + public List getDescriptions() { + return Arrays.asList(VCFStandardHeaderLines.getFormatLine(getKeyNames().get(0))); + } +} \ No newline at end of file diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java new file mode 100644 index 000000000..8e5ca83e0 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java @@ -0,0 +1,126 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypeBuilder; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; +import org.broadinstitute.variant.vcf.VCFStandardHeaderLines; + +import java.util.*; + + +/** + * The depth of coverage for informative reads for each sample. + * + * An informative read is defined as one from which the allele it carries can be easily distinguished. 
An example of a + * case where a read might be uninformative is where it only partially overlaps a short tandem repeat and it is not clear + * whether the read contains the reference allele or e.g. an extra repeat. + * The depth here is the sum of the informative reads at this site as determined by the Haplotype Caller; as such it can + * only be calculated and generated through the Haplotype Caller (it will not work when run through the Variant Annotator). + * This calculation is not perfect but it is a pretty good proxy for depth and it does match the values in the AD field + * (i.e., sum(AD) = DP). + */ +public class DepthPerSampleHC extends GenotypeAnnotation { + public void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { + if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) + return; + + if (alleleLikelihoodMap == null ) + throw new IllegalStateException("DepthPerSampleHC can only be used with likelihood based annotations in the HaplotypeCaller"); + + // the depth for the HC is the sum of the informative alleles at this site. It's not perfect (as we cannot + // differentiate between reads that align over the event but aren't informative vs. those that aren't even + // close) but it's a pretty good proxy and it matches with the AD field (i.e., sum(AD) = DP). + int dp = 0; + + if ( alleleLikelihoodMap.isEmpty() ) { + // there are no reads + } else { + final Set alleles = new HashSet<>(vc.getAlleles()); + + // make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext + if ( ! 
alleleLikelihoodMap.getAllelesSet().containsAll(alleles) ) + throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + alleleLikelihoodMap.getAllelesSet()); + + for (Map.Entry> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); + if ( a.isInformative() ) { + dp++; + } + } + + gb.DP(dp); + } + } + + public List getKeyNames() { + return Collections.singletonList(VCFConstants.DEPTH_KEY); + } + + public List getDescriptions() { + return Collections.singletonList(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY)); + } +} \ No newline at end of file diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java new file mode 100644 index 000000000..a90f555a1 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -0,0 +1,509 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import cern.jet.math.Arithmetic; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypesContext; +import org.broadinstitute.variant.vcf.VCFHeaderLineType; +import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.*; + + +/** + * Phred-scaled p-value using Fisher's Exact Test to detect strand bias + * + *

Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation + * being seen on only the forward or only the reverse strand) in the reads. More bias is + * indicative of false positive calls. + *

+ * + *

Caveat

+ *

The Fisher Strand test may not be calculated for certain complex indel cases or for multi-allelic sites.

+ */ +public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { + private final static boolean ENABLE_DEBUGGING = false; + private final static Logger logger = Logger.getLogger(FisherStrand.class); + + private static final String FS = "FS"; + private static final double MIN_PVALUE = 1E-320; + private static final int MIN_QUAL_FOR_FILTERED_TEST = 17; + private static final int MIN_COUNT = 2; + + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { + if ( !vc.isVariant() ) + return null; + + if ( vc.hasGenotypes() ) { + final int[][] tableFromPerSampleAnnotations = getTableFromSamples( vc.getGenotypes() ); + if ( tableFromPerSampleAnnotations != null ) { + return pValueForBestTable(tableFromPerSampleAnnotations, null); + } + } + + if (vc.isSNP() && stratifiedContexts != null) { + final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1); + final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST); + printTable("unfiltered", tableNoFiltering); + printTable("filtered", tableFiltering); + return pValueForBestTable(tableFiltering, tableNoFiltering); + } + else if (stratifiedPerReadAlleleLikelihoodMap != null) { + // either SNP with no alignment context, or indels: per-read likelihood map needed + final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc); +// logger.info("VC " + vc); +// printTable(table, 0.0); + return pValueForBestTable(table, null); + } + else + // for non-snp variants, we need per-read likelihoods. 
+ // for snps, we can get same result from simple pileup + return null; + } + + /** + * Create the FisherStrand table by retrieving the per-sample strand bias annotation and adding them together + * @param genotypes the genotypes from which to pull out the per-sample strand bias annotation + * @return the table used for the FisherStrand p-value calculation, will be null if none of the genotypes contain the per-sample SB annotation + */ + private int[][] getTableFromSamples( final GenotypesContext genotypes ) { + if( genotypes == null ) { throw new IllegalArgumentException("Genotypes cannot be null."); } + + final int[] sbArray = {0,0,0,0}; // reference-forward-reverse -by- alternate-forward-reverse + boolean foundData = false; + + for( final Genotype g : genotypes ) { + if( g.isNoCall() || ! g.hasAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME) ) + continue; + + foundData = true; + final String sbbsString = (String) g.getAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME); + final int[] data = encodeSBBS(sbbsString); + if ( passesMinimumThreshold(data) ) { + for( int index = 0; index < sbArray.length; index++ ) { + sbArray[index] += data[index]; + } + } + } + + return ( foundData ? decodeSBBS(sbArray) : null ); + } + + /** + * Does this strand data array pass the minimum threshold for inclusion? 
+ * + * @param data the array + * @return true if it passes the minimum threshold, false otherwise + */ + private static boolean passesMinimumThreshold(final int[] data) { + // the ref and alt totals must each be greater than MIN_COUNT + return data[0] + data[1] > MIN_COUNT && data[2] + data[3] > MIN_COUNT; + } + + /** + * Create an annotation for the highest (i.e., least significant) p-value of table1 and table2 + * + * @param table1 a contingency table, may be null + * @param table2 a contingency table, may be null + * @return annotation result for FS given tables + */ + private Map pValueForBestTable(final int[][] table1, final int[][] table2) { + if ( table2 == null ) + return table1 == null ? null : annotationForOneTable(pValueForContingencyTable(table1)); + else if (table1 == null) + return annotationForOneTable(pValueForContingencyTable(table2)); + else { // take the one with the best (i.e., least significant pvalue) + double pvalue1 = pValueForContingencyTable(table1); + double pvalue2 = pValueForContingencyTable(table2); + return annotationForOneTable(Math.max(pvalue1, pvalue2)); + } + } + + /** + * Returns an annotation result given a pValue + * + * @param pValue + * @return a hash map from FS -> phred-scaled pValue + */ + private Map annotationForOneTable(final double pValue) { + final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE))); // prevent INFINITYs + return Collections.singletonMap(FS, value); + } + + public List getKeyNames() { + return Collections.singletonList(FS); + } + + public List getDescriptions() { + return Collections.singletonList(new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias")); + } + + /** + * Helper function to turn the FisherStrand table into the SB annotation array + * @param table the table used by the FisherStrand annotation + * @return the array used by the per-sample Strand Bias annotation + */ + 
public static List getContingencyArray( final int[][] table ) { + if(table.length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); } + if(table[0].length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); } + final List list = new ArrayList<>(4); // TODO - if we ever want to do something clever with multi-allelic sites this will need to change + list.add(table[0][0]); + list.add(table[0][1]); + list.add(table[1][0]); + list.add(table[1][1]); + return list; + } + + /** + * Helper function to parse the genotype annotation into the SB annotation array + * @param string the string that is returned by genotype.getAnnotation("SB") + * @return the array used by the per-sample Strand Bias annotation + */ + private static int[] encodeSBBS( final String string ) { + final int[] array = new int[4]; + final StringTokenizer tokenizer = new StringTokenizer(string, ",", false); + for( int index = 0; index < 4; index++ ) { + array[index] = Integer.parseInt(tokenizer.nextToken()); + } + return array; + } + + /** + * Helper function to turn the SB annotation array into the FisherStrand table + * @param array the array used by the per-sample Strand Bias annotation + * @return the table used by the FisherStrand annotation + */ + private static int[][] decodeSBBS( final int[] array ) { + if(array.length != 4) { throw new IllegalArgumentException("Expecting a length = 4 strand bias array."); } + final int[][] table = new int[2][2]; + table[0][0] = array[0]; + table[0][1] = array[1]; + table[1][0] = array[2]; + table[1][1] = array[3]; + return table; + } + + private Double pValueForContingencyTable(int[][] originalTable) { + final int[][] normalizedTable = normalizeContingencyTable(originalTable); + + int[][] table = copyContingencyTable(normalizedTable); + + double pCutoff = computePValue(table); + //printTable(table, pCutoff); + + double pValue = pCutoff; + while (rotateTable(table)) { + double pValuePiece = 
computePValue(table); + + //printTable(table, pValuePiece); + + if (pValuePiece <= pCutoff) { + pValue += pValuePiece; + } + } + + table = copyContingencyTable(normalizedTable); + while (unrotateTable(table)) { + double pValuePiece = computePValue(table); + + //printTable(table, pValuePiece); + + if (pValuePiece <= pCutoff) { + pValue += pValuePiece; + } + } + + //System.out.printf("P-cutoff: %f\n", pCutoff); + //System.out.printf("P-value: %f\n\n", pValue); + + // min is necessary as numerical precision can result in pValue being slightly greater than 1.0 + return Math.min(pValue, 1.0); + } + + // how large do we want the normalized table to be? + private static final double TARGET_TABLE_SIZE = 200.0; + + /** + * Normalize the table so that the entries are not too large. + * Note that this method does NOT necessarily make a copy of the table being passed in! + * + * @param table the original table + * @return a normalized version of the table or the original table if it is already normalized + */ + private static int[][] normalizeContingencyTable(final int[][] table) { + final int sum = table[0][0] + table[0][1] + table[1][0] + table[1][1]; + if ( sum <= TARGET_TABLE_SIZE * 2 ) + return table; + + final double normalizationFactor = (double)sum / TARGET_TABLE_SIZE; + + final int[][] normalized = new int[2][2]; + for ( int i = 0; i < 2; i++ ) { + for ( int j = 0; j < 2; j++ ) + normalized[i][j] = (int)(table[i][j] / normalizationFactor); + } + + return normalized; + } + + private static int [][] copyContingencyTable(int [][] t) { + int[][] c = new int[2][2]; + + for ( int i = 0; i < 2; i++ ) + for ( int j = 0; j < 2; j++ ) + c[i][j] = t[i][j]; + + return c; + } + + + private static void printTable(int[][] table, double pValue) { + logger.info(String.format("%d %d; %d %d : %f", table[0][0], table[0][1], table[1][0], table[1][1], pValue)); + } + + /** + * Printing information to logger.info for debugging purposes + * + * @param name the name of the table + * @param 
table the table itself + */ + private void printTable(final String name, final int[][] table) { + if ( ENABLE_DEBUGGING ) { + final String pValue = (String)annotationForOneTable(pValueForContingencyTable(table)).get(FS); + logger.info(String.format("FS %s (REF+, REF-, ALT+, ALT-) = (%d, %d, %d, %d) = %s", + name, table[0][0], table[0][1], table[1][0], table[1][1], pValue)); + } + } + + private static boolean rotateTable(int[][] table) { + table[0][0] -= 1; + table[1][0] += 1; + + table[0][1] += 1; + table[1][1] -= 1; + + return (table[0][0] >= 0 && table[1][1] >= 0); + } + + private static boolean unrotateTable(int[][] table) { + table[0][0] += 1; + table[1][0] -= 1; + + table[0][1] -= 1; + table[1][1] += 1; + + return (table[0][1] >= 0 && table[1][0] >= 0); + } + + private static double computePValue(int[][] table) { + + int[] rowSums = { sumRow(table, 0), sumRow(table, 1) }; + int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) }; + int N = rowSums[0] + rowSums[1]; + + // calculate in log space so we don't die with high numbers + double pCutoff = Arithmetic.logFactorial(rowSums[0]) + + Arithmetic.logFactorial(rowSums[1]) + + Arithmetic.logFactorial(colSums[0]) + + Arithmetic.logFactorial(colSums[1]) + - Arithmetic.logFactorial(table[0][0]) + - Arithmetic.logFactorial(table[0][1]) + - Arithmetic.logFactorial(table[1][0]) + - Arithmetic.logFactorial(table[1][1]) + - Arithmetic.logFactorial(N); + return Math.exp(pCutoff); + } + + private static int sumRow(int[][] table, int column) { + int sum = 0; + for (int r = 0; r < table.length; r++) { + sum += table[r][column]; + } + + return sum; + } + + private static int sumColumn(int[][] table, int row) { + int sum = 0; + for (int c = 0; c < table[row].length; c++) { + sum += table[row][c]; + } + + return sum; + } + + /** + Allocate and fill a 2x2 strand contingency table. 
In the end, it'll look something like this: + * fw rc + * allele1 # # + * allele2 # # + * @return a 2x2 contingency table + */ + public static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { + if( stratifiedPerReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("stratifiedPerReadAlleleLikelihoodMap cannot be null"); } + if( vc == null ) { throw new IllegalArgumentException("input vc cannot be null"); } + + final Allele ref = vc.getReference(); + final Allele alt = vc.getAltAlleleWithHighestAlleleCount(); + final int[][] table = new int[2][2]; + + for (final PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { + final int[] myTable = new int[4]; + for (final Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { + final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + final GATKSAMRecord read = el.getKey(); + updateTable(myTable, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt); + } + if ( passesMinimumThreshold(myTable) ) + copyToMainTable(myTable, table); + } + + return table; + } + + /** + * Helper method to copy the per-sample table to the main table + * + * @param perSampleTable per-sample table (single dimension) + * @param mainTable main table (two dimensions) + */ + private static void copyToMainTable(final int[] perSampleTable, final int[][] mainTable) { + mainTable[0][0] += perSampleTable[0]; + mainTable[0][1] += perSampleTable[1]; + mainTable[1][0] += perSampleTable[2]; + mainTable[1][1] += perSampleTable[3]; + } + + /** + Allocate and fill a 2x2 strand contingency table. 
In the end, it'll look something like this: + * fw rc + * allele1 # # + * allele2 # # + * @return a 2x2 contingency table + */ + private static int[][] getSNPContingencyTable(final Map stratifiedContexts, + final Allele ref, + final Allele alt, + final int minQScoreToConsider ) { + int[][] table = new int[2][2]; + + for ( Map.Entry sample : stratifiedContexts.entrySet() ) { + final int[] myTable = new int[4]; + for (PileupElement p : sample.getValue().getBasePileup()) { + + if ( ! isUsableBase(p) ) // ignore deletions and bad MQ + continue; + + if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider ) + continue; + + updateTable(myTable, Allele.create(p.getBase(), false), p.getRead(), ref, alt); + } + if ( passesMinimumThreshold(myTable) ) + copyToMainTable(myTable, table); + } + + return table; + } + + /** + * Can the base in this pileup element be used in comparative tests? + * + * @param p the pileup element to consider + * + * @return true if this base is part of a meaningful read for comparison, false otherwise + */ + private static boolean isUsableBase(final PileupElement p) { + return !( p.isDeletion() || + p.getMappingQual() == 0 || + p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || + ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); + } + + private static void updateTable(final int[] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt) { + + final boolean matchesRef = allele.equals(ref, true); + final boolean matchesAlt = allele.equals(alt, true); + + if ( matchesRef || matchesAlt ) { + final int offset = matchesRef ? 
0 : 2; + + if ( read.isStrandless() ) { + // a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1 + // (the 1 is to ensure that a strandless read always counts as an observation on both strands, even + // if the read is only seen once, because it's a merged read or other) + table[offset]++; + table[offset + 1]++; + } else { + // a normal read with an actual strand + final boolean isFW = !read.getReadNegativeStrandFlag(); + table[offset + (isFW ? 0 : 1)]++; + } + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LikelihoodRankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/LikelihoodRankSumTest.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LikelihoodRankSumTest.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/LikelihoodRankSumTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java similarity index 100% rename from 
protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java new file mode 100644 index 000000000..7ebbd49dd --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -0,0 +1,191 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.vcf.VCFHeaderLineType; +import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypesContext; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.*; + +/** + * Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples. Note that the QD is also normalized by event length. + * + * Low scores are indicative of false positive calls and artifacts. Note that QualByDepth requires sequencing + * reads associated with the samples with polymorphic genotypes. 
+ */ +public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { +// private final static Logger logger = Logger.getLogger(QualByDepth.class); + + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { + if ( !vc.hasLog10PError() ) + return null; + + final GenotypesContext genotypes = vc.getGenotypes(); + if ( genotypes == null || genotypes.size() == 0 ) + return null; + + int standardDepth = 0; + int ADrestrictedDepth = 0; + + for ( final Genotype genotype : genotypes ) { + + // we care only about variant calls with likelihoods + if ( !genotype.isHet() && !genotype.isHomVar() ) + continue; + + // if we have the AD values for this sample, let's make sure that the variant depth is greater than 1! + // TODO -- If we like how this is working and want to apply it to a situation other than the single sample HC pipeline, + // TODO -- then we will need to modify the annotateContext() - and related - routines in the VariantAnnotatorEngine + // TODO -- so that genotype-level annotations are run first (to generate AD on the samples) and then the site-level + // TODO -- annotations must come afterwards (so that QD can use the AD). 
+ if ( genotype.hasAD() ) { + final int[] AD = genotype.getAD(); + final int totalADdepth = (int)MathUtils.sum(AD); + if ( totalADdepth - AD[0] > 1 ) + ADrestrictedDepth += totalADdepth; + standardDepth += totalADdepth; + continue; + } + + if (stratifiedContexts!= null && !stratifiedContexts.isEmpty()) { + final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); + if ( context == null ) + continue; + standardDepth += context.getBasePileup().depthOfCoverage(); + + } else if (perReadAlleleLikelihoodMap != null) { + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = perReadAlleleLikelihoodMap.get(genotype.getSampleName()); + if (perReadAlleleLikelihoods == null || perReadAlleleLikelihoods.isEmpty()) + continue; + + standardDepth += perReadAlleleLikelihoods.getNumberOfStoredElements(); + } else if ( genotype.hasDP() ) { + standardDepth += genotype.getDP(); + } + } + + // if the AD-restricted depth is a usable value (i.e. not zero), then we should use that one going forward + if ( ADrestrictedDepth > 0 ) + standardDepth = ADrestrictedDepth; + + if ( standardDepth == 0 ) + return null; + + final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc); + // Hack: when refContext == null then we know we are coming from the HaplotypeCaller and do not want to do a + // full length-based normalization (because the indel length problem is present only in the UnifiedGenotyper) + double QD = -10.0 * vc.getLog10PError() / ((double)standardDepth * indelNormalizationFactor(altAlleleLength, ref != null)); + + // Hack: see note in the fixTooHighQD method below + QD = fixTooHighQD(QD); + + final Map map = new HashMap<>(); + map.put(getKeyNames().get(0), String.format("%.2f", QD)); + return map; + } + + /** + * Generate the indel normalization factor. 
+ * + * @param altAlleleLength the average alternate allele length for the call + * @param increaseNormalizationAsLengthIncreases should we apply a normalization factor based on the allele length? + * @return a possitive double + */ + private double indelNormalizationFactor(final double altAlleleLength, final boolean increaseNormalizationAsLengthIncreases) { + return ( increaseNormalizationAsLengthIncreases ? Math.max(altAlleleLength / 3.0, 1.0) : 1.0); + } + + /** + * The haplotype caller generates very high quality scores when multiple events are on the + * same haplotype. This causes some very good variants to have unusually high QD values, + * and VQSR will filter these out. This code looks at the QD value, and if it is above + * threshold we map it down to the mean high QD value, with some jittering + * + * // TODO -- remove me when HaplotypeCaller bubble caller is live + * + * @param QD the raw QD score + * @return a QD value + */ + private double fixTooHighQD(final double QD) { + if ( QD < MAX_QD_BEFORE_FIXING ) { + return QD; + } else { + return IDEAL_HIGH_QD + GenomeAnalysisEngine.getRandomGenerator().nextGaussian() * JITTER_SIGMA; + } + } + + private final static double MAX_QD_BEFORE_FIXING = 35; + private final static double IDEAL_HIGH_QD = 30; + private final static double JITTER_SIGMA = 3; + + public List getKeyNames() { return Arrays.asList("QD"); } + + public List getDescriptions() { + return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); + } + + +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java new file mode 100644 index 000000000..44e44c63b --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -0,0 +1,119 @@ 
+/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; +import org.broadinstitute.variant.vcf.VCFStandardHeaderLines; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.*; + + +/** + * Root Mean Square of the mapping quality of the reads across all samples. 
+ */ +public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { + + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { + + final List qualities = new ArrayList<>(); + if ( stratifiedContexts != null ) { + if ( stratifiedContexts.size() == 0 ) + return null; + + for ( final Map.Entry sample : stratifiedContexts.entrySet() ) { + final AlignmentContext context = sample.getValue(); + for ( final PileupElement p : context.getBasePileup() ) + fillMappingQualitiesFromPileup(p.getRead().getMappingQuality(), qualities); + } + } + else if (perReadAlleleLikelihoodMap != null) { + if ( perReadAlleleLikelihoodMap.size() == 0 ) + return null; + + for ( final PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) { + for ( final GATKSAMRecord read : perReadLikelihoods.getStoredElements() ) + fillMappingQualitiesFromPileup(read.getMappingQuality(), qualities); + } + } + else + return null; + + final double rms = MathUtils.rms(qualities); + return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.2f", rms)); + } + + private static void fillMappingQualitiesFromPileup(final int mq, final List qualities) { + if ( mq != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) { + qualities.add(mq); + } + } + + public List getKeyNames() { return Arrays.asList(VCFConstants.RMS_MAPPING_QUALITY_KEY); } + + public List getDescriptions() { + return Arrays.asList(VCFStandardHeaderLines.getInfoLine(getKeyNames().get(0))); + } +} \ No newline at end of file diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java new file mode 100644 index 000000000..13211c44c --- 
/dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -0,0 +1,264 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.MannWhitneyU; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.pileup.PileupElement; 
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypesContext; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.*; + + +/** + * Abstract root for all RankSum based annotations + */ +public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { + static final boolean DEBUG = false; + private boolean useDithering = true; + + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { + // either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null + + final GenotypesContext genotypes = vc.getGenotypes(); + if (genotypes == null || genotypes.size() == 0) + return null; + + final ArrayList refQuals = new ArrayList<>(); + final ArrayList altQuals = new ArrayList<>(); + + for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + + boolean usePileup = true; + + if ( stratifiedPerReadAlleleLikelihoodMap != null ) { + final PerReadAlleleLikelihoodMap likelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); + if ( likelihoodMap != null && !likelihoodMap.isEmpty() ) { + fillQualsFromLikelihoodMap(vc.getAlleles(), vc.getStart(), likelihoodMap, refQuals, altQuals); + usePileup = false; + } + } + + // the old UG SNP-only path through the annotations + if ( usePileup && stratifiedContexts != null ) { + final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); + if ( context != null ) { + final ReadBackedPileup pileup = context.getBasePileup(); + if ( pileup != null ) + fillQualsFromPileup(vc.getAlleles(), pileup, refQuals, altQuals); + } + } + } + + if ( 
refQuals.isEmpty() && altQuals.isEmpty() ) + return null; + + final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering); + for (final Double qual : altQuals) { + mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); + } + for (final Double qual : refQuals) { + mannWhitneyU.add(qual, MannWhitneyU.USet.SET2); + } + + if (DEBUG) { + System.out.format("%s, REF QUALS:", this.getClass().getName()); + for (final Double qual : refQuals) + System.out.format("%4.1f ", qual); + System.out.println(); + System.out.format("%s, ALT QUALS:", this.getClass().getName()); + for (final Double qual : altQuals) + System.out.format("%4.1f ", qual); + System.out.println(); + + } + // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases) + final Pair testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1); + + final Map map = new HashMap<>(); + if (!Double.isNaN(testResults.first)) + map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); + return map; + } + + private void fillQualsFromPileup(final List alleles, + final ReadBackedPileup pileup, + final List refQuals, + final List altQuals) { + for ( final PileupElement p : pileup ) { + if ( isUsableBase(p) ) { + final Double value = getElementForPileupElement(p); + if ( value == null ) + continue; + + if ( alleles.get(0).equals(Allele.create(p.getBase(), true)) ) + refQuals.add(value); + else if ( alleles.contains(Allele.create(p.getBase())) ) + altQuals.add(value); + } + } + } + + private void fillQualsFromLikelihoodMap(final List alleles, + final int refLoc, + final PerReadAlleleLikelihoodMap likelihoodMap, + final List refQuals, + final List altQuals) { + for ( final Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet() ) { + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if ( ! 
a.isInformative() ) + continue; // read is non-informative + + final GATKSAMRecord read = el.getKey(); + if ( isUsableRead(read, refLoc) ) { + final Double value = getElementForRead(read, refLoc, a); + if ( value == null ) + continue; + + if ( a.getMostLikelyAllele().isReference() ) + refQuals.add(value); + else if ( alleles.contains(a.getMostLikelyAllele()) ) + altQuals.add(value); + } + } + } + + /** + * Get the element for the given read at the given reference position + * + * @param read the read + * @param refLoc the reference position + * @param mostLikelyAllele the most likely allele for this read + * @return a Double representing the element to be used in the rank sum test, or null if it should not be used + */ + protected Double getElementForRead(final GATKSAMRecord read, final int refLoc, final MostLikelyAllele mostLikelyAllele) { + return getElementForRead(read, refLoc); + } + + /** + * Get the element for the given read at the given reference position + * + * @param read the read + * @param refLoc the reference position + * @return a Double representing the element to be used in the rank sum test, or null if it should not be used + */ + protected abstract Double getElementForRead(final GATKSAMRecord read, final int refLoc); + + // TODO -- until the ReadPosRankSumTest stops treating these differently, we need to have separate methods for GATKSAMRecords and PileupElements. Yuck. + + /** + * Get the element for the given read at the given reference position + * + * By default this function returns null, indicating that the test doesn't support the old style of pileup calculations + * + * @param p the pileup element + * @return a Double representing the element to be used in the rank sum test, or null if it should not be used + */ + protected Double getElementForPileupElement(final PileupElement p) { + // does not work in pileup mode + return null; + } + + /** + * Can the base in this pileup element be used in comparative tests between ref / alt bases? 
+ * + * Note that this function by default does not allow deletion pileup elements + * + * @param p the pileup element to consider + * @return true if this base is part of a meaningful read for comparison, false otherwise + */ + protected boolean isUsableBase(final PileupElement p) { + return !(p.isDeletion() || + p.getMappingQual() == 0 || + p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || + ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here + } + + /** + * Can the read be used in comparative tests between ref / alt bases? + * + * @param read the read to consider + * @param refLoc the reference location + * @return true if this read is meaningful for comparison, false otherwise + */ + protected boolean isUsableRead(final GATKSAMRecord read, final int refLoc) { + return !( read.getMappingQuality() == 0 || + read.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ); + } + + /** + * Initialize the rank sum test annotation using walker and engine information. Right now this checks to see if + * engine randomization is turned off, and if so does not dither. + * @param walker the walker + * @param toolkit the GATK engine + * @param headerLines the header lines + */ + public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set headerLines ) { + useDithering = ! 
toolkit.getArguments().disableDithering; + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java new file mode 100644 index 000000000..417f3b595 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -0,0 +1,105 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.variant.vcf.VCFHeaderLineType; +import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + + +/** + * Fraction of reads containing spanning deletions at this site + * + *

Note that this annotation is currently not compatible with HaplotypeCaller.

+ */ +public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation { + + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { + if ( stratifiedContexts.size() == 0 ) + return null; + + // not meaningful when we're at an indel location: deletions that start at location N are by definition called at the position N-1, and at position N-1 + // there are no informative deletions in the pileup + if (!vc.isSNP()) + return null; + + int deletions = 0; + int depth = 0; + for ( Map.Entry sample : stratifiedContexts.entrySet() ) { + for ( final PileupElement p : sample.getValue().getBasePileup() ) { + depth++; + if ( p.isDeletion() ) + deletions++; + } + } + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.2f", depth == 0 ? 0.0 : (double)deletions/(double)depth)); + return map; + } + + public List getKeyNames() { return Arrays.asList("Dels"); } + + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("Dels", 1, VCFHeaderLineType.Float, "Fraction of Reads Containing Spanning Deletions")); } +} \ No newline at end of file diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java new file mode 100644 index 000000000..ec1c1e729 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java @@ -0,0 +1,99 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypeBuilder; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; +import org.broadinstitute.variant.vcf.VCFHeaderLineType; + +import java.util.*; + +/** + * Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias + * User: rpoplin + * Date: 8/28/13 + */ + +public class StrandBiasBySample extends GenotypeAnnotation { + + public final static String STRAND_BIAS_BY_SAMPLE_KEY_NAME = "SB"; + + @Override + public void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext 
stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { + if ( ! isAppropriateInput(alleleLikelihoodMap, g) ) + return; + + final int[][] table = FisherStrand.getContingencyTable(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc); + + gb.attribute(STRAND_BIAS_BY_SAMPLE_KEY_NAME, FisherStrand.getContingencyArray(table)); + } + + @Override + public List getKeyNames() { return Collections.singletonList(STRAND_BIAS_BY_SAMPLE_KEY_NAME); } + + @Override + public List getDescriptions() { return Collections.singletonList(new VCFFormatHeaderLine(getKeyNames().get(0), 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")); } + + private boolean isAppropriateInput(final PerReadAlleleLikelihoodMap map, final Genotype g) { + return ! (map == null || g == null || !g.isCalled()); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariates.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariates.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariates.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariates.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java new file mode 100644 index 000000000..3da04ef86 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -0,0 +1,536 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.CigarElement; +import net.sf.samtools.SAMFileHeader; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.Advanced; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.*; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import 
org.broadinstitute.sting.utils.recalibration.*; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.io.IOException; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context). + * + *

+ * This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating + * only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative + * of poor base quality. This walker generates tables based on various user-specified covariates (such as read group, + * reported quality score, cycle, and context). Since there is a large amount of data one can then calculate an empirical + * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations. + * The output file is a table (of the several covariate values, num observations, num mismatches, empirical quality score). + *

+ * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified. + * + *

+ * + *

Input

+ *

+ * The input read data whose base quality scores need to be assessed. + *

+ * A database of known polymorphic sites to skip over. + *

+ * + *

Output

+ *

+ * A GATK Report file with many tables: + *

    + *
  1. The list of arguments
  2. + *
  3. The quantized qualities table
  4. + *
  5. The recalibration table by read group
  6. + *
  7. The recalibration table by quality score
  8. + *
  9. The recalibration table for all the optional covariates
  10. + *
+ * + * The GATK Report is intended to be easy to read by humans or computers. Check out the documentation of the GATKReport to learn how to manipulate this table. + *

+ * + *

Examples

+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -T BaseRecalibrator \
+ *   -I my_reads.bam \
+ *   -R resources/Homo_sapiens_assembly18.fasta \
+ *   -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
+ *   -knownSites another/optional/setOfSitesToMask.vcf \
+ *   -o recal_data.table
+ * 
+ */ + +@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class}) +@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) +@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) +@PartitionBy(PartitionType.READ) +public class BaseRecalibrator extends ReadWalker implements NanoSchedulable { + /** + * all the command line arguments for BQSR and it's covariates + */ + @ArgumentCollection + private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + + /** + * When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency + * purposes. If you have many covariates, and therefore are using a lot of memory, you can use this flag + * to safely access only one table. There may be some CPU cost, but as long as the table is really big + * there should be relatively little CPU costs. + */ + @Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false) + public boolean lowMemoryMode = false; + + @Advanced + @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 
30 is perhaps better for whole genome call sets", required = false) + public double BAQGOP = BAQ.DEFAULT_GOP; + + /** + * an object that keeps track of the information necessary for quality score quantization + */ + private QuantizationInfo quantizationInfo; + + /** + * list to hold the all the covariate objects that were requested (required + standard + experimental) + */ + private Covariate[] requestedCovariates; + + private RecalibrationEngine recalibrationEngine; + + private int minimumQToUse; + + private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation."; + + private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector + private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation + private final static byte NO_BAQ_UNCERTAINTY = (byte)'@'; + + /** + * Parse the -cov arguments and create a list of covariates to be used here + * Based on the covariates' estimates for initial capacity allocate the data hashmap + */ + public void initialize() { + baq = new BAQ(BAQGOP); // setup the BAQ object with the provided gap open penalty + + if (RAC.FORCE_PLATFORM != null) + RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; + + if (RAC.knownSites.isEmpty() && !RAC.RUN_WITHOUT_DBSNP) // Warn the user if no dbSNP file or other variant mask was specified + throw new UserException.CommandLineException(NO_DBSNP_EXCEPTION); + + if (RAC.LIST_ONLY) { + RecalUtils.listAvailableCovariates(logger); + System.exit(0); + } + RAC.existingRecalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table + + Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates + ArrayList requiredCovariates = covariates.getFirst(); + ArrayList 
optionalCovariates = covariates.getSecond(); + + requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; + int covariateIndex = 0; + for (final Covariate covariate : requiredCovariates) + requestedCovariates[covariateIndex++] = covariate; + for (final Covariate covariate : optionalCovariates) + requestedCovariates[covariateIndex++] = covariate; + + logger.info("The covariates being used here: "); + for (Covariate cov : requestedCovariates) { // list all the covariates being used + logger.info("\t" + cov.getClass().getSimpleName()); + cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection + } + + try { + RAC.RECAL_TABLE = new PrintStream(RAC.RECAL_TABLE_FILE); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_TABLE_FILE, e); + } + + initializeRecalibrationEngine(); + minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN; + referenceReader = getToolkit().getReferenceDataSource().getReference(); + } + + /** + * Initialize the recalibration engine + */ + private void initializeRecalibrationEngine() { + int numReadGroups = 0; + for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) + numReadGroups += header.getReadGroups().size(); + + recalibrationEngine = new RecalibrationEngine(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG, lowMemoryMode); + } + + private boolean isLowQualityBase( final GATKSAMRecord read, final int offset ) { + return read.getBaseQualities()[offset] < minimumQToUse; + } + + /** + * For each read at this locus get the various covariate values and increment that location in the map based on + * whether or not the base matches the reference at this particular location + */ + public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) { + + final GATKSAMRecord read = ReadClipper.hardClipSoftClippedBases( 
ReadClipper.hardClipAdaptorSequence(originalRead) ); + if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it + + RecalUtils.parsePlatformForRead(read, RAC); + if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { // parse the solid color space and check for color no-calls + return 0L; // skip this read completely + } + + final int[] isSNP = calculateIsSNP(read, ref, originalRead); + final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION); + final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION); + final int nErrors = nEvents(isSNP, isInsertion, isDeletion); + + // note for efficiency regions we don't compute the BAQ array unless we actually have + // some error to marginalize over. For ILMN data ~85% of reads have no error + final byte[] baqArray = nErrors == 0 ? flatBAQArray(read) : calculateBAQArray(read); + + if( baqArray != null ) { // some reads just can't be BAQ'ed + final ReadCovariates covariates = RecalUtils.computeCovariates(read, requestedCovariates); + final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases + final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray); + final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray); + final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray); + + // aggregate all of the info into our info object, and update the data + final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skip, snpErrors, insertionErrors, deletionErrors); + recalibrationEngine.updateDataForRead(info); + return 1L; + } else { + return 0L; + } + } + + /** + * Compute the number of mutational events across all hasEvent vectors + * + * Simply the sum of entries in hasEvents + * + * @param hasEvents a vector a vectors of 0 (no event) and 1 (has event) + * @return the total 
number of events across all hasEvent arrays + */ + protected static int nEvents(final int[]... hasEvents) { + int n = 0; + for ( final int[] hasEvent : hasEvents ) { + n += MathUtils.sum(hasEvent); + } + return n; + } + + protected boolean[] calculateSkipArray( final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) { + final byte[] bases = read.getReadBases(); + final boolean[] skip = new boolean[bases.length]; + final boolean[] knownSites = calculateKnownSites(read, metaDataTracker.getValues(RAC.knownSites)); + for( int iii = 0; iii < bases.length; iii++ ) { + skip[iii] = !BaseUtils.isRegularBase(bases[iii]) || isLowQualityBase(read, iii) || knownSites[iii] || badSolidOffset(read, iii); + } + return skip; + } + + protected boolean badSolidOffset( final GATKSAMRecord read, final int offset ) { + return ReadUtils.isSOLiDRead(read) && RAC.SOLID_RECAL_MODE != RecalUtils.SOLID_RECAL_MODE.DO_NOTHING && !RecalUtils.isColorSpaceConsistent(read, offset); + } + + protected static boolean[] calculateKnownSites( final GATKSAMRecord read, final List features ) { + final int readLength = read.getReadBases().length; + final boolean[] knownSites = new boolean[readLength]; + Arrays.fill(knownSites, false); + for( final Feature f : features ) { + int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here? 
+ if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + featureStartOnRead = 0; + } + + int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true); + if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + featureEndOnRead = readLength; + } + + if( featureStartOnRead > readLength ) { + featureStartOnRead = featureEndOnRead = readLength; + } + + Arrays.fill(knownSites, Math.max(0, featureStartOnRead), Math.min(readLength, featureEndOnRead + 1), true); + } + return knownSites; + } + + // BUGBUG: can be merged with calculateIsIndel + protected static int[] calculateIsSNP( final GATKSAMRecord read, final ReferenceContext ref, final GATKSAMRecord originalRead ) { + final byte[] readBases = read.getReadBases(); + final byte[] refBases = Arrays.copyOfRange(ref.getBases(), read.getAlignmentStart() - originalRead.getAlignmentStart(), ref.getBases().length + read.getAlignmentEnd() - originalRead.getAlignmentEnd()); + final int[] snp = new int[readBases.length]; + int readPos = 0; + int refPos = 0; + for ( final CigarElement ce : read.getCigar().getCigarElements() ) { + final int elementLength = ce.getLength(); + switch (ce.getOperator()) { + case M: + case EQ: + case X: + for( int iii = 0; iii < elementLength; iii++ ) { + snp[readPos] = ( BaseUtils.basesAreEqual(readBases[readPos], refBases[refPos]) ? 0 : 1 ); + readPos++; + refPos++; + } + break; + case D: + case N: + refPos += elementLength; + break; + case I: + case S: // ReferenceContext doesn't have the soft clipped bases! 
+ readPos += elementLength; + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); + } + } + return snp; + } + + protected static int[] calculateIsIndel( final GATKSAMRecord read, final EventType mode ) { + final int[] indel = new int[read.getReadBases().length]; + int readPos = 0; + for ( final CigarElement ce : read.getCigar().getCigarElements() ) { + final int elementLength = ce.getLength(); + switch (ce.getOperator()) { + case M: + case EQ: + case X: + case S: + { + readPos += elementLength; + break; + } + case D: + { + final int index = ( read.getReadNegativeStrandFlag() ? readPos : readPos - 1 ); + updateIndel(indel, index, mode, EventType.BASE_DELETION); + break; + } + case I: + { + final boolean forwardStrandRead = !read.getReadNegativeStrandFlag(); + if( forwardStrandRead ) { + updateIndel(indel, readPos - 1, mode, EventType.BASE_INSERTION); + } + readPos += elementLength; + if( !forwardStrandRead ) { + updateIndel(indel, readPos, mode, EventType.BASE_INSERTION); + } + break; + } + case N: + case H: + case P: + break; + default: + throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); + } + } + return indel; + } + + private static void updateIndel(final int[] indel, final int index, final EventType mode, final EventType requiredMode) { + if ( mode == requiredMode && index >= 0 && index < indel.length ) + // protect ourselves from events at the start or end of the read (1D3M or 3M1D) + indel[index] = 1; + } + + protected static double[] calculateFractionalErrorArray( final int[] errorArray, final byte[] baqArray ) { + if(errorArray.length != baqArray.length ) { + throw new ReviewedStingException("Array length mismatch detected. 
Malformed read?"); + } + + final int BLOCK_START_UNSET = -1; + + final double[] fractionalErrors = new double[baqArray.length]; + Arrays.fill(fractionalErrors, 0.0); + boolean inBlock = false; + int blockStartIndex = BLOCK_START_UNSET; + int iii; + for( iii = 0; iii < fractionalErrors.length; iii++ ) { + if( baqArray[iii] == NO_BAQ_UNCERTAINTY ) { + if( !inBlock ) { + fractionalErrors[iii] = (double) errorArray[iii]; + } else { + calculateAndStoreErrorsInBlock(iii, blockStartIndex, errorArray, fractionalErrors); + inBlock = false; // reset state variables + blockStartIndex = BLOCK_START_UNSET; // reset state variables + } + } else { + inBlock = true; + if( blockStartIndex == BLOCK_START_UNSET ) { blockStartIndex = iii; } + } + } + if( inBlock ) { + calculateAndStoreErrorsInBlock(iii-1, blockStartIndex, errorArray, fractionalErrors); + } + if( fractionalErrors.length != errorArray.length ) { + throw new ReviewedStingException("Output array length mismatch detected. Malformed read?"); + } + return fractionalErrors; + } + + private static void calculateAndStoreErrorsInBlock( final int iii, + final int blockStartIndex, + final int[] errorArray, + final double[] fractionalErrors ) { + int totalErrors = 0; + for( int jjj = Math.max(0,blockStartIndex-1); jjj <= iii; jjj++ ) { + totalErrors += errorArray[jjj]; + } + for( int jjj = Math.max(0, blockStartIndex-1); jjj <= iii; jjj++ ) { + fractionalErrors[jjj] = ((double) totalErrors) / ((double)(iii - Math.max(0,blockStartIndex-1) + 1)); + } + } + + /** + * Create a BAQ style array that indicates no alignment uncertainty + * @param read the read for which we want a BAQ array + * @return a BAQ-style non-null byte[] counting NO_BAQ_UNCERTAINTY values + * // TODO -- could be optimized avoiding this function entirely by using this inline if the calculation code above + */ + protected static byte[] flatBAQArray(final GATKSAMRecord read) { + final byte[] baq = new byte[read.getReadLength()]; + Arrays.fill(baq, NO_BAQ_UNCERTAINTY); 
+ return baq; + } + + /** + * Compute an actual BAQ array for read, based on its quals and the reference sequence + * @param read the read to BAQ + * @return a non-null BAQ tag array for read + */ + private byte[] calculateBAQArray( final GATKSAMRecord read ) { + baq.baqRead(read, referenceReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.ADD_TAG); + return BAQ.getBAQTag(read); + } + + /** + * Initialize the reduce step by returning 0L + * + * @return returns 0L + */ + public Long reduceInit() { + return 0L; + } + + /** + * The Reduce method doesn't do anything for this walker. + * + * @param mapped Result of the map. This value is immediately ignored. + * @param sum The summing CountedData used to output the CSV data + * @return returns The sum used to output the CSV data + */ + public Long reduce(Long mapped, Long sum) { + sum += mapped; + return sum; + } + + @Override + public void onTraversalDone(Long result) { + recalibrationEngine.finalizeData(); + + logger.info("Calculating quantized quality scores..."); + quantizeQualityScores(); + + logger.info("Writing recalibration report..."); + generateReport(); + logger.info("...done!"); + + logger.info("BaseRecalibrator was able to recalibrate " + result + " reads"); + } + + private RecalibrationTables getRecalibrationTable() { + return recalibrationEngine.getFinalRecalibrationTables(); + } + + /** + * go through the quality score table and use the # observations and the empirical quality score + * to build a quality score histogram for quantization. 
Then use the QuantizeQual algorithm to + * generate a quantization map (recalibrated_qual -> quantized_qual) + */ + private void quantizeQualityScores() { + quantizationInfo = new QuantizationInfo(getRecalibrationTable(), RAC.QUANTIZING_LEVELS); + } + + private void generateReport() { + RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, getRecalibrationTable(), requestedCovariates, RAC.SORT_BY_ALL_COLUMNS); + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricBadMates.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricBadMates.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricBadMates.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricBadMates.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java new file mode 100644 index 000000000..7457acb22 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java @@ -0,0 +1,292 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.variant.variantcontext.*; + +import java.util.*; + +/** + * Code for determining which indels are segregating among the samples. + * + * This code is just a refactor of the original code from Guillermo in the UG. 
+ * + * @author Mark DePristo + * @since 3/26/12 + */ +public class ConsensusAlleleCounter { + final protected static Logger logger = Logger.getLogger(ConsensusAlleleCounter.class); + private final int minIndelCountForGenotyping; + private final boolean doMultiAllelicCalls; + private final double minFractionInOneSample; + + public ConsensusAlleleCounter(final boolean doMultiAllelicCalls, + final int minIndelCountForGenotyping, + final double minFractionInOneSample) { + this.minIndelCountForGenotyping = minIndelCountForGenotyping; + this.doMultiAllelicCalls = doMultiAllelicCalls; + this.minFractionInOneSample = minFractionInOneSample; + } + + /** + * Returns a list of Alleles at this locus that may be segregating + * + * @param ref + * @param contexts + * @param contextType + * @return + */ + public List computeConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType) { + final Map consensusIndelStrings = countConsensusAlleles(ref, contexts, contextType); + return consensusCountsToAlleles(ref, consensusIndelStrings); + } + + // + // TODO -- WARNING DOESN'T WORK WITH REDUCED READS + // + private Map countConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType) { + final GenomeLoc loc = ref.getLocus(); + HashMap consensusIndelStrings = new HashMap(); + + int insCount = 0, delCount = 0; + // quick check of total number of indels in pileup + for ( Map.Entry sample : contexts.entrySet() ) { + final AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + + final ReadBackedPileup indelPileup = context.getBasePileup(); + insCount += indelPileup.getNumberOfInsertionsAfterThisElement(); + delCount += indelPileup.getNumberOfDeletionsAfterThisElement(); + } + + if ( insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping ) + return Collections.emptyMap(); + + for (Map.Entry sample : contexts.entrySet()) { + // todo -- 
warning, can be duplicating expensive partition here + AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + + final ReadBackedPileup indelPileup = context.getBasePileup(); + + final int nIndelReads = indelPileup.getNumberOfInsertionsAfterThisElement() + indelPileup.getNumberOfDeletionsAfterThisElement(); + final int nReadsOverall = indelPileup.getNumberOfElements(); + + if ( nIndelReads == 0 || (nIndelReads / (1.0 * nReadsOverall)) < minFractionInOneSample ) { + continue; + } + + for (PileupElement p : indelPileup) { + final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); + if (read == null) + continue; + if (ReadUtils.is454Read(read)) { + continue; + } + + if ( p.isBeforeInsertion() ) { + final String insertionBases = p.getBasesOfImmediatelyFollowingInsertion(); + // edge case: ignore a deletion immediately preceding an insertion as p.getBasesOfImmediatelyFollowingInsertion() returns null [EB] + if ( insertionBases == null ) + continue; + + boolean foundKey = false; + // copy of hashmap into temp arrayList + ArrayList> cList = new ArrayList>(); + for (Map.Entry s : consensusIndelStrings.entrySet()) { + cList.add(new Pair(s.getKey(), s.getValue())); + } + + if (read.getAlignmentEnd() == loc.getStart()) { + // first corner condition: a read has an insertion at the end, and we're right at the insertion. + // In this case, the read could have any of the inserted bases and we need to build a consensus + + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); + // case 1: current insertion is prefix of indel in hash map + if (s.startsWith(insertionBases)) { + cList.set(k,new Pair(s,cnt+1)); + foundKey = true; + } + else if (insertionBases.startsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion + // In this case, new bases are new key. 
+ foundKey = true; + cList.set(k,new Pair(insertionBases,cnt+1)); + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + cList.add(new Pair(insertionBases,1)); + + } + else if (read.getAlignmentStart() == loc.getStart()+1) { + // opposite corner condition: read will start at current locus with an insertion + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); + if (s.endsWith(insertionBases)) { + // case 1: current insertion (indelString) is suffix of indel in hash map (s) + cList.set(k,new Pair(s,cnt+1)); + foundKey = true; + } + else if (insertionBases.endsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion + // In this case, new bases are new key. + foundKey = true; + cList.set(k,new Pair(insertionBases,cnt+1)); + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + cList.add(new Pair(insertionBases,1)); + + + } + else { + // normal case: insertion somewhere in the middle of a read: add count to arrayList + int cnt = consensusIndelStrings.containsKey(insertionBases)? consensusIndelStrings.get(insertionBases):0; + cList.add(new Pair(insertionBases,cnt+1)); + } + + // copy back arrayList into hashMap + consensusIndelStrings.clear(); + for (Pair pair : cList) { + consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); + } + + } + else if ( p.isBeforeDeletionStart() ) { + final String deletionString = String.format("D%d",p.getLengthOfImmediatelyFollowingIndel()); + int cnt = consensusIndelStrings.containsKey(deletionString)? 
consensusIndelStrings.get(deletionString):0; + consensusIndelStrings.put(deletionString,cnt+1); + } + } + } + + return consensusIndelStrings; + } + + private List consensusCountsToAlleles(final ReferenceContext ref, + final Map consensusIndelStrings) { + final GenomeLoc loc = ref.getLocus(); + final Collection vcs = new ArrayList(); + int maxAlleleCnt = 0; + Allele refAllele, altAllele; + + for (final Map.Entry elt : consensusIndelStrings.entrySet()) { + final String s = elt.getKey(); + final int curCnt = elt.getValue(); + int stop = 0; + + // if observed count if above minimum threshold, we will genotype this allele + if (curCnt < minIndelCountForGenotyping) + continue; + + if (s.startsWith("D")) { + // get deletion length + final int dLen = Integer.valueOf(s.substring(1)); + // get ref bases of accurate deletion + final int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart(); + stop = loc.getStart() + dLen; + final byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference - 1, startIdxInReference + dLen); // add reference padding + + if (Allele.acceptableAlleleBases(refBases, false)) { + refAllele = Allele.create(refBases, true); + altAllele = Allele.create(ref.getBase(), false); + } + else continue; // don't go on with this allele if refBases are non-standard + } else { + // insertion case + final String insertionBases = (char)ref.getBase() + s; // add reference padding + if (Allele.acceptableAlleleBases(insertionBases, false)) { // don't allow N's in insertions + refAllele = Allele.create(ref.getBase(), true); + altAllele = Allele.create(insertionBases, false); + stop = loc.getStart(); + } + else continue; // go on to next allele if consensus insertion has any non-standard base. 
+ } + + + final VariantContextBuilder builder = new VariantContextBuilder().source(""); + builder.loc(loc.getContig(), loc.getStart(), stop); + builder.alleles(Arrays.asList(refAllele, altAllele)); + builder.noGenotypes(); + if (doMultiAllelicCalls) { + vcs.add(builder.make()); + if (vcs.size() >= GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) + break; + } else if (curCnt > maxAlleleCnt) { + maxAlleleCnt = curCnt; + vcs.clear(); + vcs.add(builder.make()); + } + } + + if (vcs.isEmpty()) + return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion + + final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); + return mergedVC.getAlleles(); + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java new file mode 100644 index 000000000..77c51f88b --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -0,0 +1,500 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import net.sf.samtools.SAMUtils; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fragments.FragmentCollection; +import org.broadinstitute.sting.utils.fragments.FragmentUtils; +import org.broadinstitute.sting.utils.genotyper.DiploidGenotype; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.util.List; + +import static java.lang.Math.log10; +import static java.lang.Math.pow; + +/** + * Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors, + * and posteriors given a pile of bases and quality scores + * + * Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. 
This object + * calculates: + * + * P(G | D) = P(G) * P(D | G) + * + * where + * + * P(D | G) = sum_i log10 P(bi | G) + * + * and + * + * P(bi | G) = 1 - P(error | q1) if bi is in G + * = P(error | q1) / 3 if bi is not in G + * + * for homozygous genotypes and for heterozygous genotypes: + * + * P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G + * = P(error | q1) / 3 if bi is not in G + * + * for each of the 10 unique diploid genotypes AA, AC, AG, .., TT + * + * Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space. + * + * The priors contain the relative probabilities of each genotype, and must be provided at object creation. + * From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above + * model. + */ +public class DiploidSNPGenotypeLikelihoods implements Cloneable { + + public final static double DEFAULT_PCR_ERROR_RATE = FragmentUtils.DEFAULT_PCR_ERROR_RATE; + + protected final static int FIXED_PLOIDY = 2; + protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1; + protected final static double ploidyAdjustment = log10(FIXED_PLOIDY); + protected final static double log10_3 = log10(3.0); + + protected boolean VERBOSE = false; + + // + // The fundamental data arrays associated with a Genotype Likelihoods object + // + protected double[] log10Likelihoods = null; + + // TODO: don't calculate this each time through + protected double log10_PCR_error_3; + protected double log10_1_minus_PCR_error; + + /** + * Create a new GenotypeLikelhoods object with given PCR error rate for each diploid genotype + * + * @param PCR_error_rate the PCR error rate + */ + public DiploidSNPGenotypeLikelihoods(double PCR_error_rate) { + log10_PCR_error_3 = log10(PCR_error_rate) - log10_3; + log10_1_minus_PCR_error = log10(1.0 - PCR_error_rate); + setToZero(); + } + + /** + * Cloning of the object + * @return clone + * @throws CloneNotSupportedException + */ + protected Object clone() 
throws CloneNotSupportedException { + DiploidSNPGenotypeLikelihoods c = (DiploidSNPGenotypeLikelihoods)super.clone(); + c.log10Likelihoods = log10Likelihoods.clone(); + return c; + } + + protected void setToZero() { + log10Likelihoods = genotypeZeros.clone(); // likelihoods are all zeros + } + + /** + * Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values() + * @return likelihoods array + */ + public double[] getLikelihoods() { + return log10Likelihoods; + } + + // ------------------------------------------------------------------------------------- + // + // add() routines. These are the workhorse routines for calculating the overall genotype + // likelihoods given observed bases and reads. Includes high-level operators all the + // way down to single base and qual functions. + // + // ------------------------------------------------------------------------------------- + + /** + * Updates likelihoods and posteriors to reflect the additional observations contained within the + * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the + * pileup + * + * @param pileup read pileup + * @param ignoreBadBases should we ignore bad bases? + * @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality? 
+ * @param minBaseQual the minimum base quality at which to consider a base valid + * @return the number of good bases found in the pileup + */ + public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { + int n = 0; + + // for each fragment, add to the likelihoods + FragmentCollection fpile = pileup.toFragments(); + + for ( PileupElement p : fpile.getSingletonReads() ) + n += add(p, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + + for ( List overlappingPair : fpile.getOverlappingPairs() ) + n += add(overlappingPair, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + + return n; + } + + public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { + byte obsBase = elt.getBase(); + byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + if ( qual == 0 ) + return 0; + + return add(obsBase, qual, (byte)0, (byte)0, 1); + } + + public int add(List overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { + final PileupElement p1 = overlappingPair.get(0); + final PileupElement p2 = overlappingPair.get(1); + + final byte observedBase1 = p1.getBase(); + final byte qualityScore1 = qualToUse(p1, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + final byte observedBase2 = p2.getBase(); + final byte qualityScore2 = qualToUse(p2, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + + if ( qualityScore1 == 0 ) { + if ( qualityScore2 == 0 ) // abort early if we didn't see any good bases + return 0; + else { + return add(observedBase2, qualityScore2, (byte)0, (byte)0); + } + } else { + return add(observedBase1, qualityScore1, observedBase2, qualityScore2); + } + } + + /** + * + * @param obsBase1 first observed base + * @param qual1 base qual of first observed base + * @param obsBase2 second observed base + * @param qual2 base qual of second observed base; can be 0, 
indicating no second base was observed for this fragment + * @param nObs the number of times this quad of values was seen. Generally 1, but reduced reads can have nObs > 1 for synthetic reads + * @return 0 if the base is bad, 1 otherwise + */ + private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2, int nObs) { + // TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine + // TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future. + // TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here. + + // Just look up the cached result if it's available, or compute and store it + DiploidSNPGenotypeLikelihoods gl; + if ( ! inCache(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY) ) { + gl = calculateCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY); + } else { + gl = getCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY); + } + + // for bad bases, there are no likelihoods + if ( gl == null ) + return 0; + + double[] likelihoods = gl.getLikelihoods(); + + for ( DiploidGenotype g : DiploidGenotype.values() ) { + double likelihood = likelihoods[g.ordinal()]; + log10Likelihoods[g.ordinal()] += likelihood * nObs; + } + + return 1; + } + + private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) { + return add(obsBase1, qual1, obsBase2, qual2, 1); + } + + // ------------------------------------------------------------------------------------- + // + // Dealing with the cache routines + // + // ------------------------------------------------------------------------------------- + + static DiploidSNPGenotypeLikelihoods[][][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseUtils.BASES.length][QualityUtils.MAX_SAM_QUAL_SCORE +1][BaseUtils.BASES.length+1][QualityUtils.MAX_SAM_QUAL_SCORE +1][MAX_PLOIDY]; + + protected boolean inCache(byte observedBase1, byte qualityScore1, byte observedBase2, 
byte qualityScore2, int ploidy) { + return getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy) != null; + } + + protected DiploidSNPGenotypeLikelihoods getCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { + DiploidSNPGenotypeLikelihoods gl = getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy); + if ( gl == null ) + throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base1=%c, qual1=%d, base2=%c, qual2=%d, ploidy=%d", + observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy)); + return gl; + } + + protected DiploidSNPGenotypeLikelihoods calculateCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { + DiploidSNPGenotypeLikelihoods gl = calculateGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2); + setCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy, gl); + return gl; + } + + protected void setCache( DiploidSNPGenotypeLikelihoods[][][][][] cache, + byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy, + DiploidSNPGenotypeLikelihoods val ) { + int i = BaseUtils.simpleBaseToBaseIndex(observedBase1); + int j = qualityScore1; + int k = qualityScore2 != 0 ? BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length; + int l = qualityScore2; + int m = ploidy; + + cache[i][j][k][l][m] = val; + } + + protected DiploidSNPGenotypeLikelihoods getCache(DiploidSNPGenotypeLikelihoods[][][][][] cache, + byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { + int i = BaseUtils.simpleBaseToBaseIndex(observedBase1); + int j = qualityScore1; + int k = qualityScore2 != 0 ? 
BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length; + int l = qualityScore2; + int m = ploidy; + return cache[i][j][k][l][m]; + } + + protected DiploidSNPGenotypeLikelihoods calculateGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) { + double[] log10FourBaseLikelihoods = computeLog10Likelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2); + + try { + + DiploidSNPGenotypeLikelihoods gl = (DiploidSNPGenotypeLikelihoods)this.clone(); + gl.setToZero(); + + // we need to adjust for ploidy. We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space + for ( DiploidGenotype g : DiploidGenotype.values() ) { + + // todo assumes ploidy is 2 -- should be generalized. Obviously the below code can be turned into a loop + double p_base = 0.0; + p_base += pow(10, log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base1)] - ploidyAdjustment); + p_base += pow(10, log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base2)] - ploidyAdjustment); + + final double likelihood = log10(p_base); + gl.log10Likelihoods[g.ordinal()] += likelihood; + } + + if ( VERBOSE ) { + for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); } + System.out.println(); + for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", gl.log10Likelihoods[g.ordinal()]); } + System.out.println(); + } + + return gl; + + } catch ( CloneNotSupportedException e ) { + throw new RuntimeException(e); + } + } + + /** + * Updates likelihoods and posteriors to reflect an additional observation of observedBase with + * qualityScore. 
+ * + * @param observedBase1 the base observed on the 1st read of the fragment + * @param qualityScore1 the qual of the base on the 1st read of the fragment, or zero if NA + * @param observedBase2 the base observed on the 2nd read of the fragment + * @param qualityScore2 the qual of the base on the 2nd read of the fragment, or zero if NA + * @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example) + */ + protected double[] computeLog10Likelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) { + double[] log10FourBaseLikelihoods = baseZeros.clone(); + + for ( byte trueBase : BaseUtils.BASES ) { + double likelihood = 0.0; + + for ( byte fragmentBase : BaseUtils.BASES ) { + double log10FragmentLikelihood = (trueBase == fragmentBase ? log10_1_minus_PCR_error : log10_PCR_error_3); + if ( qualityScore1 != 0 ) { + log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase1, fragmentBase, qualityScore1); + } + if ( qualityScore2 != 0 ) { + log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase2, fragmentBase, qualityScore2); + } + + //if ( VERBOSE ) { + // System.out.printf(" L(%c | b=%s, Q=%d) = %f / %f%n", + // observedBase, trueBase, qualityScore, pow(10,likelihood) * 100, likelihood); + //} + + likelihood += pow(10, log10FragmentLikelihood); + } + + log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(trueBase)] = log10(likelihood); + } + + return log10FourBaseLikelihoods; + } + + /** + * + * @param observedBase observed base + * @param chromBase target base + * @param qual base quality + * @return log10 likelihood + */ + protected double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual) { + + double logP; + + if ( observedBase == chromBase ) { + // the base is consistent with the chromosome -- it's 1 - e + //logP = oneMinusData[qual]; + double e = pow(10, (qual / -10.0)); + 
logP = log10(1.0 - e); + } else { + // the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error) + logP = qual / -10.0 + (-log10_3); + } + + //System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP); + return logP; + } + + /** + * Helper function that returns the phred-scaled base quality score we should use for calculating + * likelihoods for a pileup element. May return 0 to indicate that the observation is bad, and may + * cap the quality score by the mapping quality of the read itself. + * + * @param p Pileup element + * @param ignoreBadBases Should we ignore bad bases? + * @param capBaseQualsAtMappingQual Should we cap the base qualities at the mapping quality of the read? + * @param minBaseQual Minimum allowed base quality + * @return the actual base quality to use + */ + private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { + if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) ) + return 0; + + byte qual = p.getQual(); + + if ( qual > SAMUtils.MAX_PHRED_SCORE ) + throw new UserException.MisencodedBAM(p.getRead(), "we encountered an extremely high quality score (" + (int)qual + ")"); + if ( capBaseQualsAtMappingQual ) + qual = (byte) Math.min( 0xff & qual, p.getMappingQual()); + if ( (int)qual < minBaseQual ) + qual = (byte)0; + + return qual; + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // + // helper routines + // + // + // ----------------------------------------------------------------------------------------------------------------- + + /** + * Return a string representation of this object in a moderately usable form + * + * @return string representation + */ + public String toString() { + double sum = 0; + StringBuilder s = new StringBuilder(); + for (DiploidGenotype g : DiploidGenotype.values()) { + s.append(String.format("%s 
%.10f ", g, log10Likelihoods[g.ordinal()])); + sum += Math.pow(10,log10Likelihoods[g.ordinal()]); + } + s.append(String.format(" %f", sum)); + return s.toString(); + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // + // Validation routines + // + // + // ----------------------------------------------------------------------------------------------------------------- + + public boolean validate() { + return validate(true); + } + + public boolean validate(boolean throwException) { + try { + for ( DiploidGenotype g : DiploidGenotype.values() ) { + String bad = null; + + int i = g.ordinal(); + if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) { + bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]); + } + + if ( bad != null ) { + throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad)); + } + } + } catch ( IllegalStateException e ) { + if ( throwException ) + throw new RuntimeException(e); + else + return false; + } + + return true; + } + + // + // Constant static data + // + private final static double[] genotypeZeros = new double[DiploidGenotype.values().length]; + private final static double[] baseZeros = new double[BaseUtils.BASES.length]; + + static { + for ( DiploidGenotype g : DiploidGenotype.values() ) { + genotypeZeros[g.ordinal()] = 0.0; + } + for ( byte base : BaseUtils.BASES ) { + baseZeros[BaseUtils.simpleBaseToBaseIndex(base)] = 0.0; + } + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java new file mode 100644 index 000000000..a57502bc0 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java @@ -0,0 +1,342 @@ +/* +* By downloading the 
PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.LinkedHashMap; + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 7/21/11 + * Time: 2:21 PM + * + * This is a site based implementation of an Error Model. The error model is a probability + * distribution for the site given the phred scaled quality. 
+ */ +public class ErrorModel { + private byte maxQualityScore; + private byte minQualityScore; + private byte phredScaledPrior; + private double log10minPower; + private int refDepth; + private boolean hasData = false; + private ProbabilityVector probabilityVector; + private static final boolean compressRange = false; + + private static final double log10MinusE = Math.log10(Math.exp(1.0)); + private static final boolean DEBUG = false; + /** + * Calculates the probability of the data (reference sample reads) given the phred scaled site quality score. + * + * @param UAC Argument Collection + * @param refSamplePileup Reference sample pileup + * @param refSampleVC VC with True alleles in reference sample pileup + */ + public ErrorModel (final UnifiedArgumentCollection UAC, + final ReadBackedPileup refSamplePileup, + VariantContext refSampleVC, final ReferenceContext refContext) { + this.maxQualityScore = UAC.maxQualityScore; + this.minQualityScore = UAC.minQualityScore; + this.phredScaledPrior = UAC.phredScaledPrior; + log10minPower = Math.log10(UAC.minPower); + + PairHMMIndelErrorModel pairModel = null; + LinkedHashMap haplotypeMap = null; + double[][] perReadLikelihoods = null; + + double[] model = new double[maxQualityScore+1]; + Arrays.fill(model,Double.NEGATIVE_INFINITY); + + boolean hasCalledAlleles = false; + + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); + if (refSampleVC != null) { + + for (Allele allele : refSampleVC.getAlleles()) { + if (allele.isCalled()) { + hasCalledAlleles = true; + break; + } + } + haplotypeMap = new LinkedHashMap(); + if (refSampleVC.isIndel()) { + pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); + IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements + } 
+ } + + double p = QualityUtils.qualToErrorProbLog10((byte)(maxQualityScore-minQualityScore)); + if (refSamplePileup == null || refSampleVC == null || !hasCalledAlleles) { + for (byte q=minQualityScore; q<=maxQualityScore; q++) { + // maximum uncertainty if there's no ref data at site + model[q] = p; + } + this.refDepth = 0; + } + else { + hasData = true; + int matches = 0; + int coverage = 0; + + Allele refAllele = refSampleVC.getReference(); + + if ( refSampleVC.isIndel()) { + //perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()]; + final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles()); + if (!haplotypeMap.isEmpty()) + perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, perReadAlleleLikelihoodMap); + } + int idx = 0; + for (PileupElement refPileupElement : refSamplePileup) { + if (DEBUG) + System.out.println(refPileupElement.toString()); + boolean isMatch = false; + for (Allele allele : refSampleVC.getAlleles()) { + boolean m = pileupElementMatches(refPileupElement, allele, refAllele, refContext.getBase()); + if (DEBUG) System.out.println(m); + isMatch |= m; + } + if (refSampleVC.isIndel() && !haplotypeMap.isEmpty()) { + // ignore match/mismatch if reads, as determined by their likelihood, are not informative + double[] perAlleleLikelihoods = perReadLikelihoods[idx++]; + if (!isInformativeElement(perAlleleLikelihoods)) + matches++; + else + matches += (isMatch?1:0); + + } else { + matches += (isMatch?1:0); + } + coverage++; + } + + int mismatches = coverage - matches; + //System.out.format("Cov:%d match:%d mismatch:%d\n",coverage, matches, mismatches); + for (byte q=minQualityScore; q<=maxQualityScore; q++) { + if (coverage==0) + model[q] = p; + else + model[q] = log10PoissonProbabilitySiteGivenQual(q,coverage, mismatches); + } + this.refDepth = coverage; + } + + // compress probability vector + 
this.probabilityVector = new ProbabilityVector(model, compressRange); + } + + + @Requires("likelihoods.length>0") + private boolean isInformativeElement(double[] likelihoods) { + // if likelihoods are the same, they're not informative + final double thresh = 0.1; + int maxIdx = MathUtils.maxElementIndex(likelihoods); + int minIdx = MathUtils.minElementIndex(likelihoods); + if (likelihoods[maxIdx]-likelihoods[minIdx]< thresh) + return false; + else + return true; + } + /** + * Simple constructor that just takes a given log-probability vector as error model. + * Only intended for unit testing, not general usage. + * @param pvector Given vector of log-probabilities + * + */ + public ErrorModel(double[] pvector) { + this.maxQualityScore = (byte)(pvector.length-1); + this.minQualityScore = 0; + this.probabilityVector = new ProbabilityVector(pvector, compressRange); + this.hasData = true; + + } + + public static boolean pileupElementMatches(PileupElement pileupElement, Allele allele, Allele refAllele, byte refBase) { + if (DEBUG) + System.out.format("PE: base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d Allele:%s RefAllele:%s\n", + pileupElement.getBase(), pileupElement.isBeforeDeletionStart(), + pileupElement.isBeforeInsertion(),pileupElement.getBasesOfImmediatelyFollowingInsertion(),pileupElement.getLengthOfImmediatelyFollowingIndel(), allele.toString(), refAllele.toString()); + + //pileupElement. + // if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch + if (allele.isReference()) { + // for a ref allele, any base mismatch or new indel is a mismatch. + if(allele.getBases().length>0) + // todo - can't check vs. 
allele because allele is not padded so it doesn't include the reference base at this location + // could clean up/simplify this when unpadding is removed + return (pileupElement.getBase() == refBase && !pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart()); + else + // either null allele to compare, or ref/alt lengths are different (indel by definition). + // if we have an indel that we are comparing against a REF allele, any indel presence (of any length/content) is a mismatch + return (!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart()); + } + + // for non-ref alleles to compare: + if (refAllele.getBases().length == allele.getBases().length) + // alleles have the same length (eg snp or mnp) + return pileupElement.getBase() == allele.getBases()[0]; + + // for non-ref alleles, + byte[] alleleBases = allele.getBases(); + int eventLength = alleleBases.length - refAllele.getBases().length; + if (eventLength < 0 && pileupElement.isBeforeDeletionStart() && pileupElement.getLengthOfImmediatelyFollowingIndel() == -eventLength) + return true; + + if (eventLength > 0 && pileupElement.isBeforeInsertion() && + Arrays.equals(pileupElement.getBasesOfImmediatelyFollowingInsertion().getBytes(),Arrays.copyOfRange(alleleBases,1,alleleBases.length))) // allele contains ref byte, but pileupElement's event bases doesn't + return true; + + return false; + } + + + /** + * What's the log-likelihood that a site's quality is equal to q? If we see N observations and n mismatches, + * and assuming each match is independent of each other and that the match probability is just dependent of + * the site quality, so p = 10.^-q/10. + * Since we'll normally have relatively high Q sites and deep coverage in reference samples (ie p small, N high), + * to avoid underflows we'll use the Poisson approximation with lambda = N*p. + * Hence, the log-likelihood of q i.e. Pr(Nmismatches = n | SiteQ = q) ~ Poisson(n | lambda = p*N) with p as above. 
+ * @param q Desired q to get likelihood from + * @param coverage Total coverage + * @param mismatches Number of mismatches + * @return Likelihood of observations as a function of q + */ + @Requires({ + "q >= minQualityScore", + "q <= maxQualityScore", + "coverage >= 0", + "mismatches >= 0", + "mismatches <= coverage" + }) + private double log10PoissonProbabilitySiteGivenQual(byte q, int coverage, int mismatches) { + // same as log10ProbabilitySiteGivenQual but with Poisson approximation to avoid numerical underflows + double lambda = QualityUtils.qualToErrorProb(q) * (double )coverage; + // log10(e^-lambda*lambda^k/k!) = -lambda + k*log10(lambda) - log10factorial(k) + return Math.log10(lambda)*mismatches - lambda*log10MinusE- MathUtils.log10Factorial(mismatches); + } + + @Requires({"qual-minQualityScore <= maxQualityScore"}) + public double getSiteLogErrorProbabilityGivenQual (int qual) { + return probabilityVector.getLogProbabilityForIndex(qual); + } + + public byte getMaxQualityScore() { + return maxQualityScore; + } + + public byte getMinQualityScore() { + return minQualityScore; + } + + public int getMinSignificantQualityScore() { + return new ProbabilityVector(probabilityVector,true).getMinVal(); + } + + public int getMaxSignificantQualityScore() { + return new ProbabilityVector(probabilityVector,true).getMaxVal(); + } + + public int getReferenceDepth() { + return refDepth; + } + public boolean hasData() { + return hasData; + } + + public ProbabilityVector getErrorModelVector() { + return probabilityVector; + } + + public String toString() { + StringBuilder result = new StringBuilder("("); + boolean skipComma = true; + for (double v : probabilityVector.getProbabilityVector()) { + if (skipComma) { + skipComma = false; + } + else { + result.append(","); + } + result.append(String.format("%.4f", v)); + } + result.append(")"); + return result.toString(); + } + + public static int getTotalReferenceDepth(HashMap perLaneErrorModels) { + int n=0; + for (ErrorModel e 
: perLaneErrorModels.values()) { + n += e.getReferenceDepth(); + } + return n; + } + + /* +@Requires({"maxAlleleCount >= 0"}) +//todo -- memoize this function + public boolean hasPowerForMaxAC (int maxAlleleCount) { + int siteQ = (int) Math.ceil(MathUtils.probabilityToPhredScale((double) 1/maxAlleleCount)); + double log10CumSum = getCumulativeSum(siteQ); + return log10CumSum < log10minPower; + } */ +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java new file mode 100644 index 000000000..530ba3ef8 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java @@ -0,0 +1,269 @@ 
+/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset; +import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: 5/18/12 + * Time: 10:06 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotypeLikelihoods { + final PairHMMIndelErrorModel pairModel; + final LinkedHashMap haplotypeMap; + final ReferenceContext refContext; + final int eventLength; + double[][] readHaplotypeLikelihoods; + + final byte refBase; + final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap; + + public GeneralPloidyIndelGenotypeLikelihoods(final List alleles, + final double[] logLikelihoods, + final int ploidy, + final HashMap perLaneErrorModels, + final boolean ignoreLaneInformation, + final PairHMMIndelErrorModel pairModel, + final LinkedHashMap haplotypeMap, + final ReferenceContext referenceContext, + final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) { + super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation); + this.pairModel = pairModel; + this.haplotypeMap = haplotypeMap; + this.refContext = referenceContext; + this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles); + // todo - not needed if indel alleles have base at current position + this.refBase = referenceContext.getBase(); + this.perReadAlleleLikelihoodMap = perReadAlleleLikelihoodMap; + } + + // ------------------------------------------------------------------------------------- + // + // add() routines. These are the workhorse routines for calculating the overall genotype + // likelihoods given observed bases and reads. Includes high-level operators all the + // way down to single base and qual functions. 
+ // + // ------------------------------------------------------------------------------------- + + /** + * Updates likelihoods and posteriors to reflect the additional observations contained within the + * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the + * pileup + * + * @param pileup read pileup + * @param UAC the minimum base quality at which to consider a base valid + * @return the number of good bases found in the pileup + */ + public int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC) { + int n = 0; + + if (!hasReferenceSampleData) { + // no error models + return add(pileup, (ErrorModel)null); + } + for (String laneID : perLaneErrorModels.keySet() ) { + // get pileup for this lane + ReadBackedPileup perLanePileup; + if (ignoreLaneInformation) + perLanePileup = pileup; + else + perLanePileup = pileup.getPileupForLane(laneID); + + if (perLanePileup == null || perLanePileup.isEmpty()) + continue; + + ErrorModel errorModel = perLaneErrorModels.get(laneID); + n += add(perLanePileup, errorModel); + if (ignoreLaneInformation) + break; + + } + + return n; + } + + /** + * Calculates the pool's probability for all possible allele counts for all indel alleles observed. + * Calculation is based on the error model + * generated by the reference sample on the same lane. The probability is given by : + * + * Pr(ac = j1,j2,.. | pool, errorModel) = sum_over_all_Qs ( Pr(j1,j2,.. * Pr(errorModel_q) * + * Pr(ac=j1,j2,..| pool, errorModel) = sum_over_all_Qs ( Pr(ac=j1,j2,..) 
* Pr(errorModel_q) * + * [j1 * (1-eq)/2n + eq/3*(2*N-j1) + * [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC * + * jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT + * + * log Pr(ac=jA,jC,jG,jT| pool, errorModel) = logsum( Pr(ac=jA,jC,jG,jT) * Pr(errorModel_q) * + * [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC * + * jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT) + * = logsum(logPr(ac=jA,jC,jG,jT) + log(Pr(error_Model(q) + * )) + nA*log(jA/2N(1-eq)+eq/3*(2N-jA)/2N) + nC*log(jC/2N(1-eq)+eq/3*(2N-jC)/2N) + * + log(jG/2N(1-eq)+eq/3*(2N-jG)/2N) + log(jT/2N(1-eq)+eq/3*(2N-jT)/2N) + * + * Let Q(j,k) = log(j/2N*(1-e[k]) + (2N-j)/2N*e[k]/3) + * + * Then logPr(ac=jA,jC,jG,jT|D,errorModel) = logPR(ac=Ja,jC,jG,jT) + logsum_k( logPr (errorModel[k], + * nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k)) + * + * If pileup data comes from several error models (because lanes can have different error models), + * Pr(Ac=j|D,E1,E2) = sum(Pr(AC1=j1|D,E1,E2) * Pr(AC2=j-j2|D,E1,E2)) + * = sum(Pr(AC1=j1|D,E1)*Pr(AC2=j-j1|D,E2)) from j=0..2N + * + * So, for each lane, build error model and combine lanes. + * To store model, can do + * for jA=0:2N + * for jC = 0:2N-jA + * for jG = 0:2N-jA-jC + * for jT = 0:2N-jA-jC-jG + * Q(jA,jC,jG,jT) + * for k = minSiteQual:maxSiteQual + * likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k)) + * + * + * + * where: nA,nC,nG,nT = counts of bases observed in pileup. 
+ * + * + * @param pileup Base pileup + * @param errorModel Site error model + * @return Number of bases added + */ + private int add(ReadBackedPileup pileup, ErrorModel errorModel) { + int n=0; + + // Number of alleless in pileup, in that order + List numSeenBases = new ArrayList(this.alleles.size()); + + if (!hasReferenceSampleData) { + + readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, perReadAlleleLikelihoodMap); + n = readHaplotypeLikelihoods.length; + } else { + Allele refAllele = null; + for (Allele a:alleles) { + numSeenBases.add(0); + if (a.isReference()) + refAllele = a; + } + + if (refAllele == null) + throw new ReviewedStingException("BUG: no ref alleles in passed in allele list!"); + + // count number of elements in pileup + for (PileupElement elt : pileup) { + if (VERBOSE) + System.out.format("base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d\n",elt.getBase(), elt.isBeforeDeletionStart(),elt.isBeforeInsertion(),elt.getBasesOfImmediatelyFollowingInsertion(),elt.getLengthOfImmediatelyFollowingIndel()); + int idx =0; + for (Allele allele : alleles) { + int cnt = numSeenBases.get(idx); + numSeenBases.set(idx++,cnt + (ErrorModel.pileupElementMatches(elt, allele, refAllele, refBase)?1:0)); + } + + n++; + + } + } + computeLikelihoods(errorModel, alleles, numSeenBases, pileup); + return n; + } + + + + /** + * Compute likelihood of current conformation + * + * @param ACset Count to compute + * @param errorModel Site-specific error model object + * @param alleleList List of alleles + * @param numObservations Number of observations for each allele in alleleList + */ + public void getLikelihoodOfConformation(final ExactACset ACset, + final ErrorModel errorModel, + final List alleleList, + final List numObservations, + final ReadBackedPileup pileup) { + final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), alleleList.size()); + double p1 = 0.0; + + if 
(!hasReferenceSampleData) { + // no error model: use pair HMM likelihoods + for (int i=0; i < readHaplotypeLikelihoods.length; i++) { + double acc[] = new double[alleleList.size()]; + for (int k=0; k < acc.length; k++ ) + acc[k] = readHaplotypeLikelihoods[i][k] + MathUtils.log10Cache[currentCnt[k]]-LOG10_PLOIDY; + p1 += MathUtils.log10sumLog10(acc); + } + + } else { + final int minQ = errorModel.getMinSignificantQualityScore(); + final int maxQ = errorModel.getMaxSignificantQualityScore(); + final double[] acVec = new double[maxQ - minQ + 1]; + + + for (int k=minQ; k<=maxQ; k++) { + int idx=0; + for (int n : numObservations) + acVec[k-minQ] += n*logMismatchProbabilityArray[currentCnt[idx++]][k]; + } + p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec); + } + ACset.getLog10Likelihoods()[0] = p1; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java 
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java new file mode 100644 index 000000000..95d3fb78b --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -0,0 +1,141 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.List; +import java.util.Map; + + +/** + * The model representing how we calculate genotype likelihoods + */ +public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { + + public static final String DUMMY_LANE = "Lane1"; + public static final String DUMMY_SAMPLE_NAME = "DummySample1"; + + /* public enum Model { + SNP, + INDEL, + BOTH + } + */ + public enum Model { + SNP, + INDEL, + GENERALPLOIDYSNP, + GENERALPLOIDYINDEL, + BOTH + } + + public enum GENOTYPING_MODE { + /** the Unified Genotyper will choose the most likely alternate allele */ + DISCOVERY, + /** only the alleles passed in from a VCF rod bound to the -alleles argument will be used for genotyping */ + GENOTYPE_GIVEN_ALLELES + } + + protected final UnifiedArgumentCollection UAC; + protected Logger logger; + + /** + * Create a new object + * @param logger logger + * @param UAC unified arg collection + */ + protected GenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { + if ( logger == null || UAC == null ) throw new ReviewedStingException("Bad arguments"); + this.UAC = UAC; + this.logger = logger; + } + + /** + * Can be overridden by concrete subclasses + 
* + * @param tracker rod data + * @param ref reference context + * @param contexts stratified alignment contexts + * @param contextType stratified context type + * @param allAllelesToUse the alternate allele to use, null if not set + * @param useBAQedPileup should we use the BAQed pileup or the raw one? + * @param locParser Genome Loc Parser + * @return variant context where genotypes are no-called but with GLs + */ + public abstract VariantContext getLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map contexts, + final AlignmentContextUtils.ReadOrientation contextType, + final List allAllelesToUse, + final boolean useBAQedPileup, + final GenomeLocParser locParser, + final Map perReadAlleleLikelihoodMap); + + + protected int getFilteredDepth(ReadBackedPileup pileup) { + int count = 0; + for ( PileupElement p : pileup ) { + if ( BaseUtils.isRegularBase( p.getBase() ) ) + count++; + } + + return count; + } + +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java new file mode 100644 index 000000000..ae2ea427b --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -0,0 +1,262 @@ +/* +* By downloading the PROGRAM you agree to the following 
terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.variant.variantcontext.*; + +import java.util.*; + +public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { + private static final int HAPLOTYPE_SIZE = 80; + + private boolean DEBUG = false; + private boolean ignoreSNPAllelesWhenGenotypingIndels = false; + private PairHMMIndelErrorModel pairModel; + + + private LinkedHashMap haplotypeMap; + + private List alleleList = 
new ArrayList(); + + + protected IndelGenotypeLikelihoodsCalculationModel(final UnifiedArgumentCollection UAC, + final Logger logger) { + super(UAC, logger); + pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); + DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; + haplotypeMap = new LinkedHashMap(); + ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; + } + + protected static List computeConsensusAlleles(final ReferenceContext ref, + final Map contexts, + final AlignmentContextUtils.ReadOrientation contextType, + final UnifiedArgumentCollection UAC) { + ConsensusAlleleCounter counter = new ConsensusAlleleCounter(true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); + return counter.computeConsensusAlleles(ref, contexts, contextType); + } + + private final static EnumSet allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED); + + + public VariantContext getLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map contexts, + final AlignmentContextUtils.ReadOrientation contextType, + final List allAllelesToUse, + final boolean useBAQedPileup, + final GenomeLocParser locParser, + final Map perReadAlleleLikelihoodMap) { + + GenomeLoc loc = ref.getLocus(); +// if (!ref.getLocus().equals(lastSiteVisited)) { + if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) { + // starting a new site: clear allele list + haplotypeMap.clear(); + perReadAlleleLikelihoodMap.clear(); // clean mapping sample-> per read, per allele likelihoods + alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, UAC, ignoreSNPAllelesWhenGenotypingIndels); + if (alleleList.isEmpty()) + return null; + } + + getHaplotypeMapFromAlleles(alleleList, ref, loc, haplotypeMap); // will update haplotypeMap adding elements + if (haplotypeMap == null || haplotypeMap.isEmpty()) + return null; + + // start making 
the VariantContext + // For all non-snp VC types, VC end location is just startLocation + length of ref allele including padding base. + final int endLoc = loc.getStart() + alleleList.get(0).length() - 1; + final int eventLength = getEventLength(alleleList); + + final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleleList); + + // create the genotypes; no-call everyone for now + GenotypesContext genotypes = GenotypesContext.create(); + final List noCall = new ArrayList(); + noCall.add(Allele.NO_CALL); + + // For each sample, get genotype likelihoods based on pileup + // compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them. + + for (Map.Entry sample : contexts.entrySet()) { + AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + + if (!perReadAlleleLikelihoodMap.containsKey(sample.getKey())){ + // no likelihoods have been computed for this sample at this site + perReadAlleleLikelihoodMap.put(sample.getKey(), new PerReadAlleleLikelihoodMap()); + } + final ReadBackedPileup pileup = context.getBasePileup(); + if (pileup != null) { + final GenotypeBuilder b = new GenotypeBuilder(sample.getKey()); + final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey())); + b.PL(genotypeLikelihoods); + b.DP(getFilteredDepth(pileup)); + genotypes.add(b.make()); + + if (DEBUG) { + System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString()); + for (int k = 0; k < genotypeLikelihoods.length; k++) + System.out.format("%1.4f ", genotypeLikelihoods[k]); + System.out.println(); + } + } + } + + return builder.genotypes(genotypes).make(); + } + + public static void getHaplotypeMapFromAlleles(final List alleleList, + final ReferenceContext ref, + final GenomeLoc 
loc, + final LinkedHashMap haplotypeMap) { + // protect against having an indel too close to the edge of a contig + if (loc.getStart() <= HAPLOTYPE_SIZE) + haplotypeMap.clear(); + // check if there is enough reference window to create haplotypes (can be an issue at end of contigs) + else if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE) + haplotypeMap.clear(); + else if (alleleList.isEmpty()) + haplotypeMap.clear(); + else { + final int eventLength = getEventLength(alleleList); + final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1; + final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1; + + if (hsize <= 0) // protect against event lengths larger than ref window sizes + haplotypeMap.clear(); + else + haplotypeMap.putAll(Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), + ref, hsize, numPrefBases)); + } + } + + public static int getEventLength(List alleleList) { + Allele refAllele = alleleList.get(0); + Allele altAllele = alleleList.get(1); + // look for alt allele that has biggest length distance to ref allele + int maxLenDiff = 0; + for (Allele a : alleleList) { + if (a.isNonReference()) { + int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length()); + if (lenDiff > maxLenDiff) { + maxLenDiff = lenDiff; + altAllele = a; + } + } + } + + return altAllele.getBaseString().length() - refAllele.getBaseString().length(); + + } + + public static List getInitialAlleleList(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map contexts, + final AlignmentContextUtils.ReadOrientation contextType, + final UnifiedArgumentCollection UAC, + final boolean ignoreSNPAllelesWhenGenotypingIndels) { + + List alleles = new ArrayList(); + if (UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { + VariantContext vc = null; + for (final VariantContext vc_input : tracker.getValues(UAC.alleles, ref.getLocus())) { + if (vc_input != null && + 
allowableTypes.contains(vc_input.getType()) && + ref.getLocus().getStart() == vc_input.getStart()) { + vc = vc_input; + break; + } + } + // ignore places where we don't have a variant + if (vc == null) + return alleles; + + if (ignoreSNPAllelesWhenGenotypingIndels) { + // if there's an allele that has same length as the reference (i.e. a SNP or MNP), ignore it and don't genotype it + for (Allele a : vc.getAlleles()) + if (a.isNonReference() && a.getBases().length == vc.getReference().getBases().length) + continue; + else + alleles.add(a); + + } else { + alleles.addAll(vc.getAlleles()); + } + + } else { + alleles = computeConsensusAlleles(ref, contexts, contextType, UAC); + } + return alleles; + } + + // Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup, + // so that per-sample DP will include deletions covering the event. + protected int getFilteredDepth(ReadBackedPileup pileup) { + int count = 0; + for (PileupElement p : pileup) { + if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase())) + count++; + } + + return count; + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ProbabilityVector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ProbabilityVector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ProbabilityVector.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ProbabilityVector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java new file mode 100644 index 000000000..c5070a76f --- /dev/null +++ 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -0,0 +1,844 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import 
org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.variant.variantcontext.*; + +import java.io.PrintStream; +import java.lang.reflect.Constructor; +import java.util.*; + +public class UnifiedGenotyperEngine { + public static final String LOW_QUAL_FILTER_NAME = "LowQual"; + private static final String GPSTRING = "GENERALPLOIDY"; + + public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA"; + public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL"; + + public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3; + public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4; + + private static final int SNP_MODEL = 0; + private static final int INDEL_MODEL = 1; + + public enum OUTPUT_MODE { + /** produces calls only at variant sites */ + EMIT_VARIANTS_ONLY, + /** produces calls at variant sites and confident reference sites */ + EMIT_ALL_CONFIDENT_SITES, + /** produces calls at any callable site regardless of confidence; this argument is intended only for point + * mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by + * no means produce a comprehensive set of indels in DISCOVERY mode */ + EMIT_ALL_SITES + } + + // the unified argument collection + private final UnifiedArgumentCollection UAC; + public UnifiedArgumentCollection getUAC() { return UAC; } + + // the annotation engine + private final VariantAnnotatorEngine annotationEngine; + + // the model used for calculating genotypes + private ThreadLocal> glcm = new ThreadLocal>(); + private final List modelsToUse = new ArrayList(2); + + // the model used for calculating p(non-ref) + private ThreadLocal afcm = new 
ThreadLocal(); + + // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything + private final double[] log10AlleleFrequencyPriorsSNPs; + private final double[] log10AlleleFrequencyPriorsIndels; + + // samples in input + private final Set samples; + + // the various loggers and writers + private final Logger logger; + private final PrintStream verboseWriter; + + // number of chromosomes (ploidy * samples) in input + private final int ploidy; + private final int N; + + // the standard filter to use for calls below the confidence threshold but above the emit threshold + private static final Set filter = new HashSet(1); + + private final GenomeLocParser genomeLocParser; + private final boolean BAQEnabledOnCMDLine; + + // --------------------------------------------------------------------------------------------------------- + // + // Public interface functions + // + // --------------------------------------------------------------------------------------------------------- + @Requires({"toolkit != null", "UAC != null"}) + public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) { + this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), GATKVariantContextUtils.DEFAULT_PLOIDY); + } + + protected UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, Set samples, UnifiedArgumentCollection UAC) { + this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); + } + + @Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0","ploidy>0"}) + public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set samples, int ploidy) { + this.BAQEnabledOnCMDLine = 
toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF; + genomeLocParser = toolkit.getGenomeLocParser(); + this.samples = new TreeSet(samples); + // note that, because we cap the base quality by the mapping quality, minMQ cannot be less than minBQ + this.UAC = UAC; + + this.logger = logger; + this.verboseWriter = verboseWriter; + this.annotationEngine = engine; + + this.ploidy = ploidy; + this.N = samples.size() * ploidy; + log10AlleleFrequencyPriorsSNPs = new double[N+1]; + log10AlleleFrequencyPriorsIndels = new double[N+1]; + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity,UAC.inputPrior); + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.inputPrior); + + filter.add(LOW_QUAL_FILTER_NAME); + + determineGLModelsToUse(); + + // do argument checking + if (UAC.annotateAllSitesWithPLs) { + if (!modelsToUse.contains(GenotypeLikelihoodsCalculationModel.Model.SNP)) + throw new IllegalArgumentException("Invalid genotype likelihood model specification: Only diploid SNP model can be used in conjunction with option allSitePLs"); + + } + } + + /** + * @see #calculateLikelihoodsAndGenotypes(org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker, org.broadinstitute.sting.gatk.contexts.ReferenceContext, org.broadinstitute.sting.gatk.contexts.AlignmentContext, java.util.Set) + * + * same as the full call but with allSamples == null + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @return the VariantCallContext object + */ + public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext) { + return calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, null); + } + + + /** + * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper. 
+ * + * If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype + * for every sample in allSamples. If it's null there's no such guarentee. Providing this + * argument is critical when the resulting calls will be written to a VCF file. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @param allSamples set of all sample names that we might call (i.e., those in the VCF header) + * @return the VariantCallContext object + */ + public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Set allSamples) { + final List results = new ArrayList(2); + + final List models = getGLModelsToUse(tracker, refContext, rawContext); + + final Map perReadAlleleLikelihoodMap = new HashMap(); + + if ( models.isEmpty() ) { + results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, null, rawContext) : null); + } + else { + for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) { + perReadAlleleLikelihoodMap.clear(); + final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); + if ( stratifiedContexts == null ) { + results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? 
generateEmptyContext(tracker, refContext, null, rawContext) : null); + } + else { + final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); + if ( vc != null ) + results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); +// todo - uncomment if we want to also emit a null ref call (with no QUAL) if there's no evidence for REF and if EMIT_ALL_SITES is set +// else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) +// results.add(generateEmptyContext(tracker, refContext, null, rawContext)); + + } + } + } + + return results; + } + + /** + * Compute GLs at a given locus. Entry point for engine calls from UGCalcLikelihoods. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @param perReadAlleleLikelihoodMap Map to store per-sample, per-read, per-allele likelihoods (only used for indels) + * @return the VariantContext object + */ + public VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Map perReadAlleleLikelihoodMap) { + final List models = getGLModelsToUse(tracker, refContext, rawContext); + if ( models.isEmpty() ) { + return null; + } + + for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) { + final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); + // return the first valid one we encounter + if ( stratifiedContexts != null ) + return calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); + + } + + return null; + } + + /** + * Compute genotypes at a given locus. 
Entry point for engine calls from UGCallVariants. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @param vc the GL-annotated variant context + * @return the VariantCallContext object + */ + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final VariantContext vc) { + final List models = getGLModelsToUse(tracker, refContext, rawContext); + if ( models.isEmpty() ) { + return null; + } + + // return the first one + final GenotypeLikelihoodsCalculationModel.Model model = models.get(0); + final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, null); + } + + /** + * Compute genotypes at a given locus. + * + * @param vc the GL-annotated variant context + * @return the VariantCallContext object + */ + public VariantCallContext calculateGenotypes(VariantContext vc) { + return calculateGenotypes(null, null, null, null, vc, GenotypeLikelihoodsCalculationModel.Model.valueOf("SNP"), null); + } + + + // --------------------------------------------------------------------------------------------------------- + // + // Private implementation helpers + // + // --------------------------------------------------------------------------------------------------------- + + // private method called by both UnifiedGenotyper and UGCalcLikelihoods entry points into the engine + private VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final Map stratifiedContexts, + final AlignmentContextUtils.ReadOrientation type, + final List alternateAllelesToUse, + final boolean useBAQedPileup, + final GenotypeLikelihoodsCalculationModel.Model model, + final Map perReadAlleleLikelihoodMap) { + + // 
initialize the data for this thread if that hasn't been done yet + if ( glcm.get() == null ) { + glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); + } + + return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap); + } + + private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { + VariantContext vc; + if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles); + if ( vcInput == null ) + return null; + vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles()).make(); + } else { + // deal with bad/non-standard reference bases + if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) ) + return null; + + Set alleles = new HashSet(); + alleles.add(Allele.create(ref.getBase(), true)); + vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make(); + } + + if ( annotationEngine != null ) { + // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations + final ReadBackedPileup pileup = rawContext.getBasePileup(); + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); + + vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc); + } + + return new VariantCallContext(vc, false); + } + + public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap) { + return calculateGenotypes(null, null, null, null, vc, model, 
perReadAlleleLikelihoodMap); + } + + public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { + return calculateGenotypes(null, null, null, null, vc, model, null); + } + + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Map stratifiedContexts, + final VariantContext vc, + final GenotypeLikelihoodsCalculationModel.Model model, + final Map perReadAlleleLikelihoodMap) { + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false, perReadAlleleLikelihoodMap); + } + + /** + * Main entry function to calculate genotypes of a given VC with corresponding GL's + * @param tracker Tracker + * @param refContext Reference context + * @param rawContext Raw context + * @param stratifiedContexts Stratified alignment contexts + * @param vc Input VC + * @param model GL calculation model + * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc + * @return VC with assigned genotypes + */ + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext rawContext, Map stratifiedContexts, + final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, + final boolean inheritAttributesFromInputVC, + final Map perReadAlleleLikelihoodMap) { + + boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; + + // TODO TODO TODO TODO + // REFACTOR THIS FUNCTION, TOO UNWIELDY!! 
+ + // initialize the data for this thread if that hasn't been done yet + if ( afcm.get() == null ) { + afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger)); + } + + // if input VC can't be genotyped, exit with either null VCC or, in case where we need to emit all sites, an empty call + if (!canVCbeGenotyped(vc)) { + if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && !limitedContext) + return generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext); + else + return null; + + } + + // estimate our confidence in a reference call and return + if ( vc.getNSamples() == 0 ) { + if ( limitedContext ) + return null; + return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ? + estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), false, 1.0) : + generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); + } + + final AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model)); + + // is the most likely frequency conformation AC=0 for all alternate alleles? 
+ boolean bestGuessIsRef = true; + + // determine which alternate alleles have AF>0 + final List myAlleles = new ArrayList<>(vc.getAlleles().size()); + final List alleleCountsofMLE = new ArrayList<>(vc.getAlleles().size()); + myAlleles.add(vc.getReference()); + for ( int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++ ) { + final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i); + if ( alternateAllele.isReference() ) + continue; + + // Compute if the site is considered polymorphic with sufficient confidence relative to our + // phred-scaled emission QUAL + final boolean isNonRef = AFresult.isPolymorphicPhredScaledQual(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); + final boolean toInclude = isNonRef || alternateAllele == GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE || + UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || + UAC.annotateAllSitesWithPLs; + + bestGuessIsRef &= !isNonRef; + + if (toInclude) { + myAlleles.add(alternateAllele); + alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); + } + } + + final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0()); + + // note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice + final double phredScaledConfidence = + Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || UAC.annotateAllSitesWithPLs + ? 
-10 * AFresult.getLog10PosteriorOfAFEq0() + : -10 * AFresult.getLog10PosteriorOfAFGT0()); + + // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero + if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) { + // technically, at this point our confidence in a reference call isn't accurately estimated + // because it didn't take into account samples with no data, so let's get a better estimate + return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0); + } + + // start constructing the resulting VC + final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc); + final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles); + builder.log10PError(phredScaledConfidence/-10.0); + if ( ! passesCallThreshold(phredScaledConfidence) ) + builder.filters(filter); + + // create the genotypes + final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true,ploidy); + builder.genotypes(genotypes); + + // print out stats if we have a writer + if ( verboseWriter != null && !limitedContext ) + printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model); + + // *** note that calculating strand bias involves overwriting data structures, so we do that last + final HashMap attributes = new HashMap(); + + // inherit attributed from input vc if requested + if (inheritAttributesFromInputVC) + attributes.putAll(vc.getAttributes()); + // if the site was downsampled, record that fact + if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) + attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); + + if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) + attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size()); + + // add the MLE AC and AF annotations + if ( alleleCountsofMLE.size() 
> 0 ) { + attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE); + final int AN = builder.make().getCalledChrCount(); + final ArrayList MLEfrequencies = new ArrayList(alleleCountsofMLE.size()); + // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT is ./. but the exact model may arbitrarily choose an AC>1) + for ( int AC : alleleCountsofMLE ) + MLEfrequencies.add(Math.min(1.0, (double)AC / (double)AN)); + attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies); + } + + if ( UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) { + //final boolean DEBUG_SLOD = false; + + // the overall lod + //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; + final double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); + //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); + + final List allAllelesToUse = builder.make().getAlleles(); + + // the forward lod + final VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); + final AFCalcResult forwardAFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model)); + //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); + final double forwardLog10PofNull = forwardAFresult.getLog10LikelihoodOfAFEq0(); + final double forwardLog10PofF = forwardAFresult.getLog10LikelihoodOfAFGT0(); + //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); + + // the reverse lod + final VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); + final AFCalcResult reverseAFresult = 
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model)); + //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); + final double reverseLog10PofNull = reverseAFresult.getLog10LikelihoodOfAFEq0(); + final double reverseLog10PofF = reverseAFresult.getLog10LikelihoodOfAFGT0(); + //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); + + final double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; + final double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF; + //if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); + + // strand score is max bias between forward and reverse strands + double strandScore = Math.max(forwardLod, reverseLod); + // rescale by a factor of 10 + strandScore *= 10.0; + //logger.debug(String.format("SLOD=%f", strandScore)); + + if ( !Double.isNaN(strandScore) ) + attributes.put("SB", strandScore); + } + + // finish constructing the resulting VC + builder.attributes(attributes); + VariantContext vcCall = builder.make(); + + if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine + // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations + final ReadBackedPileup pileup = rawContext.getBasePileup(); + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); + + vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap); + } + + // if we are subsetting alleles (either because there were too many or because some were not polymorphic) + // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). 
+ if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync + vcCall = GATKVariantContextUtils.reverseTrimAlleles(vcCall); + + return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0)); + } + + /** + * Determine whether input VC to calculateGenotypes() can be genotyped and AF can be computed. + * @param vc Input VC + * @return Status check + */ + @Requires("vc != null") + protected boolean canVCbeGenotyped(final VariantContext vc) { + // protect against too many alternate alleles that we can't even run AF on: + if (vc.getNAlleles()> GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) { + logger.warn("Attempting to genotype more than "+GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED + + " alleles. Site will be skipped at location "+vc.getChr()+":"+vc.getStart()); + return false; + } + else return true; + + } + + private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) { + + if ( !BaseUtils.isRegularBase(refContext.getBase()) ) + return null; + + Map stratifiedContexts = null; + + if ( model.name().contains("INDEL") ) { + + final ReadBackedPileup pileup = rawContext.getBasePileup().getMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE); + // don't call when there is no coverage + if ( pileup.getNumberOfElements() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) + return null; + + // stratify the AlignmentContext and cut by sample + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); + + } else if ( model.name().contains("SNP") ) { + + // stratify the AlignmentContext and cut by sample + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup()); + + if ( !(UAC.OutputMode == 
OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ) { + int numDeletions = 0; + for ( final PileupElement p : rawContext.getBasePileup() ) { + if ( p.isDeletion() ) + numDeletions++; + } + if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().depthOfCoverage()) > UAC.MAX_DELETION_FRACTION ) { + return null; + } + } + } + + return stratifiedContexts; + } + + private final double getRefBinomialProbLog10(final int depth) { + return MathUtils.log10BinomialProbability(depth, 0); + } + + private VariantCallContext estimateReferenceConfidence(VariantContext vc, Map contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) { + if ( contexts == null ) + return null; + + double log10POfRef = Math.log10(initialPofRef); + + // for each sample that we haven't examined yet + for ( String sample : samples ) { + final AlignmentContext context = contexts.get(sample); + if ( ignoreCoveredSamples && context != null ) + continue; + final int depth = context == null ? 
0 : context.getBasePileup().depthOfCoverage(); + log10POfRef += estimateLog10ReferenceConfidenceForOneSample(depth, theta); + } + + return new VariantCallContext(vc, QualityUtils.phredScaleLog10CorrectRate(log10POfRef) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false); + } + + /** + * Compute the log10 probability of a sample with sequencing depth and no alt allele is actually truly homozygous reference + * + * Assumes the sample is diploid + * + * @param depth the depth of the sample + * @param theta the heterozygosity of this species (between 0 and 1) + * @return a valid log10 probability of the sample being hom-ref + */ + @Requires({"depth >= 0", "theta >= 0.0 && theta <= 1.0"}) + @Ensures("MathUtils.goodLog10Probability(result)") + protected double estimateLog10ReferenceConfidenceForOneSample(final int depth, final double theta) { + final double log10PofNonRef = Math.log10(theta / 2.0) + getRefBinomialProbLog10(depth); + return MathUtils.log10OneMinusX(Math.pow(10.0, log10PofNonRef)); + } + + protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, final GenotypeLikelihoodsCalculationModel.Model model) { + Allele refAllele = null, altAllele = null; + for ( Allele allele : vc.getAlleles() ) { + if ( allele.isReference() ) + refAllele = allele; + else + altAllele = allele; + } + + for (int i = 0; i <= N; i++) { + StringBuilder AFline = new StringBuilder("AFINFO\t"); + AFline.append(pos); + AFline.append("\t"); + AFline.append(refAllele); + AFline.append("\t"); + if ( altAllele != null ) + AFline.append(altAllele); + else + AFline.append("N/A"); + AFline.append("\t"); + AFline.append(i + "/" + N + "\t"); + AFline.append(String.format("%.2f\t", ((float)i)/N)); + AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); + verboseWriter.println(AFline.toString()); + } + + verboseWriter.println("P(f>0) = " + PofF); + verboseWriter.println("Qscore = " + phredScaledConfidence); + verboseWriter.println(); + 
} + + protected boolean passesEmitThreshold(double conf, boolean bestGuessIsRef) { + return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_CONFIDENT_SITES || !bestGuessIsRef) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); + } + + protected boolean passesCallThreshold(double conf) { + return conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING; + } + + protected boolean confidentlyCalled(double conf, double PofF) { + return conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING || + (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && QualityUtils.phredScaleErrorRate(PofF) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING); + } + + private void determineGLModelsToUse() { + String modelPrefix = ""; + if ( !UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY ) + modelPrefix = GPSTRING; + + // GGA mode => must initialize both the SNP and indel models + if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || + UAC.GLmodel.name().toUpperCase().contains("BOTH") ) { + modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP")); + modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); + } + else { + modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+UAC.GLmodel.name().toUpperCase())); + } + } + + // decide whether we are currently processing SNPs, indels, neither, or both + private List getGLModelsToUse(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext) { + if ( UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) + return modelsToUse; + + if ( modelsToUse.size() != 2 ) + throw new IllegalStateException("GGA mode assumes that we have initialized both the SNP and indel models but found " + modelsToUse); + + // if we're genotyping given 
alleles then we need to choose the model corresponding to the variant type requested + final VariantContext vcInput = getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles); + + if ( vcInput == null ) { + return Collections.emptyList(); // no work to be done + } else if ( vcInput.isSNP() ) { + return Collections.singletonList(modelsToUse.get(SNP_MODEL)); + } else if ( vcInput.isIndel() || vcInput.isMixed() ) { + return Collections.singletonList(modelsToUse.get(INDEL_MODEL)); + } else { + return Collections.emptyList(); // No support for other types yet + } + } + + /** + * Function that fills vector with allele frequency priors. By default, infinite-sites, neutral variation prior is used, + * where Pr(AC=i) = theta/i where theta is heterozygosity + * @param N Number of chromosomes + * @param priors (output) array to be filled with priors + * @param heterozygosity default heterozygosity to use, if inputPriors is empty + * @param inputPriors Input priors to use (in which case heterozygosity is ignored) + */ + public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List inputPriors) { + + + double sum = 0.0; + + if (!inputPriors.isEmpty()) { + // user-specified priors + if (inputPriors.size() != N) + throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 "); + + int idx = 1; + for (final double prior: inputPriors) { + if (prior < 0.0) + throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior"); + priors[idx++] = Math.log10(prior); + sum += prior; + } + } + else { + // for each i + for (int i = 1; i <= N; i++) { + final double value = heterozygosity / (double)i; + priors[i] = Math.log10(value); + sum += value; + } + } + + // protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics 
assumptions) + if (sum > 1.0) { + throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors."); + } + // null frequency for AF=0 is (1 - sum(all other frequencies)) + priors[0] = Math.log10(1.0 - sum); + } + + protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { + if (model.name().toUpperCase().contains("SNP")) + return log10AlleleFrequencyPriorsSNPs; + else if (model.name().toUpperCase().contains("INDEL")) + return log10AlleleFrequencyPriorsIndels; + else + throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); + + } + + protected double getTheta( final GenotypeLikelihoodsCalculationModel.Model model ) { + if( model.name().contains("SNP") ) + return HUMAN_SNP_HETEROZYGOSITY; + if( model.name().contains("INDEL") ) + return HUMAN_INDEL_HETEROZYGOSITY; + else throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); + } + + private static Map getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) { + + final Map glcm = new HashMap(); + final List> glmClasses = new PluginManager(GenotypeLikelihoodsCalculationModel.class).getPlugins(); + + for (int i = 0; i < glmClasses.size(); i++) { + final Class glmClass = glmClasses.get(i); + final String key = glmClass.getSimpleName().replaceAll("GenotypeLikelihoodsCalculationModel","").toUpperCase(); + try { + final Object args[] = new Object[]{UAC,logger}; + final Constructor c = glmClass.getDeclaredConstructor(UnifiedArgumentCollection.class, Logger.class); + glcm.put(key, (GenotypeLikelihoodsCalculationModel)c.newInstance(args)); + } + catch (Exception e) { + throw new UserException("The likelihoods model provided for the -glm argument (" + UAC.GLmodel + ") is not a valid option: " + 
e.getMessage()); + } + } + + return glcm; + } + + public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding allelesBinding) { + if ( tracker == null || ref == null || logger == null ) + return null; + VariantContext vc = null; + + // search for usable record + for ( final VariantContext vc_input : tracker.getValues(allelesBinding, loc) ) { + if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) { + if ( vc == null ) { + vc = vc_input; + } else { + logger.warn("Multiple valid VCF records detected in the alleles input file at site " + ref.getLocus() + ", only considering the first record"); + } + } + } + + return vc; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java similarity index 100% rename from 
protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java new file mode 100644 index 000000000..b778195a9 --- /dev/null +++ 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java @@ -0,0 +1,333 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.GenotypeLikelihoods; +import org.broadinstitute.variant.variantcontext.GenotypesContext; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.*; + +public abstract class DiploidExactAFCalc extends ExactAFCalc { + public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); + if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy); + } + + @Override + protected AFCalcResult computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors) { + final int numAlternateAlleles = vc.getNAlleles() - 1; + final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), 
true); + final int numSamples = genotypeLikelihoods.size()-1; + final int numChr = 2*numSamples; + + // queue of AC conformations to process + final LinkedList ACqueue = new LinkedList<>(); + + // mapping of ExactACset indexes to the objects + final HashMap indexesToACset = new HashMap<>(numChr+1); + + // add AC=0 to the queue + final int[] zeroCounts = new int[numAlternateAlleles]; + ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts)); + ACqueue.add(zeroSet); + indexesToACset.put(zeroSet.getACcounts(), zeroSet); + + while ( !ACqueue.isEmpty() ) { + getStateTracker().incNEvaluations(); // keep track of the number of evaluations + + // compute log10Likelihoods + final ExactACset set = ACqueue.remove(); + + calculateAlleleCountConformation(set, genotypeLikelihoods, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors); + + // clean up memory + indexesToACset.remove(set.getACcounts()); + //if ( DEBUG ) + // System.out.printf(" *** removing used set=%s%n", set.ACcounts); + } + + return getResultFromFinalState(vc, log10AlleleFrequencyPriors); + } + + + @Override + protected GenotypesContext reduceScopeGenotypes(final VariantContext vc, final List allelesToUse) { + return GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL); + } + + @Override + protected void reduceScopeCalculateLikelihoodSums(final VariantContext vc, final LikelihoodSum[] likelihoodSums) { + final ArrayList GLs = getGLs(vc.getGenotypes(), true); + for ( final double[] likelihoods : GLs ) { + final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); + if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL); + final int alleleLikelihoodIndex1 = alleles.alleleIndex1 - 1; + final int alleleLikelihoodIndex2 = alleles.alleleIndex2 - 1; + if ( alleles.alleleIndex1 != 0 ) + 
likelihoodSums[alleleLikelihoodIndex1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + // don't double-count it + if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 ) + likelihoodSums[alleleLikelihoodIndex2].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + } + } + } + + private static final class DependentSet { + public final int[] ACcounts; + public final int PLindex; + + public DependentSet(final int[] ACcounts, final int PLindex) { + this.ACcounts = ACcounts; + this.PLindex = PLindex; + } + } + + private double calculateAlleleCountConformation(final ExactACset set, + final ArrayList genotypeLikelihoods, + final int numChr, + final LinkedList ACqueue, + final HashMap indexesToACset, + final double[] log10AlleleFrequencyPriors) { + + //if ( DEBUG ) + // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); + + // compute the log10Likelihoods + computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors); + + final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; + + // can we abort early because the log10Likelihoods are so small? 
+ if ( getStateTracker().abort(log10LofK, set.getACcounts(), true) ) { + //if ( DEBUG ) + // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); + return log10LofK; + } + + // iterate over higher frequencies if possible + final int ACwiggle = numChr - set.getACsum(); + if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies + return log10LofK; + + final int numAltAlleles = set.getACcounts().getCounts().length; + + // add conformations for the k+1 case + for ( int allele = 0; allele < numAltAlleles; allele++ ) { + final int[] ACcountsClone = set.getACcounts().getCounts().clone(); + ACcountsClone[allele]++; + // to get to this conformation, a sample would need to be AB (remember that ref=0) + final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); + updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + } + + // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different + if ( ACwiggle > 1 ) { + final ArrayList differentAlleles = new ArrayList<>(numAltAlleles * numAltAlleles); + final ArrayList sameAlleles = new ArrayList<>(numAltAlleles); + + for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { + for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { + final int[] ACcountsClone = set.getACcounts().getCounts().clone(); + ACcountsClone[allele_i]++; + ACcountsClone[allele_j]++; + + // to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index) + final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1); + if ( allele_i == allele_j ) + sameAlleles.add(new DependentSet(ACcountsClone, PLindex)); + else + differentAlleles.add(new DependentSet(ACcountsClone, PLindex)); + } + } + + // IMPORTANT: we must first add the cases where the 2 new alleles are 
different so that the queue maintains its ordering + for ( DependentSet dependent : differentAlleles ) + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + for ( DependentSet dependent : sameAlleles ) + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + } + + return log10LofK; + } + + // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and + // also pushes its value to the given callingSetIndex. + private void updateACset(final int[] newSetCounts, + final int numChr, + final ExactACset dependentSet, + final int PLsetIndex, + final Queue ACqueue, + final HashMap indexesToACset, + final ArrayList genotypeLikelihoods) { + final ExactACcounts index = new ExactACcounts(newSetCounts); + if ( !indexesToACset.containsKey(index) ) { + ExactACset set = new ExactACset(numChr/2 +1, index); + indexesToACset.put(index, set); + ACqueue.add(set); + } + + // push data from the dependency to the new set + //if ( DEBUG ) + // System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts); + pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods); + } + + private void computeLofK(final ExactACset set, + final ArrayList genotypeLikelihoods, + final double[] log10AlleleFrequencyPriors) { + + set.getLog10Likelihoods()[0] = 0.0; // the zero case + final int totalK = set.getACsum(); + + // special case for k = 0 over all k + if ( totalK == 0 ) { + for ( int j = 1; j < set.getLog10Likelihoods().length; j++ ) + set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; + + final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; + getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); + getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + 
return; + } + + // if we got here, then k > 0 for at least one k. + // the non-AA possible conformations were already dealt with by pushes from dependent sets; + // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value + for ( int j = 1; j < set.getLog10Likelihoods().length; j++ ) { + + if ( totalK < 2*j-1 ) { + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.getLog10Likelihoods()[j-1] + gl[HOM_REF_INDEX]; + set.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[j], conformationValue); + } + + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j] - logDenominator; + } + + double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; + + // update the MLE if necessary + getStateTracker().updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); + + // apply the priors over each alternate allele + for ( final int ACcount : set.getACcounts().getCounts() ) { + if ( ACcount > 0 ) + log10LofK += log10AlleleFrequencyPriors[ACcount]; + } + + getStateTracker().updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); + } + + private void pushData(final ExactACset targetSet, + final ExactACset dependentSet, + final int PLsetIndex, + final ArrayList genotypeLikelihoods) { + final int totalK = targetSet.getACsum(); + + for ( int j = 1; j < targetSet.getLog10Likelihoods().length; j++ ) { + + if ( totalK <= 2*j ) { // skip impossible conformations + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = + determineCoefficient(PLsetIndex, j, targetSet.getACcounts().getCounts(), totalK) + dependentSet.getLog10Likelihoods()[j-1] + gl[PLsetIndex]; + targetSet.getLog10Likelihoods()[j] = 
MathUtils.approximateLog10SumLog10(targetSet.getLog10Likelihoods()[j], conformationValue); + } + } + } + + private double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { + // the closed form representation generalized for multiple alleles is as follows: + // AA: (2j - totalK) * (2j - totalK - 1) + // AB: 2k_b * (2j - totalK) + // AC: 2k_c * (2j - totalK) + // BB: k_b * (k_b - 1) + // BC: 2 * k_b * k_c + // CC: k_c * (k_c - 1) + + // find the 2 alleles that are represented by this PL index + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + + // *** note that throughout this method we subtract one from the alleleIndex because ACcounts *** + // *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. *** + + // the AX het case + if ( alleles.alleleIndex1 == 0 ) + return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK]; + + final int k_i = ACcounts[alleles.alleleIndex1-1]; + + // the hom var case (e.g. BB, CC, DD) + final double coeff; + if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) { + coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; + } + // the het non-ref case (e.g. BC, BD, CD) + else { + final int k_j = ACcounts[alleles.alleleIndex2-1]; + coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; + } + + return coeff; + } + + public GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy) { + return allelesToUse.size() == 1 + ? GATKVariantContextUtils.subsetToRefOnly(vc, ploidy) + : GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, + assignGenotypes ? 
GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN : GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java new file mode 100644 index 000000000..7b48b3d4d --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java @@ -0,0 +1,240 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; + +import java.util.*; + +/** + * Uses the Exact calculation of Heng Li + */ +abstract class ExactAFCalc extends AFCalc { + protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first + /** + * Sorts {@link org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactAFCalc.LikelihoodSum} instances where those with higher likelihood are first. + */ + protected static final Comparator LIKELIHOOD_SUM_COMPARATOR = new Comparator() { + + @Override + public int compare(final LikelihoodSum o1, final LikelihoodSum o2) { + return - Double.compare(o1.sum,o2.sum); + } + }; + /** + * Sorts {@link org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactAFCalc.LikelihoodSum} instances where those with higher likelihood are first but make sure that + * NON_REF alleles are place are last. 
+ */ + protected static final Comparator LIKELIHOOD_NON_REF_THEN_SUM_COMPARATOR = new Comparator() { + @Override + public int compare(final LikelihoodSum o1, final LikelihoodSum o2) { + if (o1.allele == GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) + return 1; + else if (o2.allele == GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) + return -1; + else + return o1.compareTo(o2); + } + }; + /** + * Sorts {@link org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactAFCalc.LikelihoodSum} instances where those with lower alternative allele index are first regardless of + * the likelihood sum. + */ + protected static final Comparator LIKELIHOOD_INDEX_COMPARATOR = new Comparator() { + @Override + public int compare(final LikelihoodSum o1, final LikelihoodSum o2) { + return Integer.compare(o1.index, o2.index); + } + }; + + protected ExactAFCalc(final int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); + } + + /** + * Wrapper class that compares two likelihoods associated with two alleles + */ + protected static final class LikelihoodSum implements Comparable { + public double sum = 0.0; + public final Allele allele; + public final int index; + + public LikelihoodSum(final Allele allele, final int index) { this.allele = allele; this.index = index; } + + public int compareTo(LikelihoodSum other) { + final double diff = sum - other.sum; + return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? 
-1 : 0; + } + } + + /** + * Unpack GenotypesContext into an ArrayList of double values + * @param GLs Input genotype context + * @return ArrayList of doubles corresponding to GL vectors + */ + protected static ArrayList getGLs(final GenotypesContext GLs, final boolean includeDummy) { + final ArrayList genotypeLikelihoods = new ArrayList<>(GLs.size() + 1); + + if ( includeDummy ) genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy + for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { + if ( sample.hasLikelihoods() ) { + final double[] gls = sample.getLikelihoods().getAsVector(); + + if ( MathUtils.sum(gls) < GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) + genotypeLikelihoods.add(gls); + } + } + + return genotypeLikelihoods; + } + + @Override + protected VariantContext reduceScope(final VariantContext vc) { + // don't try to genotype too many alternate alleles + final List inputAltAlleles = vc.getAlternateAlleles(); + final List outputAltAlleles = reduceScopeAlleles(vc,getMaxAltAlleles()); + + // only if output allele has reduced from the input alt allele set size we should care. 
+ final int altAlleleReduction = inputAltAlleles.size() - outputAltAlleles.size(); + + if (altAlleleReduction == 0) + return vc; + else if (altAlleleReduction != 0) { + logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + + " has " + (vc.getAlternateAlleles().size()) + + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + + final List alleles = new ArrayList<>(getMaxAltAlleles() + 1); + alleles.add(vc.getReference()); + alleles.addAll(reduceScopeAlleles(vc, getMaxAltAlleles())); + final VariantContextBuilder builder = new VariantContextBuilder(vc); + builder.alleles(alleles); + builder.genotypes(reduceScopeGenotypes(vc, alleles)); + if (altAlleleReduction < 0) + throw new IllegalStateException("unexpected: reduction increased the number of alt. alleles!: " + - altAlleleReduction + " " + vc + " " + builder.make()); + return builder.make(); + } else // if (altAlleleReduction < 0) + throw new IllegalStateException("unexpected: reduction increased the number of alt. alleles!: " + - altAlleleReduction + " " + vc); + } + + /** + * Returns a the new set of alleles to use. + * @param vc target variant context. + * @param numAllelesToChoose number of alleles to keep. + * @return the list of alternative allele to keep. + */ + protected List reduceScopeAlleles(final VariantContext vc, final int numAllelesToChoose) { + + // Look for the allele to exclude it from the pruning if present. + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + + final int nonRefAltAlleleIndex = GATKVariantContextUtils.indexOfAltAllele(vc, + GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE, false); + final boolean nonRefAltAllelePresent = nonRefAltAlleleIndex >= 0; + + // should not be considered in the downsizing, so we need to count it out when + // considering if alt. allele downsizing is required. 
+ final int numProperOriginalAltAlleles = numOriginalAltAlleles - (nonRefAltAllelePresent ? 1 : 0); + + // Avoid pointless allele reduction: + if (numAllelesToChoose >= numProperOriginalAltAlleles) + return vc.getAlternateAlleles(); + + final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) { + final Allele allele = vc.getAlternateAllele(i); + likelihoodSums[i] = new LikelihoodSum(allele,i); + } + + // Calculate the allele likelihood sums. + reduceScopeCalculateLikelihoodSums(vc, likelihoodSums); + + // sort them by probability mass and choose the best ones + // Make sure that the allele is last if present. + Collections.sort(Arrays.asList(likelihoodSums), nonRefAltAllelePresent ? LIKELIHOOD_NON_REF_THEN_SUM_COMPARATOR : LIKELIHOOD_SUM_COMPARATOR); + + // We need to return the best likelihood alleles in the original alternative allele index order. + // This heap will keep track of that index order. + final PriorityQueue mostLikelyAllelesHeapByIndex = new PriorityQueue<>(numOriginalAltAlleles, LIKELIHOOD_INDEX_COMPARATOR); + + for ( int i = 0; i < numAllelesToChoose; i++ ) + mostLikelyAllelesHeapByIndex.add(likelihoodSums[i]); + + // guaranteed no to have been added at this point thanks for checking on whether reduction was + // needed in the first place. + if (nonRefAltAllelePresent) + mostLikelyAllelesHeapByIndex.add(likelihoodSums[nonRefAltAlleleIndex]); + + final ArrayList orderedBestAlleles = new ArrayList<>(numAllelesToChoose); + + while (!mostLikelyAllelesHeapByIndex.isEmpty()) + orderedBestAlleles.add(mostLikelyAllelesHeapByIndex.remove().allele); + + return orderedBestAlleles; + } + + protected static final int PL_INDEX_OF_HOM_REF = 0; + + /** + * Update the likelihood sums with using the variant context genotype likelihoods. + * @param vc source variant context. + * @param likelihoodSums where to update the likelihood sums. 
+ */ + protected abstract void reduceScopeCalculateLikelihoodSums(final VariantContext vc, final LikelihoodSum[] likelihoodSums); + + /** + * Transforms the genotypes of the variant context according to the new subset of possible alleles. + * + * @param vc original variant-context. + * @param allelesToUse possible alleles. + * @return never {@code null}, the new set of genotype calls for the reduced scope. + */ + protected abstract GenotypesContext reduceScopeGenotypes(final VariantContext vc, final List allelesToUse); +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java new file mode 100644 index 000000000..2978cb8f2 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -0,0 +1,590 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.variant.variantcontext.*; + +import java.util.*; + +public class GeneralPloidyExactAFCalc extends ExactAFCalc { + static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them + + private final int ploidy; + + private final static boolean VERBOSE = false; + + protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); + this.ploidy = ploidy; + } + + @Override + protected GenotypesContext reduceScopeGenotypes(final VariantContext vc, final List allelesToUse) { + return subsetAlleles(vc,allelesToUse,false,ploidy); + } + + @Override + public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { + combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, 
log10AlleleFrequencyPriors); + return getResultFromFinalState(vc, log10AlleleFrequencyPriors); + } + + /** + * Simple wrapper class to hold values of combined pool likelihoods. + * For fast hashing and fast retrieval, there's a hash map that shadows main list. + * + */ + static class CombinedPoolLikelihoods { + private LinkedList alleleCountSetList; + private HashMap conformationMap; + private double maxLikelihood; + + + public CombinedPoolLikelihoods() { + // final int numElements = GenotypeLikelihoods.numLikelihoods(); + alleleCountSetList = new LinkedList<>(); + conformationMap = new HashMap<>(); + maxLikelihood = Double.NEGATIVE_INFINITY; + } + + public void add(ExactACset set) { + alleleCountSetList.add(set); + conformationMap.put(set.getACcounts(), set); + final double likelihood = set.getLog10Likelihoods()[0]; + + if (likelihood > maxLikelihood ) + maxLikelihood = likelihood; + + } + + public boolean hasConformation(int[] ac) { + return conformationMap.containsKey(new ExactACcounts(ac)); + + } + + public double getLikelihoodOfConformation(int[] ac) { + return conformationMap.get(new ExactACcounts(ac)).getLog10Likelihoods()[0]; + } + + public double getGLOfACZero() { + return alleleCountSetList.get(0).getLog10Likelihoods()[0]; // AC 0 is always at beginning of list + } + + public int getLength() { + return alleleCountSetList.size(); + } + } + + @Override + protected void reduceScopeCalculateLikelihoodSums(final VariantContext vc, final LikelihoodSum[] likelihoodSums) { + final int numOriginalAltAlleles = likelihoodSums.length; + final ArrayList GLs = getGLs(vc.getGenotypes(), false); + for ( final double[] likelihoods : GLs ) { + final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); + final int[] acCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(1 + numOriginalAltAlleles, ploidy, PLindexOfBestGL); + // by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele + for (int k=1; k < 
acCount.length;k++) + if (acCount[k] > 0 ) + likelihoodSums[k-1].sum += acCount[k] * (likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]); + } + } + + /** + * Simple non-optimized version that combines GLs from several pools and produces global AF distribution. + * @param GLs Inputs genotypes context with per-pool GLs + * @param numAlleles Number of alternate alleles + * @param ploidyPerPool Number of samples per pool + * @param log10AlleleFrequencyPriors Frequency priors + */ + protected void combineSinglePools(final GenotypesContext GLs, + final int numAlleles, + final int ploidyPerPool, + final double[] log10AlleleFrequencyPriors) { + + final ArrayList genotypeLikelihoods = getGLs(GLs, true); + + + int combinedPloidy = 0; + + // Combine each pool incrementally - likelihoods will be renormalized at each step + CombinedPoolLikelihoods combinedPoolLikelihoods = new CombinedPoolLikelihoods(); + + // first element: zero ploidy, e.g. trivial degenerate distribution + final int[] zeroCounts = new int[numAlleles]; + final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts)); + set.getLog10Likelihoods()[0] = 0.0; + + combinedPoolLikelihoods.add(set); + + if ( genotypeLikelihoods.size() <= 1 ) { + // no meaningful GLs at all, just set the tracker to non poly values + getStateTracker().reset(); // just mimic-ing call below + getStateTracker().setLog10LikelihoodOfAFzero(0.0); + } else { + for (int p=1; p ACqueue = new LinkedList<>(); + // mapping of ExactACset indexes to the objects + final HashMap indexesToACset = new HashMap<>(); + final CombinedPoolLikelihoods newPool = new CombinedPoolLikelihoods(); + + // add AC=0 to the queue + final int[] zeroCounts = new int[numAlleles]; + final int newPloidy = originalPloidy + newGLPloidy; + zeroCounts[0] = newPloidy; + + ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts)); + + ACqueue.add(zeroSet); + indexesToACset.put(zeroSet.getACcounts(), zeroSet); + + // keep processing while we have 
AC conformations that need to be calculated + while ( !ACqueue.isEmpty() ) { + getStateTracker().incNEvaluations(); + // compute log10Likelihoods + final ExactACset ACset = ACqueue.remove(); + + calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset); + + // clean up memory + indexesToACset.remove(ACset.getACcounts()); + if ( VERBOSE ) + System.out.printf(" *** removing used set=%s%n", ACset.getACcounts()); + + } + return newPool; + } + + // todo - refactor, function almost identical except for log10LofK computation in GeneralPloidyGenotypeLikelihoods + /** + * + * @param set ExactACset holding conformation to be computed + * @param newPool New pool likelihood holder + * @param originalPool Original likelihood holder + * @param newGL New pool GL vector to combine + * @param log10AlleleFrequencyPriors Prior object + * @param originalPloidy Total ploidy of original combined pool + * @param newGLPloidy Ploidy of GL vector + * @param ACqueue Queue of conformations to compute + * @param indexesToACset AC indices of objects in queue + * @return max log likelihood + */ + private double calculateACConformationAndUpdateQueue(final ExactACset set, + final CombinedPoolLikelihoods newPool, + final CombinedPoolLikelihoods originalPool, + final double[] newGL, + final double[] log10AlleleFrequencyPriors, + final int originalPloidy, + final int newGLPloidy, + final LinkedList ACqueue, + final HashMap indexesToACset) { + + // compute likeihood in "set" of new set based on original likelihoods + final int numAlleles = set.getACcounts().getCounts().length; + final int newPloidy = set.getACsum(); + final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy); + + + // add to new pool + if (!Double.isInfinite(log10LofK)) + newPool.add(set); + + // TODO -- change false to true this correct line when the 
implementation of this model is optimized (it's too slow now to handle this fix) + if ( getStateTracker().abort(log10LofK, set.getACcounts(), false) ) { + return log10LofK; + } + + // iterate over higher frequencies if possible + // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count. + // so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space + final int ACwiggle = set.getACcounts().getCounts()[0]; + if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies + return log10LofK; + + + // add conformations for other cases + for ( int allele = 1; allele < numAlleles; allele++ ) { + final int[] ACcountsClone = set.getACcounts().getCounts().clone(); + ACcountsClone[allele]++; + // is this a valid conformation? + int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0]; + ACcountsClone[0] = newPloidy - altSum; + if (ACcountsClone[0] < 0) + continue; + + + GeneralPloidyGenotypeLikelihoods.updateACset(ACcountsClone, ACqueue, indexesToACset); + } + + + return log10LofK; + } + + +// /** +// * Naive combiner of two multiallelic pools - number of alt alleles must be the same. +// * Math is generalization of biallelic combiner. 
+// * +// * For vector K representing an allele count conformation, +// * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K) +// * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...]) +// * @param originalPool First log-likelihood pool GL vector +// * @param yy Second pool GL vector +// * @param ploidy1 Ploidy of first pool (# of chromosomes in it) +// * @param ploidy2 Ploidy of second pool +// * @param numAlleles Number of alleles +// * @param log10AlleleFrequencyPriors Array of biallelic priors +// * @param resultTracker Af calculation result object +// */ +// public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, +// final double[] log10AlleleFrequencyPriors, +// final AFCalcResultTracker resultTracker) { +///* +// final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); +// final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); +// +// if (dim1 != originalPool.getLength() || dim2 != yy.length) +// throw new ReviewedStingException("BUG: Inconsistent vector length"); +// +// if (ploidy2 == 0) +// return; +// +// final int newPloidy = ploidy1 + ploidy2; +// +// // Say L1(K) = Pr(D|AC1=K) * choose(m1,K) +// // and L2(K) = Pr(D|AC2=K) * choose(m2,K) +// GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); +// final double[] x = originalPool.getLikelihoodsAsVector(true); +// while(firstIterator.hasNext()) { +// x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); +// firstIterator.next(); +// } +// +// GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); +// final double[] y = yy.clone(); +// while(secondIterator.hasNext()) { +// y[secondIterator.getLinearIndex()] += 
MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); +// secondIterator.next(); +// } +// +// // initialize output to -log10(choose(m1+m2,[k1 k2...]) +// final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); +// final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); +// +// +// // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K +// while(outputIterator.hasNext()) { +// final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector())); +// double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result); +// +// originalPool.add(likelihood, set, outputIterator.getLinearIndex()); +// outputIterator.next(); +// } +//*/ +// } + + /** + * Compute likelihood of a particular AC conformation and update AFresult object + * @param set Set of AC counts to compute + * @param firstGLs Original pool likelihoods before combining + * @param secondGL New GL vector with additional pool + * @param log10AlleleFrequencyPriors Allele frequency priors + * @param numAlleles Number of alleles (including ref) + * @param ploidy1 Ploidy of original pool (combined) + * @param ploidy2 Ploidy of new pool + * @return log-likehood of requested conformation + */ + private double computeLofK(final ExactACset set, + final CombinedPoolLikelihoods firstGLs, + final double[] secondGL, + final double[] log10AlleleFrequencyPriors, + final int numAlleles, final int ploidy1, final int ploidy2) { + + final int newPloidy = ploidy1 + ploidy2; + + // sanity check + int totalAltK = set.getACsum(); + if (newPloidy != totalAltK) + throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values"); + + totalAltK -= set.getACcounts().getCounts()[0]; + // totalAltK has sum of alt alleles of conformation now + + + // special case 
for k = 0 over all k + if ( totalAltK == 0 ) { // all-ref case + final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX]; + set.getLog10Likelihoods()[0] = log10Lof0; + + getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); + getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + return log10Lof0; + + } else { + + // initialize result with denominator + // ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy. + // To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i + + int[] currentCount = set.getACcounts().getCounts(); + double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount); + + // for current conformation, get all possible ways to break vector K into two components G1 and G2 + final GeneralPloidyGenotypeLikelihoods.SumIterator innerIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); + set.getLog10Likelihoods()[0] = Double.NEGATIVE_INFINITY; + while (innerIterator.hasNext()) { + // check if breaking current conformation into g1 and g2 is feasible. 
+ final int[] acCount2 = innerIterator.getCurrentVector(); + final int[] acCount1 = MathUtils.vectorDiff(currentCount, acCount2); + final int idx2 = innerIterator.getLinearIndex(); + // see if conformation is valid and if original pool had this conformation + // for conformation to be valid, all elements of g2 have to be <= elements of current AC set + if (isValidConformation(acCount1,ploidy1) && firstGLs.hasConformation(acCount1)) { + final double gl2 = secondGL[idx2]; + if (!Double.isInfinite(gl2)) { + final double firstGL = firstGLs.getLikelihoodOfConformation(acCount1); + final double num1 = MathUtils.log10MultinomialCoefficient(ploidy1, acCount1); + final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2); + final double sum = firstGL + gl2 + num1 + num2; + + set.getLog10Likelihoods()[0] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[0], sum); + } + } + innerIterator.next(); + } + + set.getLog10Likelihoods()[0] += denom; + } + + double log10LofK = set.getLog10Likelihoods()[0]; + + // update the MLE if necessary + final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); + // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY + getStateTracker().updateMLEifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); + + // apply the priors over each alternate allele + for (final int ACcount : altCounts ) { + if ( ACcount > 0 ) + log10LofK += log10AlleleFrequencyPriors[ACcount]; + } + // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY + getStateTracker().updateMAPifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); + + return log10LofK; + } + + /** + * Small helper routine - is a particular AC conformationv vector valid? ie are all elements non-negative and sum to ploidy? 
+ * @param set AC conformation vector + * @param ploidy Ploidy of set + * @return Valid conformation + */ + private static boolean isValidConformation(final int[] set, final int ploidy) { + int sum=0; + for (final int ac: set) { + if (ac < 0) + return false; + sum += ac; + + } + + return (sum == ploidy); + } + + /** + * From a given variant context, extract a given subset of alleles, and update genotype context accordingly, + * including updating the PL's, and assign genotypes accordingly + * @param vc variant context with alleles and genotype likelihoods + * @param allelesToUse alleles to subset + * @param assignGenotypes true: assign hard genotypes, false: leave as no-call + * @param ploidy number of chromosomes per sample (pool) + * @return GenotypesContext with new PLs + */ + public GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy) { + // the genotypes with PLs + final GenotypesContext oldGTs = vc.getGenotypes(); + List NO_CALL_ALLELES = new ArrayList<>(ploidy); + + for (int k=0; k < ploidy; k++) + NO_CALL_ALLELES.add(Allele.NO_CALL); + + // samples + final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); + + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(); + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final int numNewAltAlleles = allelesToUse.size() - 1; + + + // create the new genotypes + for ( int k = 0; k < oldGTs.size(); k++ ) { + final Genotype g = oldGTs.get(sampleIndices.get(k)); + if ( !g.hasLikelihoods() ) { + newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); + continue; + } + + // create the new likelihoods array from the alleles we are allowed to use + final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); + double[] newLikelihoods; + + // 
Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization + // and subsetting + if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) { + newLikelihoods = originalLikelihoods; + } else { + newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse); + + // might need to re-normalize + newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); + } + + // if there is no mass on the (new) likelihoods, then just no-call the sample + if ( MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) { + newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); + } + else { + final GenotypeBuilder gb = new GenotypeBuilder(g); + + if ( numNewAltAlleles == 0 ) + gb.noPL(); + else + gb.PL(newLikelihoods); + + // if we weren't asked to assign a genotype, then just no-call the sample + if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) + gb.alleles(NO_CALL_ALLELES); + else + assignGenotype(gb, newLikelihoods, allelesToUse, ploidy); + newGTs.add(gb.make()); + } + } + + return newGTs; + + } + + /** + * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs + * + * @param newLikelihoods the PL array + * @param allelesToUse the list of alleles to choose from (corresponding to the PLs) + * @param numChromosomes Number of chromosomes per pool + */ + private void assignGenotype(final GenotypeBuilder gb, + final double[] newLikelihoods, + final List allelesToUse, + final int numChromosomes) { + final int numNewAltAlleles = allelesToUse.size() - 1; + + + + // find the genotype with maximum likelihoods + final int PLindex = numNewAltAlleles == 0 ? 
0 : MathUtils.maxElementIndex(newLikelihoods); + + final int[] mlAlleleCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(allelesToUse.size(), numChromosomes, PLindex); + final ArrayList alleleFreqs = new ArrayList<>(); + final ArrayList alleleCounts = new ArrayList<>(); + + + for (int k=1; k < mlAlleleCount.length; k++) { + alleleCounts.add(mlAlleleCount[k]); + final double freq = (double)mlAlleleCount[k] / (double)numChromosomes; + alleleFreqs.add(freq); + + } + + // per-pool logging of AC and AF + gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts); + gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs); + + // remove PLs if necessary + if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING) + gb.noPL(); + + ArrayList myAlleles = new ArrayList(); + + // add list of called ML genotypes to alleles list + // TODO - too unwieldy? + int idx = 0; + for (int mlind = 0; mlind < mlAlleleCount.length; mlind++) { + for (int k=0; k < mlAlleleCount[mlind]; k++) + myAlleles.add(idx++,allelesToUse.get(mlind)); + } + gb.alleles(myAlleles); + + // TODO - deprecated so what is the appropriate method to call? 
+ if ( numNewAltAlleles > 0 ) + gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); + } + +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java new file mode 100644 index 000000000..ea09f52e8 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -0,0 +1,496 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.variant.variantcontext.*; + +import java.util.*; + +/** + * Computes the conditional bi-allelic exact results + * + * Suppose vc contains 2 alt allele: A* with C and T. 
This function first computes: + * + * (1) P(D | AF_c > 0 && AF_t == *) [i.e., T can be anything] + * + * it then computes the conditional probability on AF_c == 0: + * + * (2) P(D | AF_t > 0 && AF_c == 0) + * + * Thinking about this visually, we have the following likelihood matrix where each cell is + * the P(D | AF_c == i && AF_t == j): + * + * 0 AF_c > 0 + * ----------------- + * 0 | | + * |--|------------- + * a | | + * f | | + * _ | | + * t | | + * > | | + * 0 | | + * + * What we really want to know how + * + * (3) P(D | AF_c == 0 & AF_t == 0) + * + * compares with + * + * (4) P(D | AF_c > 0 || AF_t > 0) + * + * This is effectively asking for the value in the upper left vs. the sum of all cells. + * + * This class implements the conditional likelihoods summation for any number of alt + * alleles, where each alt allele has its EXACT probability of segregating calculated by + * reducing each alt B into the case XB and computing P(D | AF_b > 0 ) as follows: + * + * Suppose we have for a A/B/C site the following GLs: + * + * AA AB BB AC BC CC + * + * and we want to get the bi-allelic GLs for X/B, where X is everything not B + * + * XX = AA + AC + CC (since X = A or C) + * XB = AB + BC + * BB = BB + * + * After each allele has its probability calculated we compute the joint posterior + * as P(D | AF_* == 0) = prod_i P (D | AF_i == 0), after applying the theta^i + * prior for the ith least likely allele. + */ + public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { + + /** + * The min. confidence of an allele to be included in the joint posterior. 
+ */ + private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-10); + + private final static int[] BIALLELIC_NON_INFORMATIVE_PLS = new int[]{0,0,0}; + private final static List BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + + /** + * Sorts AFCalcResults by their posteriors of AF > 0, so the + */ + private final static class CompareAFCalcResultsByPNonRef implements Comparator { + @Override + public int compare(AFCalcResult o1, AFCalcResult o2) { + return -1 * Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0()); + } + } + + private final static CompareAFCalcResultsByPNonRef compareAFCalcResultsByPNonRef = new CompareAFCalcResultsByPNonRef(); + + /** + * The AFCalc model we are using to do the bi-allelic computation + */ + final AFCalc biAlleleExactModel; + + protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); + biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, ploidy); + } + + /** + * Trivial subclass that helps with debugging by keeping track of the supporting information for this joint call + */ + private static class MyAFCalcResult extends AFCalcResult { + /** + * List of the supporting bi-allelic AFCalcResults that went into making this multi-allelic joint call + */ + final List supporting; + + private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pRefByAllele, List supporting) { + super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pRefByAllele); + this.supporting = supporting; + } + } + + @Override + public AFCalcResult computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors) { + final List independentResultTrackers = computeAlleleIndependentExact(vc, log10AlleleFrequencyPriors); 
+ + if ( independentResultTrackers.size() == 0 ) + throw new IllegalStateException("Independent alleles model returned an empty list of results at VC " + vc); + + if ( independentResultTrackers.size() == 1 ) { + // fast path for the very common bi-allelic use case + return independentResultTrackers.get(0); + } else { + // we are a multi-allelic, so we need to actually combine the results + final List withMultiAllelicPriors = applyMultiAllelicPriors(independentResultTrackers); + return combineIndependentPNonRefs(vc, withMultiAllelicPriors); + } + } + + /** + * Compute the conditional exact AFCalcResult for each allele in vc independently, returning + * the result of each, in order of the alt alleles in VC + * + * @param vc the VariantContext we want to analyze, with at least 1 alt allele + * @param log10AlleleFrequencyPriors the priors + * @return a list of the AFCalcResults for each bi-allelic sub context of vc + */ + @Requires({"vc != null", "log10AlleleFrequencyPriors != null"}) + @Ensures("goodIndependentResult(vc, result)") + protected final List computeAlleleIndependentExact(final VariantContext vc, + final double[] log10AlleleFrequencyPriors) { + final List results = new LinkedList(); + + for ( final VariantContext subvc : makeAlleleConditionalContexts(vc) ) { + final AFCalcResult resultTracker = biAlleleExactModel.getLog10PNonRef(subvc, log10AlleleFrequencyPriors); + results.add(resultTracker); + } + + return results; + } + + /** + * Helper function to ensure that the computeAlleleIndependentExact is returning reasonable results + */ + private static boolean goodIndependentResult(final VariantContext vc, final List results) { + if ( results.size() != vc.getNAlleles() - 1) return false; + for ( int i = 0; i < results.size(); i++ ) { + if ( results.get(i).getAllelesUsedInGenotyping().size() != 2 ) + return false; + if ( ! 
results.get(i).getAllelesUsedInGenotyping().contains(vc.getAlternateAllele(i)) ) + return false; + } + + return true; + } + + /** + * Returns the bi-allelic variant context for each alt allele in vc with bi-allelic likelihoods, in order + * + * @param vc the variant context to split. Must have n.alt.alleles > 1 + * @return a bi-allelic variant context for each alt allele in vc + */ + @Requires({"vc != null", "vc.getNAlleles() > 1"}) + @Ensures("result.size() == vc.getNAlleles() - 1") + protected final List makeAlleleConditionalContexts(final VariantContext vc) { + final int nAltAlleles = vc.getNAlleles() - 1; + + if ( nAltAlleles == 1 ) { + // fast path for bi-allelic case. + return Collections.singletonList(vc); + } else { + // go through the work of ripping up the VC into its biallelic components + final List vcs = new LinkedList(); + + for ( int altI = 0; altI < nAltAlleles; altI++ ) { + vcs.add(biallelicCombinedGLs(vc, altI + 1)); + } + + return vcs; + } + } + + /** + * Create a single bi-allelic variant context from rootVC with alt allele with index altAlleleIndex + * + * @param rootVC the root (potentially multi-allelic) variant context + * @param altAlleleIndex index of the alt allele, from 0 == first alt allele + * @return a bi-allelic variant context based on rootVC + */ + @Requires({"rootVC.getNAlleles() > 1", "altAlleleIndex < rootVC.getNAlleles()"}) + @Ensures({"result.isBiallelic()"}) + protected final VariantContext biallelicCombinedGLs(final VariantContext rootVC, final int altAlleleIndex) { + if ( rootVC.isBiallelic() ) { + return rootVC; + } else { + final int nAlts = rootVC.getNAlleles() - 1; + final List biallelicGenotypes = new ArrayList(rootVC.getNSamples()); + for ( final Genotype g : rootVC.getGenotypes() ) + biallelicGenotypes.add(combineGLsPrecise(g, altAlleleIndex, nAlts)); + + final VariantContextBuilder vcb = new VariantContextBuilder(rootVC); + final Allele altAllele = rootVC.getAlternateAllele(altAlleleIndex - 1); + 
vcb.alleles(Arrays.asList(rootVC.getReference(), altAllele)); + vcb.genotypes(biallelicGenotypes); + return vcb.make(); + } + } + + /** + * Returns a new Genotype with the PLs of the multi-allelic original reduced to a bi-allelic case + * + * This is handled in the following way: + * + * Suppose we have for a A/B/C site the following GLs: + * + * AA AB BB AC BC CC + * + * and we want to get the bi-allelic GLs for X/B, where X is everything not B + * + * XX = AA + AC + CC (since X = A or C) + * XB = AB + BC + * BB = BB + * + * @param original the original multi-allelic genotype + * @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0 + * @param nAlts the total number of alt alleles + * @return a new biallelic genotype with appropriate PLs + */ + @Requires({"original.hasLikelihoods()"}) // TODO -- add ploidy == 2 test "original.getPLs() == null || original.getPLs().length == 3"}) + @Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"}) + @Deprecated + protected Genotype combineGLs(final Genotype original, final int altIndex, final int nAlts ) { + if ( original.isNonInformative() ) + return new GenotypeBuilder(original).PL(BIALLELIC_NON_INFORMATIVE_PLS).alleles(BIALLELIC_NOCALL).make(); + + if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts); + + final double[] normalizedPr = MathUtils.normalizeFromLog10(GenotypeLikelihoods.fromPLs(original.getPL()).getAsVector()); + final double[] biAllelicPr = new double[3]; + + for ( int index = 0; index < normalizedPr.length; index++ ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index); + + if ( pair.alleleIndex1 == altIndex ) { + if ( pair.alleleIndex2 == altIndex ) + // hom-alt case + biAllelicPr[2] = normalizedPr[index]; + else + // het-alt case + biAllelicPr[1] += normalizedPr[index]; + } else { + if ( pair.alleleIndex2 == altIndex ) + // het-alt 
case + biAllelicPr[1] += normalizedPr[index]; + else + // hom-non-alt + biAllelicPr[0] += normalizedPr[index]; + } + } + + final double[] GLs = new double[3]; + for ( int i = 0; i < GLs.length; i++ ) GLs[i] = Math.log10(biAllelicPr[i]); + + return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make(); + } + + + private static final double PHRED_2_LOG10_COEFF = -.1; + + /** + * Returns a new Genotype with the PLs of the multi-allelic original reduced to a bi-allelic case. + * + *

Uses the log-sum-exp trick in order to work well with very low PLs

+ * + *

This is handled in the following way:

+ * + *

Suppose we have for a A/B/C site the following GLs:

+ * + *

AA AB BB AC BC CC

+ * + *

and we want to get the bi-allelic GLs for X/B, where X is everything not B

+ * + *

XX = AA + AC + CC (since X = A or C)
+ * XB = AB + BC
+ * BB = BB
+ *

+ *

+ * This implementation use the log sum trick in order to avoid numeric inestability. + *

+ * + * @param original the original multi-allelic genotype + * @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0 + * @param nAlts the total number of alt alleles + * @return a new biallelic genotype with appropriate PLs + */ + @Requires({"original.hasLikelihoods()"}) + @Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"}) + protected Genotype combineGLsPrecise(final Genotype original, final int altIndex, final int nAlts ) { + + if ( original.isNonInformative() ) + return new GenotypeBuilder(original).PL(BIALLELIC_NON_INFORMATIVE_PLS).alleles(BIALLELIC_NOCALL).make(); + + if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts); + + final int[] pls = original.getPL(); + + final int nAlleles = nAlts + 1; + + final int plCount = pls.length; + + double BB = 0; + final double[] XBvalues = new double[nAlleles - 1]; + final double[] XXvalues = new double[plCount - nAlleles]; + + int xbOffset = 0; + int xxOffset = 0; + for ( int index = 0; index < plCount; index++ ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index); + int i = pair.alleleIndex1; + int j = pair.alleleIndex2; + if (i == j) { + if (i == altIndex) BB = PHRED_2_LOG10_COEFF * pls[index]; else XXvalues[xxOffset++] = PHRED_2_LOG10_COEFF * pls[index]; + } else if (i == altIndex || j == altIndex) + XBvalues[xbOffset++] = PHRED_2_LOG10_COEFF * pls[index]; + else + XXvalues[xxOffset++] = PHRED_2_LOG10_COEFF * pls[index]; + } + + final double XB = MathUtils.log10sumLog10(XBvalues); + final double XX = MathUtils.log10sumLog10(XXvalues); + + final double[] GLs = new double[] { XX, XB, BB}; + return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make(); + } + + protected final List applyMultiAllelicPriors(final List conditionalPNonRefResults) { + final ArrayList sorted = new ArrayList(conditionalPNonRefResults); + + // sort 
the results, so the most likely allele is first + Collections.sort(sorted, compareAFCalcResultsByPNonRef); + + double lastPosteriorGt0 = sorted.get(0).getLog10PosteriorOfAFGT0(); + final double log10SingleAllelePriorOfAFGt0 = conditionalPNonRefResults.get(0).getLog10PriorOfAFGT0(); + + for ( int i = 0; i < sorted.size(); i++ ) { + if ( sorted.get(i).getLog10PosteriorOfAFGT0() > lastPosteriorGt0 ) + throw new IllegalStateException("pNonRefResults not sorted: lastPosteriorGt0 " + lastPosteriorGt0 + " but current is " + sorted.get(i).getLog10PosteriorOfAFGT0()); + + final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0; + final double log10PriorAFEq0 = Math.log10(1 - Math.pow(10, log10PriorAFGt0)); + final double[] thetaTONPriors = new double[] { log10PriorAFEq0, log10PriorAFGt0 }; + + // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior + sorted.set(i, sorted.get(i).withNewPriors(MathUtils.normalizeFromLog10(thetaTONPriors, true))); + } + + return sorted; + } + + /** + * Take the independent estimates of pNonRef for each alt allele and combine them into a single result + * + * Given n independent calculations for each of n alternate alleles create a single + * combined AFCalcResult with: + * + * priors for AF == 0 equal to theta^N for the nth least likely allele + * posteriors that reflect the combined chance that any alleles are segregating and corresponding + * likelihoods + * combined MLEs in the order of the alt alleles in vc + * + * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently + */ + protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc, + final List sortedResultsWithThetaNPriors) { + int nEvaluations = 0; + final int nAltAlleles = sortedResultsWithThetaNPriors.size(); + final int[] alleleCountsOfMLE = new int[nAltAlleles]; + final double[] log10PriorsOfAC = new double[2]; + final Map log10pRefByAllele = new HashMap(nAltAlleles); + + // the sum of 
the log10 posteriors for AF == 0 and AF > 0 to determine joint probs + double log10PosteriorOfACEq0Sum = 0.0; + double log10PosteriorOfACGt0Sum = 0.0; + + boolean anyPoly = false; + for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) { + final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1); + final int altI = vc.getAlleles().indexOf(altAllele) - 1; + + // MLE of altI allele is simply the MLE of this allele in altAlleles + alleleCountsOfMLE[altI] = sortedResultWithThetaNPriors.getAlleleCountAtMLE(altAllele); + + // the AF > 0 case requires us to store the normalized likelihood for later summation + if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) { + anyPoly = true; + log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0(); + log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0(); + log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); + } + + log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0(); + + // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior + log10pRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0()); + + // trivial -- update the number of evaluations + nEvaluations += sortedResultWithThetaNPriors.nEvaluations; + } + + // If no alleles were polymorphic, make sure we have the proper priors (the defaults) for likelihood calculation + if ( ! anyPoly ) { + log10PriorsOfAC[0] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFEq0(); + log10PriorsOfAC[1] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFGT0(); + } + + // In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C, + // the probability of being poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y). 
We want to estimate confidently + // log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). This sum is log10PosteriorOfACEq0 + // + // note we need to handle the case where the posterior of AF == 0 is 0.0, in which case we + // use the summed log10PosteriorOfACGt0Sum directly. This happens in cases where + // AF > 0 : 0.0 and AF == 0 : -16, and if you use the inverse calculation you get 0.0 and MathUtils.LOG10_P_OF_ZERO + final double log10PosteriorOfACGt0; + if ( log10PosteriorOfACEq0Sum == 0.0 ) + log10PosteriorOfACGt0 = log10PosteriorOfACGt0Sum; + else + log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO); + + final double[] log10LikelihoodsOfAC = new double[] { + // L + prior = posterior => L = poster - prior + log10PosteriorOfACEq0Sum - log10PriorsOfAC[0], + log10PosteriorOfACGt0 - log10PriorsOfAC[1] + }; + + return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(), + // necessary to ensure all values < 0 + MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), + // priors incorporate multiple alt alleles, must be normalized + MathUtils.normalizeFromLog10(log10PriorsOfAC, true), + log10pRefByAllele, sortedResultsWithThetaNPriors); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java new file mode 100644 index 000000000..b7a646d4e --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java @@ -0,0 +1,538 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.*; + +/** + * Helper component to manage active region trimming + * + *

+ * It receives the user arguments that controls trimming and also performs the trimming region calculation. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +class ActiveRegionTrimmer { + + /** + * Genome location parser use in order to create and manipulate genomic intervals. + */ + private GenomeLocParser locParser; + + /** + * Holds the debug flag. If {@code true} the trimmer will output debugging messages into the log. + */ + private boolean debug; + + /** + * Holds the extension to be used based on whether GGA mode is on or off. + */ + private int usableExtension; + + /** + * Records whether the trimming intervals are going to be used to emit reference confidence, {@code true}, + * or regular HC output {@code false}. + */ + private boolean emitReferenceConfidence; + + @Hidden + @Argument(fullName="dontTrimActiveRegions", shortName="dontTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false) + protected boolean dontTrimActiveRegions = false; + + /** + * the maximum extent into the full active region extension that we're willing to go in genotyping our events + */ + @Hidden + @Argument(fullName="maxDiscARExtension", shortName="maxDiscARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for discovery", required=false) + protected int discoverExtension = 25; + + @Hidden + @Argument(fullName="maxGGAARExtension", shortName="maxGGAARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for GGA mode", required=false) + protected int ggaExtension = 300; + + /** + * Include at least this many bases around an event for calling it + */ + @Hidden + @Argument(fullName="paddingAroundIndels", shortName="paddingAroundIndels", doc = "Include at least this many bases around an event 
for calling indels", required=false) + protected int indelPadding = 150; + + @Hidden + @Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false) + protected int snpPadding = 20; + + /** + * Holds a reference the trimmer logger. + */ + private final static Logger logger = Logger.getLogger(ActiveRegionTrimmer.class); + + /** + * Initializes the trimmer. + * + *

+ * This method should be called once and only once before any trimming is performed. + * + * + * @param glp the genome-location-parser to be used when operating with genomic locations. + * @param debug whether to show extra debug log messages. + * @param isGGA whether the trimming region calculator should act as if we are in GGA mode or not. + * @param emitReferenceConfidence indicates whether we plan to use this trimmer to generate trimmed regions + * to be used for emitting reference confidence. + * + * @throws IllegalStateException if this trim calculator has already been initialized. + * @throws IllegalArgumentException if the input location parser is {@code null}. + * @throws UserException.BadArgumentValue if any of the user argument values is invalid. + */ + void initialize(final GenomeLocParser glp, final boolean debug, final boolean isGGA, final boolean emitReferenceConfidence) { + if (locParser != null) + throw new IllegalStateException(getClass().getSimpleName() + " instance initialized twice"); + if (glp == null) + throw new IllegalArgumentException("input genome-loc-parser cannot be null"); + checkUserArguments(); + locParser = glp; + this.debug = debug; + usableExtension = isGGA ? ggaExtension : discoverExtension; + this.emitReferenceConfidence = emitReferenceConfidence; + } + + /** + * Checks user trimming argument values + * + * @throws UserException.BadArgumentValue if there is some problem with any of the arguments values. 
+ */ + private void checkUserArguments() { + if ( snpPadding < 0 ) throw new UserException.BadArgumentValue("paddingAroundSNPs","" + snpPadding + "< 0"); + if ( indelPadding < 0 ) throw new UserException.BadArgumentValue("paddingAroundIndels","" + indelPadding + "< 0"); + if ( discoverExtension < 0) throw new UserException.BadArgumentValue("maxDiscARExtension","" + discoverExtension + "< 0"); + if ( ggaExtension < 0) throw new UserException.BadArgumentValue("maxGGAAREExtension","" + ggaExtension + "< 0"); + } + + /** + * Holds the result of trimming. + * + * + * + */ + public static class Result { + + /** + * Indicates whether trimming is required per data and user request. + */ + protected final boolean needsTrimming; + + /** + * Holds the input active region. + */ + protected final ActiveRegion originalRegion; + + /** + * Holds the smaller range that contain all relevant callable variants in the + * input active region (not considering the extension). + * + */ + protected final GenomeLoc callableSpan; + + /** + * Maximum available range for the trimmed variant region. + */ + protected final GenomeLoc maximumSpan; + + /** + * The trimmed variant region span including the extension. + */ + protected final GenomeLoc extendedSpan; + + + /** + * The ideal trimmer variant region span including the extension. + */ + protected final GenomeLoc idealSpan; + + /** + * Returns the ideal trimming span. + * + *

+ * The ideal span is the one containing all callable variation overlapping the original active region span + * (without extension) and the applicable padding {@link #getPadding()} in both sides. + * + * + * @return never {@code null}. + */ + @SuppressWarnings("unused") + public GenomeLoc getIdealSpan() { + return idealSpan; + } + + /** + * Holds the flanking spans that do not contain the callable variants. + *

+ * The first element of the pair is the left (up-stream) non-variant flank, whereas the second element is + * the right (down-stream) non-variant flank. + */ + protected final Pair nonVariantFlanks; + + /** + * Holds the collection of callable events within the variant trimming region. + */ + protected final List callableEvents; + + /** + * Required padding around the variant trimming region. + */ + protected final int padding; + + + /** + * Returns the required padding around callable variation. + * + *

+ * Notice that due to the limiting span of the original active region (including its extension) it + * is possible that the resulting final trimmed variant region span does not satisfies the padding. However + * that should be rare. + * + * @return 0 or greater. + */ + @SuppressWarnings("unused") + public int getPadding() { + return padding; + } + + /** + * Holds the maximum extension around the original active region span considered for the trimmed + * variation region. + */ + protected final int usableExtension; + + /** + * Returns the maximum extension around the original active region span considered for the trimmed + * variation region. + * + *

+ * From time to time, the trimmed region may require a span beyond the input original active region's. + * For example when there is a callable event close ot one of its ends and the required padding makes it + * round beyond that limit. + * + *

+ * Notice that due to the limiting span of the original active region (including its extended region) it + * is possible that the resulting final trimmed variant region span goes beyond this extension including more of + * the original active region own extension. + * + * @return 0 or greater. + */ + @SuppressWarnings("unused") + public int getUsableExtension() { + return usableExtension; + } + + /** + * Holds variant-containing callable region. + *

+ * This is lazy-initialized using {@link #callableSpan}. + */ + protected ActiveRegion callableRegion; + + + /** + * Non-variant left flank region. + *

+ * This is lazy-initialized using + * {@link #nonVariantFlanks}.{@link Pair#getFirst() getFirst()}. + */ + private ActiveRegion leftFlankRegion; + + /** + * Non-variant right flank region. + *

+ * This is lazy-initialized using + * {@link #nonVariantFlanks}.{@link Pair#getFirst() getSecond()}. + */ + private ActiveRegion rightFlankRegion; + + /** + * Whether the variant trimmed region is going to be used for emitting reference confidence records. + */ + private final boolean emitReferenceConfidence; + + /** + * Creates a trimming result given all its properties. + * + * @param emitReferenceConfidence whether reference confidence output modes are on. + * @param needsTrimming whether there is any trimming needed at all. + * @param originalRegion the original active region. + * @param padding padding around contained callable variation events. + * @param extension the extension applied to the trimmed variant span. + * @param overlappingEvents contained callable variation events. + * @param nonVariantFlanks pair of non-variant flank spans around the variant containing span. + * @param extendedSpan final trimmed variant span including the extension. + * @param idealSpan the ideal span, that contains. + * @param maximumSpan maximum possible trimmed span based on the input original active region extended span. + * @param callableSpan variant containing span without padding. 
+ */ + protected Result(final boolean emitReferenceConfidence, final boolean needsTrimming, final ActiveRegion originalRegion, + final int padding, final int extension, + final List overlappingEvents, final Pair nonVariantFlanks, + final GenomeLoc extendedSpan, + final GenomeLoc idealSpan, + final GenomeLoc maximumSpan, + final GenomeLoc callableSpan) { + this.emitReferenceConfidence = emitReferenceConfidence; + this.needsTrimming = needsTrimming; + this.originalRegion = originalRegion; + this.nonVariantFlanks = nonVariantFlanks; + this.padding = padding; + this.usableExtension = extension; + this.callableEvents = overlappingEvents; + this.callableSpan = callableSpan; + this.idealSpan = idealSpan; + this.maximumSpan = maximumSpan; + this.extendedSpan = extendedSpan; + + if (!extendedSpan.isUnmapped() && !callableSpan.isUnmapped() && !extendedSpan.containsP(callableSpan)) + throw new IllegalArgumentException("the extended callable span must include the callable span"); + } + + + /** + * Checks whether there is any variation present in the target region. + * + * @return {@code true} if there is any variant, {@code false} otherwise. + */ + public boolean isVariationPresent() { + return ! callableEvents.isEmpty(); + } + + /** + * Checks whether the active region needs trimming. + */ + public boolean needsTrimming() { + return needsTrimming; + } + + /** + * Returns the trimmed variant containing region + * + * @throws IllegalStateException if there is no variation detected. + * + * @return never {@code null}. + */ + public ActiveRegion getCallableRegion() { + if (callableRegion == null && !extendedSpan.isUnmapped()) + //TODO this conditional is a patch to retain the current standard HC run behaviour + //TODO we should simply remove this difference between trimming with or without GVCF + //TODO embracing slight changes in the standard HC output + callableRegion = emitReferenceConfidence ? 
originalRegion.trim(callableSpan, extendedSpan) : originalRegion.trim(extendedSpan); + else if (extendedSpan.isUnmapped()) + throw new IllegalStateException("there is no variation thus no variant region"); + return callableRegion; + } + + /** + * Checks whether there is a non-empty left flanking non-variant trimmed out region. + * @return {@code true} if there is a non-trivial left flank region, {@code false} otherwise. + */ + public boolean hasLeftFlankingRegion() { + return ! nonVariantFlanks.getFirst().isUnmapped(); + } + + /** + * Checks whether there is a non-empty right flanking non-variant trimmed out region. + * @return {@code true} if there is a non-trivial right flank region, {@code false} otherwise. + */ + public boolean hasRightFlankingRegion() { + return ! nonVariantFlanks.getSecond().isUnmapped(); + } + + /** + * Returns the trimmed out left non-variant region. + *

+ * Notice that in case of no variation, the whole original region is considered the left flanking region. + * + * @throws IllegalStateException if there is not such as left flanking region. + */ + public ActiveRegion nonVariantLeftFlankRegion() { + if (leftFlankRegion == null && ! nonVariantFlanks.getFirst().isUnmapped()) + leftFlankRegion = originalRegion.trim(nonVariantFlanks.getFirst(),originalRegion.getExtension()); + else if (nonVariantFlanks.getFirst().isUnmapped()) + throw new IllegalStateException("there is no left flank non-variant trimmed out region"); + return leftFlankRegion; + } + + /** + * Returns the trimmed out right non-variant region. + */ + public ActiveRegion nonVariantRightFlankRegion() { + if (rightFlankRegion == null && ! nonVariantFlanks.getSecond().isUnmapped()) + rightFlankRegion = originalRegion.trim(nonVariantFlanks.getSecond(),originalRegion.getExtension()); + else if (nonVariantFlanks.getSecond().isUnmapped()) + throw new IllegalStateException("there is no right flank non-variant trimmed out region"); + return rightFlankRegion; + } + + /** + * Creates a result indicating that there was no trimming to be done. + */ + protected static Result noTrimming(final boolean emitReferenceConfidence, + final ActiveRegion targetRegion, final int padding, + final int usableExtension,final List events) { + final GenomeLoc targetRegionLoc = targetRegion.getLocation(); + final Result result = new Result(emitReferenceConfidence,false,targetRegion,padding,usableExtension,events,new Pair<>(GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED), + targetRegionLoc,targetRegionLoc,targetRegionLoc,targetRegionLoc); + result.callableRegion = targetRegion; + return result; + } + + /** + * Creates a result indicating that no variation was found. 
+ */ + protected static Result noVariation(final boolean emitReferenceConfidence, final ActiveRegion targetRegion, + final int padding, final int usableExtension) { + final Result result = new Result(emitReferenceConfidence,false,targetRegion,padding,usableExtension, + Collections.emptyList(),new Pair<>(targetRegion.getLocation(),GenomeLoc.UNMAPPED), + GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED); + result.leftFlankRegion = targetRegion; + return result; + } + } + + /** + * Returns a trimming result object from which the variant trimmed region and flanking non-variant sections + * can be recovered latter. + * + * @param originalRegion the genome location range to trim. + * @param allVariantsWithinExtendedRegion list of variants contained in the trimming location. Variants therein + * not overlapping with {@code originalRegion} are simply ignored. + * @return never {@code null}. + */ + public Result trim(final ActiveRegion originalRegion, + final TreeSet allVariantsWithinExtendedRegion) { + + + if ( allVariantsWithinExtendedRegion.isEmpty() ) // no variants, + return Result.noVariation(emitReferenceConfidence,originalRegion,snpPadding, usableExtension); + + final List withinActiveRegion = new LinkedList<>(); + final GenomeLoc originalRegionRange = originalRegion.getLocation(); + boolean foundNonSnp = false; + GenomeLoc variantSpan = null; + for ( final VariantContext vc : allVariantsWithinExtendedRegion ) { + final GenomeLoc vcLoc = locParser.createGenomeLoc(vc); + if ( originalRegionRange.overlapsP(vcLoc) ) { + foundNonSnp = foundNonSnp || ! vc.isSNP(); + variantSpan = variantSpan == null ? vcLoc : variantSpan.endpointSpan(vcLoc); + withinActiveRegion.add(vc); + } + } + final int padding = foundNonSnp ? 
indelPadding : snpPadding; + + // we don't actually have anything in the region after skipping out variants that don't overlap + // the region's full location + if ( variantSpan == null ) + return Result.noVariation(emitReferenceConfidence,originalRegion,padding, usableExtension); + + if ( dontTrimActiveRegions) + return Result.noTrimming(emitReferenceConfidence,originalRegion, padding, usableExtension, withinActiveRegion); + + final GenomeLoc maximumSpan = locParser.createPaddedGenomeLoc(originalRegionRange, usableExtension); + final GenomeLoc idealSpan = locParser.createPaddedGenomeLoc(variantSpan, padding); + final GenomeLoc finalSpan = maximumSpan.intersect(idealSpan).union(variantSpan); + + // Make double sure that, if we are emitting GVCF we won't call non-variable positions beyond the target active region span. + // In regular call we don't do so so we don't care and we want to maintain behavior, so the conditional. + final GenomeLoc callableSpan = emitReferenceConfidence ? variantSpan.intersect(originalRegionRange) : variantSpan; + + final Pair nonVariantRegions = nonVariantTargetRegions(originalRegion, callableSpan); + + if ( debug ) { + logger.info("events : " + withinActiveRegion); + logger.info("region : " + originalRegion); + logger.info("callableSpan : " + callableSpan); + logger.info("padding : " + padding); + logger.info("idealSpan : " + idealSpan); + logger.info("maximumSpan : " + maximumSpan); + logger.info("finalSpan : " + finalSpan); + } + + return new Result(emitReferenceConfidence,true,originalRegion,padding, usableExtension,withinActiveRegion,nonVariantRegions,finalSpan,idealSpan,maximumSpan,variantSpan); + } + + /** + * Calculates the list of region to trim away. + * @param targetRegion region for which to generate the flanking regions. + * @param variantSpan the span of the core region containing relevant variation and required padding. + * @return never {@code null}; 0, 1 or 2 element list. 
+ */ + private Pair nonVariantTargetRegions(final ActiveRegion targetRegion, final GenomeLoc variantSpan) { + final GenomeLoc targetRegionRange = targetRegion.getLocation(); + final int finalStart = variantSpan.getStart(); + final int finalStop = variantSpan.getStop(); + + final int targetStart = targetRegionRange.getStart(); + final int targetStop = targetRegionRange.getStop(); + + final boolean preTrimmingRequired = targetStart < finalStart; + final boolean postTrimmingRequired = targetStop > finalStop; + if (preTrimmingRequired) { + final String contig = targetRegionRange.getContig(); + return postTrimmingRequired ? new Pair<>( + locParser.createGenomeLoc(contig, targetStart, finalStart - 1), + locParser.createGenomeLoc(contig, finalStop + 1, targetStop)) : + new Pair<>(locParser.createGenomeLoc(contig, targetStart, finalStart - 1),GenomeLoc.UNMAPPED); + } else if (postTrimmingRequired) + return new Pair<>(GenomeLoc.UNMAPPED,locParser.createGenomeLoc(targetRegionRange.getContig(), finalStop + 1, targetStop)); + else + return new Pair<>(GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED); + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java new file mode 100644 index 000000000..8cadea6ec --- /dev/null +++ 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java @@ -0,0 +1,543 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.collections.CountSet; +import org.broadinstitute.sting.utils.haplotype.EventMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.haplotype.HaplotypeSizeAndBaseComparator; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.*; + +/** + * Collection of read assembly using several kmerSizes. + * + *

+ * There could be a different assembly per each kmerSize. In turn, haplotypes are result of one of those + * assemblies. + *

+ * + *

+ * Where there is more than one possible kmerSize that generates a haplotype we consider the smaller one. + *

+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.com> + */ +public class AssemblyResultSet { + + private final Map assemblyResultByKmerSize; + private final Set haplotypes; + private final Map assemblyResultByHaplotype; + private ActiveRegion regionForGenotyping; + private byte[] fullReferenceWithPadding; + private GenomeLoc paddedReferenceLoc; + private boolean variationPresent; + private Haplotype refHaplotype; + private boolean wasTrimmed = false; + private final CountSet kmerSizes; + private TreeSet variationEvents; + private boolean debug; + private static Logger logger = Logger.getLogger(AssemblyResultSet.class); + + /** + * Constructs a new empty assembly result set. + */ + public AssemblyResultSet() { + assemblyResultByKmerSize = new LinkedHashMap<>(4); + haplotypes = new LinkedHashSet<>(10); + assemblyResultByHaplotype = new LinkedHashMap<>(10); + kmerSizes = new CountSet(4); + } + + + /** + * Change the debug status for this assembly-result-set. + * @param newValue new value for the debug status. + */ + void setDebug(final boolean newValue) { + debug = newValue; + } + + /** + * Trims an assembly result set down based on a new set of trimmed haplotypes. + * + * @param trimmedActiveRegion the trimmed down active region. + * + * @throws NullPointerException if any argument in {@code null} or + * if there are {@code null} entries in {@code originalByTrimmedHaplotypes} for trimmed haplotype keys. + * @throws IllegalArgumentException if there is no reference haplotype amongst the trimmed ones. + * + * @return never {@code null}, a new trimmed assembly result set. 
+ */ + public AssemblyResultSet trimTo(final ActiveRegion trimmedActiveRegion) { + + final Map originalByTrimmedHaplotypes = calculateOriginalByTrimmedHaplotypes(trimmedActiveRegion); + if (refHaplotype == null) throw new IllegalStateException(); + if (trimmedActiveRegion == null) throw new NullPointerException(); + final AssemblyResultSet result = new AssemblyResultSet(); + + for (final Haplotype trimmed : originalByTrimmedHaplotypes.keySet()) { + final Haplotype original = originalByTrimmedHaplotypes.get(trimmed); + if (original == null) + throw new NullPointerException("all trimmed haplotypes must have an original one"); + final AssemblyResult as = assemblyResultByHaplotype.get(original); + if (as == null) result.add(trimmed); else result.add(trimmed, as); + } + + result.setRegionForGenotyping(trimmedActiveRegion); + result.setFullReferenceWithPadding(this.fullReferenceWithPadding); + result.setPaddedReferenceLoc(this.paddedReferenceLoc); + if (result.refHaplotype == null) + throw new IllegalStateException("missing reference haplotype in the trimmed set"); + result.wasTrimmed = true; + return result; + } + + private Map calculateOriginalByTrimmedHaplotypes(final ActiveRegion trimmedActiveRegion) { + if ( debug ) logger.info("Trimming active region " + getRegionForGenotyping() + " with " + getHaplotypeCount() + " haplotypes"); + + final List haplotypeList = getHaplotypeList(); + + // trim down the haplotypes + final Map originalByTrimmedHaplotypes = new HashMap<>(); + + for ( final Haplotype h : haplotypeList ) { + final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc()); + + if ( trimmed != null ) { + if (originalByTrimmedHaplotypes.containsKey(trimmed)) { + if (trimmed.isReference()) { + originalByTrimmedHaplotypes.remove(trimmed); + originalByTrimmedHaplotypes.put(trimmed, h); + } + } else + originalByTrimmedHaplotypes.put(trimmed,h); + } else if (h.isReference()) + throw new IllegalStateException("trimming eliminates the reference haplotype"); + 
else if ( debug ) { + logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() + + " because it starts with or ends with an insertion or deletion when trimmed to " + + trimmedActiveRegion.getExtendedLoc()); + } + } + + // create the final list of trimmed haplotypes + final List trimmedHaplotypes = new ArrayList<>(originalByTrimmedHaplotypes.keySet()); + + // resort the trimmed haplotypes. + Collections.sort(trimmedHaplotypes,new HaplotypeSizeAndBaseComparator()); + final Map sortedOriginalByTrimmedHaplotypes = new LinkedHashMap<>(trimmedHaplotypes.size()); + for (final Haplotype trimmed : trimmedHaplotypes) + sortedOriginalByTrimmedHaplotypes.put(trimmed,originalByTrimmedHaplotypes.get(trimmed)); + + + if ( debug ) logger.info("Trimmed region to " + trimmedActiveRegion.getLocation() + " size " + + trimmedActiveRegion.getLocation().size() + " reduced number of haplotypes from " + + haplotypeList.size() + " to only " + trimmedHaplotypes.size()); + if ( debug ) + for ( final Haplotype remaining: trimmedHaplotypes ) + logger.info("Remains: " + remaining + " cigar " + remaining.getCigar()); + return sortedOriginalByTrimmedHaplotypes; + } + + /** + * Query the reference haplotype in the result set. + * @return {@code null} if none wasn't yet added, otherwise a reference haplotype. + */ + public Haplotype getReferenceHaplotype() { + return refHaplotype; + } + + /** + * Checks whether there is any variation present in the assembly result set. + * + *

+ * This is equivalent to whether there is more than one haplotype. + *

+ * + * @return {@code true} if there is variation present, {@code false} otherwise. + */ + public boolean isVariationPresent() { + return variationPresent && haplotypes.size() > 1; + } + + /** + * Dumps debugging information into a print-writer. + * + * @param pw where to dump the information. + * + * @throws NullPointerException if {@code pw} is {@code null}. + */ + public void debugDump(final PrintWriter pw) { + if (getHaplotypeList().size() == 0) { + return; + } + pw.println("Active Region " + this.regionForGenotyping.getLocation()); + pw.println("Extended Act Region " + this.getRegionForGenotyping().getExtendedLoc()); + pw.println("Ref haplotype coords " + getHaplotypeList().get(0).getGenomeLocation()); + pw.println("Haplotype count " + haplotypes.size()); + final Map kmerSizeToCount = new HashMap<>(); + + for (final Map.Entry e : assemblyResultByHaplotype.entrySet()) { + final AssemblyResult as = e.getValue(); + final int kmerSize = as.getGraph().getKmerSize(); + if (kmerSizeToCount.containsKey(kmerSize)) { + kmerSizeToCount.put(kmerSize,kmerSizeToCount.get(kmerSize) + 1); + } else { + kmerSizeToCount.put(kmerSize,1); + } + } + pw.println("Kmer sizes count " + kmerSizeToCount.entrySet().size() ); + Integer[] kmerSizes = new Integer[kmerSizeToCount.size()]; + kmerSizes = kmerSizeToCount.keySet().toArray(kmerSizes); + Arrays.sort(kmerSizes); + pw.println("Kmer sizes values " + Arrays.toString(kmerSizes)); + for (int size : kmerSizes) { + pw.println("Kmer size " + size + " count " + kmerSizeToCount.get(size)); + } + } + + /** + * Adds a haplotype to the result set without indicating a generating assembly result. + * + *

+ * It is possible to call this method with the same haplotype several times. In that the second and further + * calls won't have any effect (thus returning {@code false}). + *

+ * + * @param h the haplotype to add to the assembly result set. + * + * @throws NullPointerException if {@code h} is {@code null} + * @throws IllegalArgumentException if {@code h} does not have a genome location. + * + * @return {@code true} if the assembly result set has been modified as a result of this call. + */ + public boolean add(final Haplotype h) { + if (h == null) throw new NullPointerException("input haplotype cannot be null"); + if (h.getGenomeLocation() == null) + throw new IllegalArgumentException("the haplotype provided must have a genomic location"); + if (haplotypes.contains(h)) + return false; + haplotypes.add(h); + updateReferenceHaplotype(h); + return true; + } + + /** + * Adds simultaneously a haplotype and the generating assembly-result. + * + *

+ * Haplotypes and their assembly-result can be added multiple times although just the first call will have + * any effect (return value is {@code true}). + *

+ * + * + * @param h haplotype to add. + * @param ar assembly-result that is assumed to have given rise to that haplotype. + * + * @throws NullPointerException if {@code h} or {@code ar} is {@code null}. + * @throws IllegalArgumentException if {@code h} has not defined genome location. + * + * @return {@code true} iff this called changes the assembly result set. + */ + public boolean add(final Haplotype h, final AssemblyResult ar) { + if (h == null) throw new NullPointerException("input haplotype cannot be null"); + if (ar == null) throw new NullPointerException("input assembly-result cannot be null"); + if (h.getGenomeLocation() == null) + throw new IllegalArgumentException("the haplotype provided must have a genomic location"); + + final boolean assemblyResultAdditionReturn = add(ar); + + if (haplotypes.contains(h)) { + final AssemblyResult previousAr = assemblyResultByHaplotype.get(h); + if (previousAr == null) { + assemblyResultByHaplotype.put(h, ar); + return true; + } else if (!previousAr.equals(ar)) + throw new IllegalStateException("there is already a different assembly result for the input haplotype"); + else + return assemblyResultAdditionReturn; + } else { + haplotypes.add(h); + assemblyResultByHaplotype.put(h,ar); + updateReferenceHaplotype(h); + if (h.isNonReference()) variationPresent = true; + return true; + } + } + + /** + * Add a assembly-result object. + * + * @param ar the assembly result to add. + * + * @throws NullPointerException if {@code ar} is {@code null}. + * @throws IllegalStateException if there is an assembly result with the same kmerSize. + * @return {@code true} iff this addition changed the assembly result set. 
+ */ + public boolean add(final AssemblyResult ar) { + if (ar == null) + throw new NullPointerException(); + final int kmerSize = ar.getKmerSize(); + if (assemblyResultByKmerSize.containsKey(kmerSize)) { + if (!assemblyResultByKmerSize.get(kmerSize).equals(ar)) + throw new IllegalStateException("a different assembly result with the same kmerSize was already added"); + return false; + } else { + assemblyResultByKmerSize.put(kmerSize, ar); + kmerSizes.add(kmerSize); + return true; + } + } + + /** + * Returns the current region for genotyping. + * + * @return might be {@code null}. + */ + public ActiveRegion getRegionForGenotyping() { + return regionForGenotyping; + } + + /** + * Sets the region for genotyping. + * + * @param regionForGenotyping the new value. + */ + public void setRegionForGenotyping(final ActiveRegion regionForGenotyping) { + this.regionForGenotyping = regionForGenotyping; + } + + /** + * Returns the current full reference with padding. + * + * @return might be {@code null}. + */ + public byte[] getFullReferenceWithPadding() { + return fullReferenceWithPadding; + } + + /** + * Sets the full reference with padding base sequence. + * + * @param fullReferenceWithPadding the new value. + */ + public void setFullReferenceWithPadding(final byte[] fullReferenceWithPadding) { + this.fullReferenceWithPadding = fullReferenceWithPadding; + } + + /** + * Returns the padded reference location. + * + * @return might be {@code null} + */ + public GenomeLoc getPaddedReferenceLoc() { + return paddedReferenceLoc; + } + + /** + * Changes the padded reference location. + * @param paddedReferenceLoc the new value. + */ + public void setPaddedReferenceLoc(final GenomeLoc paddedReferenceLoc) { + this.paddedReferenceLoc = paddedReferenceLoc; + } + + /** + * Returns the number of haplotypes in the assembly result set. + * @return {@code 0} or greater. + */ + public int getHaplotypeCount() { + return haplotypes.size(); + } + + /** + * Returns the haplotypes as a list. + * + *

+ * The result is unmodifiable. + *

+ * + * @return never {@code null}, but perhaps a empty list if no haplotype was generated during assembly. + */ + public List getHaplotypeList() { + return Arrays.asList(haplotypes.toArray(new Haplotype[haplotypes.size()])); + } + + /** + * Returns the maximum kmerSize available. + * + * @throws IllegalStateException if no assembly-result was added to the set, thus there is no kmerSize. + * + * @return greater than 0. + */ + public int getMaximumKmerSize() { + if (kmerSizes.size() == 0) + throw new IllegalStateException("there is yet no kmerSize in this assembly result set"); + return kmerSizes.max(); + } + + /** + * Indicates whether there are more than one kmerSize in the set. + * + * @return {@code true} iff there is more than one kmerSize assembly in the set. + */ + public boolean hasMultipleKmerSizes() { + return kmerSizes.size() > 1; + } + + /** + * Returns the minimum kmerSize available. + * + * @throws IllegalStateException if no assembly-result was added to the set, thus there is no kmerSize. + * + * @return greater than 0. + */ + public int getMinimumKmerSize() { + if (kmerSizes.size() == 0) + throw new IllegalStateException("there is yet no kmerSize in this assembly result set"); + return kmerSizes.min(); + } + + /** + * Returns a read-threading graph in the assembly set that has a particular kmerSize. + * + * @param kmerSize the requested kmerSize. + * + * @return {@code null} if there is no read-threading-graph amongst assembly results with that kmerSize. + */ + public ReadThreadingGraph getUniqueReadThreadingGraph(final int kmerSize) { + final AssemblyResult assemblyResult = assemblyResultByKmerSize.get(kmerSize); + if (assemblyResult == null) return null; + return assemblyResult.getThreadingGraph(); + } + + /** + * Checks whether this assembly result set was trimmed. + * + * @return {@code true} iff this assembly result set was trimmed. 
+ */ + public boolean wasTrimmed() { + return wasTrimmed; + } + + /** + * Marks the assembly as not having variation even if it has more than one haplotype. + */ + public void resetVariationPresent() { + variationPresent = false; + } + + /** + * Dumps debugging information into a logger. + * + * @param logger where to dump the information. + * + * @throws NullPointerException if {@code logger} is {@code null}. + */ + public void debugDump(final Logger logger) { + final StringWriter sw = new StringWriter(); + final PrintWriter pw = new PrintWriter(sw); + debugDump(pw); + final String str = sw.toString(); + final String[] lines = str.split("\n"); + for (final String line : lines) { + if (line.isEmpty()) { + continue; + } + logger.debug(line); + } + } + + /** + * Given whether a new haplotype that has been already added to {@link #haplotypes} collection is the + * reference haplotype and updates {@link #refHaplotype} accordingly. + * + *

+ * This method assumes that the colling code has verified that the haplotype was not already in {@link #haplotypes} + * I.e. that it is really a new one. Otherwise it will result in an exception if it happen to be a reference + * haplotype and this has already be set. This is the case even if the new haplotypes and the current reference + * are equal. + *

+ * + * @param newHaplotype the new haplotype. + * @throws NullPointerException if {@code newHaplotype} is {@code null}. + * @throws IllegalStateException if there is already a reference haplotype. + */ + private void updateReferenceHaplotype(final Haplotype newHaplotype) { + if (!newHaplotype.isReference()) return; + if (refHaplotype == null) + refHaplotype = newHaplotype; + else // assumes that we have checked wether the haplotype is already in the collection and so is no need to check equality. + throw new IllegalStateException("the assembly-result-set already have a reference haplotype that is different"); + } + + /** + * Returns a sorted set of variant events that best explain the haplotypes found by the assembly + * across kmerSizes. + * + *

+ * The result is sorted incrementally by location. + * + * @return never {@code null}, but perhaps an empty collection. + */ + public TreeSet getVariationEvents() { + if (variationEvents == null) { + final List haplotypeList = getHaplotypeList(); + EventMap.buildEventMapsForHaplotypes(haplotypeList,fullReferenceWithPadding,paddedReferenceLoc,debug); + variationEvents = EventMap.getAllVariantContexts(haplotypeList); + } + return variationEvents; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlock.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlock.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlock.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlock.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlockFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlockFinder.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlockFinder.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlockFinder.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java new file mode 100644 index 000000000..d65251e58 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -0,0 +1,566 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL 
RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.DefaultHashMap; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.EventMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.haplotype.MergeVariantsAcrossHaplotypes; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; + +import java.util.*; + +public class GenotypingEngine { + private final static Logger logger = 
Logger.getLogger(GenotypingEngine.class); + + private final boolean DEBUG; + private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; + private final static List noCall = new ArrayList<>(); // used to noCall all genotypes until the exact model is applied + private final VariantAnnotatorEngine annotationEngine; + private final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger; + + public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine, + final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, + final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger) { + this.DEBUG = DEBUG; + this.annotationEngine = annotationEngine; + this.USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; + noCall.add(Allele.NO_CALL); + this.crossHaplotypeEventMerger = crossHaplotypeEventMerger; + } + + /** + * Carries the result of a call to #assignGenotypeLikelihoods + */ + public static class CalledHaplotypes { + private final List calls; + private final Set calledHaplotypes; + + protected CalledHaplotypes(final List calls, final Set calledHaplotypes) { + if ( calls == null ) throw new IllegalArgumentException("calls cannot be null"); + if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); + if ( Utils.xor(calls.isEmpty(), calledHaplotypes.isEmpty()) ) + throw new IllegalArgumentException("Calls and calledHaplotypes should both be empty or both not but got calls=" + calls + " calledHaplotypes=" + calledHaplotypes); + this.calls = calls; + this.calledHaplotypes = calledHaplotypes; + } + + /** + * Get the list of calls made at this location + * @return a non-null (but potentially empty) list of calls + */ + public List getCalls() { + return calls; + } + + /** + * Get the set of haplotypes that we actually called (i.e., underlying one of the VCs in getCalls(). 
+ * @return a non-null set of haplotypes + */ + public Set getCalledHaplotypes() { + return calledHaplotypes; + } + } + + /** + * Main entry point of class - given a particular set of haplotypes, samples and reference context, compute + * genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling + * + * The list of samples we're working with is obtained from the haplotypeReadMap + * + * @param UG_engine UG Engine with basic input parameters + * @param haplotypes Haplotypes to assign likelihoods to + * @param haplotypeReadMap Map from reads->(haplotypes,likelihoods) + * @param perSampleFilteredReadList + * @param ref Reference bytes at active region + * @param refLoc Corresponding active region genome location + * @param activeRegionWindow Active window + * @param genomeLocParser GenomeLocParser + * @param activeAllelesToGenotype Alleles to genotype + * @param emitReferenceConfidence whether we should add a <NON_REF> alternative allele to the result variation contexts. + * + * @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes + * + */ + @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) + @Ensures("result != null") + // TODO - can this be refactored? this is hard to follow! 
+ public CalledHaplotypes assignGenotypeLikelihoods( final UnifiedGenotyperEngine UG_engine, + final List haplotypes, + final Map haplotypeReadMap, + final Map> perSampleFilteredReadList, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final GenomeLocParser genomeLocParser, + final RefMetaDataTracker tracker, + final List activeAllelesToGenotype, + final boolean emitReferenceConfidence) { + // sanity check input arguments + if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine); + if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); + if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap); + if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); + if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); + if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); + if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); + if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser); + + // update the haplotypes so we're ready to call, getting the ordered list of positions on the reference + // that carry events among the haplotypes + final TreeSet startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype); + + // Walk along each position in the key set and create each event to be outputted + 
final Set calledHaplotypes = new HashSet<>(); + final List returnCalls = new ArrayList<>(); + final Map emptyDownSamplingMap = new DefaultHashMap<>(0.0); + + for( final int loc : startPosKeySet ) { + if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region + final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); + + if( eventsAtThisLoc.isEmpty() ) { continue; } + + // Create the event mapping object which maps the original haplotype events to the events present at just this locus + final Map> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes); + + // Sanity check the priority list for mistakes + final List priorityList = makePriorityList(eventsAtThisLoc); + + // Merge the event to find a common reference representation + + VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); + + final VariantContextBuilder vcb = new VariantContextBuilder(mergedVC); + + if( mergedVC == null ) { continue; } + + final GenotypeLikelihoodsCalculationModel.Model calculationModel = mergedVC.isSNP() + ? 
GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL; + + if (emitReferenceConfidence) { + final List alleleList = new ArrayList<>(); + alleleList.addAll(mergedVC.getAlleles()); + alleleList.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + vcb.alleles(alleleList); + mergedVC = vcb.make(); + } + + final Map mergeMap = new LinkedHashMap<>(); + mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele + for(int iii = 0; iii < eventsAtThisLoc.size(); iii++) { + mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function + } + + final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); + + if( DEBUG ) { + logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); + } + + final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().getSampleContamination() ); + + if (emitReferenceConfidence) addMiscellaneousAllele(alleleReadMap); + + final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC ); + VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), calculationModel); + if( call != null ) { + final Map alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? 
alleleReadMap : + convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, emptyDownSamplingMap ) ); + if (emitReferenceConfidence) addMiscellaneousAllele(alleleReadMap_annotations); + final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); + + VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(tracker, stratifiedReadMap, call); + + if( call.getAlleles().size() != mergedVC.getAlleles().size() ) + annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); + + // maintain the set of all called haplotypes + for ( final Allele calledAllele : call.getAlleles() ) { + final List haplotypeList = alleleMapper.get(calledAllele); + if (haplotypeList == null) continue; + calledHaplotypes.addAll(haplotypeList); + } + + returnCalls.add( annotatedCall ); + } + } + } + + return new CalledHaplotypes(returnCalls, calledHaplotypes); + } + + /** + * Add the allele + * @param stratifiedReadMap target per-read-allele-likelihood-map. + */ + public static Map addMiscellaneousAllele(final Map stratifiedReadMap) { + final Allele miscellanoeusAllele = GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE; + for (Map.Entry perSample : stratifiedReadMap.entrySet()) { + for (Map.Entry> perRead : perSample.getValue().getLikelihoodReadMap().entrySet()) { + double bestLikelihood = Double.NEGATIVE_INFINITY; + double secondBestLikelihood = Double.NEGATIVE_INFINITY; + for (Map.Entry perAllele : perRead.getValue().entrySet()) { + final double value = perAllele.getValue(); + if (value > bestLikelihood) { + secondBestLikelihood = bestLikelihood; + bestLikelihood = value; + } else if (value < bestLikelihood && value > secondBestLikelihood) { + secondBestLikelihood = value; + } + } + final double miscellanousLikelihood = Double.isInfinite(secondBestLikelihood) ? 
bestLikelihood : secondBestLikelihood; + perSample.getValue().add(perRead.getKey(),miscellanoeusAllele,miscellanousLikelihood); + } + } + return stratifiedReadMap; + } + + /** + * Go through the haplotypes we assembled, and decompose them into their constituent variant contexts + * + * @param haplotypes the list of haplotypes we're working with + * @param haplotypeReadMap map from samples -> the per read allele likelihoods + * @param ref the reference bases (over the same interval as the haplotypes) + * @param refLoc the span of the reference bases + * @param activeAllelesToGenotype alleles we want to ensure are scheduled for genotyping (GGA mode) + * @return + */ + private TreeSet decomposeHaplotypesIntoVariantContexts(final List haplotypes, + final Map haplotypeReadMap, + final byte[] ref, + final GenomeLoc refLoc, + final List activeAllelesToGenotype) { + final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); + + // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file + final TreeSet startPosKeySet = EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, DEBUG); + + if ( in_GGA_mode ) startPosKeySet.clear(); + + //cleanUpSymbolicUnassembledEvents( haplotypes ); // We don't make symbolic alleles so this isn't needed currently + if ( !in_GGA_mode ) { + // run the event merger if we're not in GGA mode + final boolean mergedAnything = crossHaplotypeEventMerger.merge(haplotypes, haplotypeReadMap, startPosKeySet, ref, refLoc); + if ( mergedAnything ) + cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events + } + + if ( in_GGA_mode ) { + for( final VariantContext compVC : activeAllelesToGenotype ) { + startPosKeySet.add( compVC.getStart() ); + } + } + + return startPosKeySet; + } + + /** + * Get the priority list (just the list of sources for these variant context) used to merge overlapping events into common reference view + * 
@param vcs a list of variant contexts + * @return the list of the sources of vcs in the same order + */ + private List makePriorityList(final List vcs) { + final List priorityList = new LinkedList<>(); + for ( final VariantContext vc : vcs ) priorityList.add(vc.getSource()); + return priorityList; + } + + private List getVCsAtThisLocation(final List haplotypes, + final int loc, + final List activeAllelesToGenotype) { + // the overlapping events to merge into a common reference view + final List eventsAtThisLoc = new ArrayList<>(); + + if( activeAllelesToGenotype.isEmpty() ) { + for( final Haplotype h : haplotypes ) { + final EventMap eventMap = h.getEventMap(); + final VariantContext vc = eventMap.get(loc); + if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { + eventsAtThisLoc.add(vc); + } + } + } else { // we are in GGA mode! + int compCount = 0; + for( final VariantContext compVC : activeAllelesToGenotype ) { + if( compVC.getStart() == loc ) { + int alleleCount = 0; + for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { + List alleleSet = new ArrayList<>(2); + alleleSet.add(compVC.getReference()); + alleleSet.add(compAltAllele); + final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount; + // check if this event is already in the list of events due to a repeat in the input alleles track + final VariantContext candidateEventToAdd = new VariantContextBuilder(compVC).alleles(alleleSet).source(vcSourceName).make(); + boolean alreadyExists = false; + for( final VariantContext eventToTest : eventsAtThisLoc ) { + if( eventToTest.hasSameAllelesAs(candidateEventToAdd) ) { + alreadyExists = true; + } + } + if( !alreadyExists ) { + eventsAtThisLoc.add(candidateEventToAdd); + } + alleleCount++; + } + } + compCount++; + } + } + + return eventsAtThisLoc; + } + + /** + * For a particular event described in inputVC, form PL vector for each sample by looking into allele read map and filling likelihood matrix for each allele + * 
@param alleleReadMap Allele map describing mapping from reads to alleles and corresponding likelihoods + * @param mergedVC Input VC with event to genotype + * @return GenotypesContext object wrapping genotype objects with PLs + */ + @Requires({"alleleReadMap!= null", "mergedVC != null"}) + @Ensures("result != null") + private GenotypesContext calculateGLsForThisEvent( final Map alleleReadMap, final VariantContext mergedVC ) { + final GenotypesContext genotypes = GenotypesContext.create(alleleReadMap.size()); + // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample + for( final String sample : alleleReadMap.keySet() ) { + final int numHaplotypes = mergedVC.getAlleles().size(); + final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; + final double[][] haplotypeLikelihoodMatrix = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles(), true); + int glIndex = 0; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ) { + genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC + } + } + genotypes.add(new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make()); + } + return genotypes; + } + + private static Map filterToOnlyOverlappingReads( final GenomeLocParser parser, + final Map perSampleReadMap, + final Map> perSampleFilteredReadList, + final VariantContext call ) { + + final Map returnMap = new LinkedHashMap<>(); + final GenomeLoc callLoc = parser.createGenomeLoc(call); + for( final Map.Entry sample : perSampleReadMap.entrySet() ) { + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + + for( final Map.Entry> mapEntry : sample.getValue().getLikelihoodReadMap().entrySet() ) { + // only count the read if it overlaps the event, otherwise it is not added to 
the output read list at all + if( callLoc.overlapsP(parser.createGenomeLoc(mapEntry.getKey())) ) { // BUGBUG: This uses alignment start and stop, NOT soft start and soft end... + for( final Map.Entry alleleDoubleEntry : mapEntry.getValue().entrySet() ) { + likelihoodMap.add(mapEntry.getKey(), alleleDoubleEntry.getKey(), alleleDoubleEntry.getValue()); + } + } + } + + // add all filtered reads to the NO_CALL list because they weren't given any likelihoods + for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { + // only count the read if it overlaps the event, otherwise it is not added to the output read list at all + if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { + for( final Allele allele : call.getAlleles() ) { + likelihoodMap.add(read, allele, 0.0); + } + } + } + + returnMap.put(sample.getKey(), likelihoodMap); + } + return returnMap; + } + + /** + * Removes symbolic events from list of haplotypes + * @param haplotypes Input/output list of haplotypes, before/after removal + */ + // TODO - split into input haplotypes and output haplotypes as not to share I/O arguments + @Requires("haplotypes != null") + protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) { + final List haplotypesToRemove = new ArrayList<>(); + for( final Haplotype h : haplotypes ) { + for( final VariantContext vc : h.getEventMap().getVariantContexts() ) { + if( vc.isSymbolic() ) { + for( final Haplotype h2 : haplotypes ) { + for( final VariantContext vc2 : h2.getEventMap().getVariantContexts() ) { + if( vc.getStart() == vc2.getStart() && (vc2.isIndel() || vc2.isMNP()) ) { // unfortunately symbolic alleles can't currently be combined with non-point events + haplotypesToRemove.add(h); + break; + } + } + } + } + } + } + haplotypes.removeAll(haplotypesToRemove); + } + + // BUGBUG: ugh, too complicated + protected Map convertHaplotypeReadMapToAlleleReadMap( final Map haplotypeReadMap, + final Map> alleleMapper, + final Map 
perSampleDownsamplingFraction ) { + + final Map alleleReadMap = new LinkedHashMap<>(); + for( final Map.Entry haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); + for( final Map.Entry> alleleMapperEntry : alleleMapper.entrySet() ) { // for each output allele + final List mappedHaplotypes = alleleMapperEntry.getValue(); + for( final Map.Entry> readEntry : haplotypeReadMapEntry.getValue().getLikelihoodReadMap().entrySet() ) { // for each read + double maxLikelihood = Double.NEGATIVE_INFINITY; + for( final Map.Entry alleleDoubleEntry : readEntry.getValue().entrySet() ) { // for each input allele + if( mappedHaplotypes.contains( new Haplotype(alleleDoubleEntry.getKey())) ) { // exact match of haplotype base string + maxLikelihood = Math.max( maxLikelihood, alleleDoubleEntry.getValue() ); + } + } + perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood); + } + } + perReadAlleleLikelihoodMap.performPerAlleleDownsampling(perSampleDownsamplingFraction.get(haplotypeReadMapEntry.getKey())); // perform contamination downsampling + alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap); + } + + return alleleReadMap; + } + + protected static Map> createAlleleMapper( final Map mergeMap, final Map> eventMap ) { + final Map> alleleMapper = new LinkedHashMap<>(); + for( final Map.Entry entry : mergeMap.entrySet() ) { + alleleMapper.put(entry.getValue(), eventMap.get(new Event(entry.getKey()))); + } + return alleleMapper; + } + + @Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"}) + @Ensures({"result.size() == eventsAtThisLoc.size() + 1"}) + protected static Map> createEventMapper( final int loc, final List eventsAtThisLoc, final List haplotypes ) { + + final Map> eventMapper = new LinkedHashMap<>(eventsAtThisLoc.size()+1); + final Event refEvent = new Event(null); + eventMapper.put(refEvent, 
new ArrayList()); + for( final VariantContext vc : eventsAtThisLoc ) { + eventMapper.put(new Event(vc), new ArrayList()); + } + + for( final Haplotype h : haplotypes ) { + if( h.getEventMap().get(loc) == null ) { + eventMapper.get(refEvent).add(h); + } else { + for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { + if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { + eventMapper.get(new Event(vcAtThisLoc)).add(h); + break; + } + } + } + } + + return eventMapper; + } + + @Ensures({"result.size() == haplotypeAllelesForSample.size()"}) + protected static List findEventAllelesInSample( final List eventAlleles, final List haplotypeAlleles, final List haplotypeAllelesForSample, final List> alleleMapper, final List haplotypes ) { + if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; } + final List eventAllelesForSample = new ArrayList<>(); + for( final Allele a : haplotypeAllelesForSample ) { + final Haplotype haplotype = haplotypes.get(haplotypeAlleles.indexOf(a)); + for( int iii = 0; iii < alleleMapper.size(); iii++ ) { + final List mappedHaplotypes = alleleMapper.get(iii); + if( mappedHaplotypes.contains(haplotype) ) { + eventAllelesForSample.add(eventAlleles.get(iii)); + break; + } + } + } + return eventAllelesForSample; + } + + @Deprecated + protected static Map generateVCsFromAlignment( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) { + return new EventMap(haplotype, ref, refLoc, sourceNameToAdd); + } + + protected static boolean containsVCWithMatchingAlleles( final List list, final VariantContext vcToTest ) { + for( final VariantContext vc : list ) { + if( vc.hasSameAllelesAs(vcToTest) ) { + return true; + } + } + return false; + } + + protected static class Event { + public VariantContext vc; + + public Event( final VariantContext vc ) { + this.vc = vc; + } + + @Override + public boolean equals( final Object obj ) { + return obj instanceof Event && ((((Event) obj).vc == 
null && vc == null) || (((Event) obj).vc != null && vc != null && ((Event) obj).vc.hasSameAllelesAs(vc))) ; + } + + @Override + public int hashCode() { + return (vc == null ? -1 : vc.getAlleles().hashCode()); + } + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java new file mode 100644 index 000000000..66ea7be03 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java @@ -0,0 +1,915 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.CountSet; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pairhmm.FlexibleHMM; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.util.*; + +/** + * Fast pseudo-likelihood calculation engine based on the assembly haplotype graph. + * + *

+ * An instance is good for a single active region. {@link GraphBasedLikelihoodCalculationEngine} instantiates them on demand + * as requested by the {@code HaplotypeCaller} code. +

+ */ +public class GraphBasedLikelihoodCalculationEngineInstance { + + private final static Logger logger = Logger.getLogger(GraphBasedLikelihoodCalculationEngineInstance.class); + + + /** + * Unified kmer size used for the Haplotype graph. + */ + protected final int kmerSize; + + /** + * Reference to the haplotype graph. + */ + protected final HaplotypeGraph haplotypeGraph; + + /** + * Haplotypes included in the haplotype graph. + */ + private final List haplotypes; + + /** + * Whether there is some variation present in the haplotype assembly. + */ + private final boolean hasVariation; + + + /** + * Counts of reads that anchored somewhere. + * + *

Used for debugging purposes

+ */ + private int anchoredReads = 0; + + /** + * Count of reads that didn't anchor anywhere. + * + *

Used for debugging purposes

+ */ + private int nonAnchoredReads = 0; + + /** + * Pair-hmm implementation to use to calculate read likelihoods. + */ + private final FlexibleHMM hmm; + + /** + * Holds the log10 probability of passing from an indel to a match. + */ + private final double indelToMatchTransitionLog10Probability; + + /** + * Maximum likelihood difference between the reference haplotype and the best alternative haplotype. + * + *

If the difference is greater for a read, the reference haplotype likelihood is increased in order to not go + * beyond this limit

+ */ + protected final double log10globalReadMismappingRate; + + protected final EventBlockFinder eventBlockSearchEngine; + + + /** + * Constructs a new engine based on the results of the assembly. + * + * @param assemblyResultSet assembly-result set + * @param hmm fast-hmm implementation to use. + * @param log10globalReadMismappingRate maximum cost for the reference haplotype vs the best alternative available. + * @param heterogeneousKmerSizeResolution multi-kmersize dataset resolution. + * @throws NullPointerException if any argument is null. + * @throws IllegalArgumentException if log10globalReadMismappingRate >= 0. + */ + public GraphBasedLikelihoodCalculationEngineInstance(final AssemblyResultSet assemblyResultSet, final FlexibleHMM hmm, final double log10globalReadMismappingRate, final HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution) { + if (heterogeneousKmerSizeResolution == null) throw new NullPointerException("the kmerSize resolution cannot be null"); + if (assemblyResultSet == null) throw new NullPointerException("the assembly result set cannot be null"); + if (hmm == null) throw new NullPointerException("the fast-hmm component cannot be null"); + if (log10globalReadMismappingRate >= 0) + throw new IllegalArgumentException("the global reading mismapping rate cannot be positive or zero"); + + this.hmm = hmm; + this.indelToMatchTransitionLog10Probability = QualityUtils.qualToProbLog10(hmm.getGapExtensionPenalty()); + this.log10globalReadMismappingRate = log10globalReadMismappingRate; + + haplotypes = new ArrayList<>(assemblyResultSet.getHaplotypeList()); + Collections.sort(haplotypes, Haplotype.ALPHANUMERICAL_COMPARATOR); + + // make sure that kmerSize is not bigger than the smallest haplotype. It can well happen when there are cycles and kmerSize inflates. 
+ final Haplotype referenceHaplotype = assemblyResultSet.getReferenceHaplotype(); + int minHaplotypeLength = referenceHaplotype.length(); + for (final Haplotype h : haplotypes) + if (minHaplotypeLength > h.length()) + minHaplotypeLength = h.length(); + + // Determine the kmerSize to use for the unified haplotype assembly graph + + kmerSize = Math.min(minHaplotypeLength, + heterogeneousKmerSizeResolution.useMaximum() ? assemblyResultSet.getMaximumKmerSize() : assemblyResultSet.getMinimumKmerSize()); + + haplotypeGraph = new HaplotypeGraph(kmerSize,haplotypes); + + + if (haplotypeGraph.hasCycles()) + Utils.warnUser(logger, "cycle caused at merging haplotypes with different kmerSizes: active region " + assemblyResultSet.getRegionForGenotyping() + " will be skipped"); + + //TODO haplpotypeGraph.getReferenceSourceVertex() == null + //TODO Is a quick patch to ignore cases where the trimming has rendered kmerSize so big that is bigger than the haplotype + //TODO and reduction to the minimum haplotype size result in no unique kmers. + //TODO the actual solution: we need to impose a maximum trimming at least for Graph-based HC runs as we are loosing + //TODO a bit of sensitivity as trimming results in lack of unique kmers. + if (haplotypeGraph.hasCycles() || haplotypeGraph.getReferenceHaplotype() == null) { + hasVariation = false; + eventBlockSearchEngine = null; + return; + } + + haplotypeGraph.mergeCommonChains(); + //TODO recover dangling ends. Did not work the last time I tried but may be worth to retry. + //haplotypeGraph.recoverDanglingTails(-1); + logger.debug("using haplotype graph with kmerSize " + haplotypeGraph.getKmerSize()); + + hasVariation = !haplotypeGraph.hasCycles() && haplotypeGraph.getHaplotypes().size() > 1; + + eventBlockSearchEngine = new EventBlockFinder(haplotypeGraph); + } + + /** + * Determines whether based on result from assembly and the relevant user options we can reuse th existing + * + * @param assemblyResultSet assembly result set. 
+ * @param kmerSize intended kmerSize for the haplotype graph. + * @param heterogeneousKmerSizeResolution user instruction as to how to resolve situation where we have haplotypes comming from different kmer sizes. + * @return {@code true} iff we can reuse an existing read-threading graph with that kmerSize in the assembly result set. + */ + @SuppressWarnings("unused") + private static boolean canReuseReadThreadingGraphAsHaplotypeGraph(final AssemblyResultSet assemblyResultSet, final int kmerSize, final HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution) { + return !assemblyResultSet.wasTrimmed() && (!assemblyResultSet.hasMultipleKmerSizes() || heterogeneousKmerSizeResolution.combinesKmerSizes()) && + assemblyResultSet.getUniqueReadThreadingGraph(kmerSize) != null; + } + + /** + * Checks whether the underlying haplotype graph assembly contains any variation worth analyzing. + * + * @return {@code true} iff so. + */ + public boolean hasVariation() { + return hasVariation; + } + + /** + * Calculates the likelihood of reads across many samples evaluated against haplotypes resulting from the + * active region assembly process. + * + * @param haplotypes to evaluate. + * @param perSampleReadList the input read sets stratified per sample. + * + * @throws NullPointerException if either parameter is {@code null}. + * + * @return never {@code null}, and with at least one entry for input sample (keys in {@code perSampleReadList}. + * The value maps can be potentially empty though. 
+ */ + public Map computeReadLikelihoods( + final List haplotypes, + final Map> perSampleReadList) { + // General preparation on the input haplotypes: + Collections.sort(haplotypes, Haplotype.ALPHANUMERICAL_COMPARATOR); + final Map alleleVersions = new LinkedHashMap<>(haplotypes.size()); + for (final Haplotype haplotype : haplotypes) + alleleVersions.put(haplotype, Allele.create(haplotype,haplotype.isReference())); + + // The actual work: + final HashMap result = new HashMap<>(perSampleReadList.size()); + for (final Map.Entry> e : perSampleReadList.entrySet()) { + final String sample = e.getKey(); + final List reads = e.getValue(); + final Set mayNeedAdjustment = new HashSet<>(reads.size()); + // Get the cost/likelihood of each read at relevant subpaths on the tree: + final Map> costsByEndingVertex = calculatePathCostsByRead(reads, mayNeedAdjustment); + // Create the resulting per-read maps: + final PerReadAlleleLikelihoodMap prallm = calculatePerReadAlleleLikelihoodMap(haplotypes, costsByEndingVertex, alleleVersions); + result.put(sample, prallm); + } + logger.debug("Likelihood analysis summary: reads anchored " + anchoredReads + "/" + (anchoredReads + nonAnchoredReads) + ""); + return result; + } + + + /** + * Prints a graph into a dot file. + * + * @param fileName name of the output file. + */ + public void printGraph(final String fileName) { + if (haplotypeGraph != null) + haplotypeGraph.printGraph(fileName); + } + + /** + * Returns the kmerSize the engine is using to match read vs graph kmers thus reducing computation. + * + * @return greater than 0. + */ + public int getKmerSize() { + return kmerSize; + } + + /** + * Tells whether the underlying haplotype graph contained cycles. + * + * @return {@code true} iff so. + */ + public boolean hasCycles() { + // It is set to null if it contained cycles. + return haplotypeGraph == null; + } + + + /** + * Builds the result per-read allele likelihood map. + * + * @param haplotypes haplotypes to process. 
+ * @param costsEndingByVertex Read vs haplotype graph subpaths cost indexed by ending vertex. + * @param alleleVersions map between haplotypes and the corresponding allele. + * @return never {@code null} although perhaps empty. + */ + protected PerReadAlleleLikelihoodMap calculatePerReadAlleleLikelihoodMap( + final Collection haplotypes, + final Map> costsEndingByVertex, final Map alleleVersions) { + + final PerReadAlleleLikelihoodMap result = new PerReadAlleleLikelihoodMap(); + if (haplotypeGraph == null) + return result; + final Map maxAlleleLogLk = new HashMap<>(anchoredReads + nonAnchoredReads + 10); + final Set supportedHaplotypes = new LinkedHashSet<>(haplotypeGraph.getHaplotypes()); + supportedHaplotypes.retainAll(haplotypes); + for (final Haplotype haplotype : supportedHaplotypes) + calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(haplotype, alleleVersions, result, maxAlleleLogLk, costsEndingByVertex); + + makeLikelihoodAdjustment(alleleVersions, result, maxAlleleLogLk.keySet(), maxAlleleLogLk); + applyGlobalReadMismappingRate(alleleVersions, result, maxAlleleLogLk); + return result; + } + + /** + * Work done per haplotype to build the result per-read allele likelihood map. + *

+ *

+ * Basically for each haplotype we go through its path in the graph collecting all the read costs that we find + * on the way. For each read present we add up all its costs resulting in a single value per read, i.e. its + * "likelihood". + *

+ * + * @param haplotype the target haplotype + * @param alleleVersions allele version of the haplotypes. These are the ones to be used in the final output. + * @param result target where to add the read-vs-haplotype likelihoods. + * @param maxAlleleLogLk where to place the maximum likelihood achieve on any haplotype for each read. + * @param costsEndingByVertex read costs assorted by their end vertex. + */ + private void calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(final Haplotype haplotype, + final Map alleleVersions, + final PerReadAlleleLikelihoodMap result, + final Map maxAlleleLogLk, + final Map> costsEndingByVertex) { + final HaplotypeRoute haplotypeRoute = haplotypeGraph.getHaplotypeRoute(haplotype); + final Set haplotypeVertices = haplotypeRoute.vertexSet(); + final Map readCostByRead = new HashMap<>(); + final Set visitedVertices = new HashSet<>(haplotypeVertices.size()); + final List edgeList = haplotypeRoute.getEdges(); + MultiDeBruijnVertex currentVertex = haplotypeRoute.getFirstVertex(); + Route pathSoFar = new Route<>(currentVertex, haplotypeGraph); + final Iterator edgeIterator = edgeList.iterator(); + while (true) { + visitedVertices.add(currentVertex); + final Set finishingAtElementCostSet = costsEndingByVertex.get(currentVertex); + updateReadCosts(readCostByRead, visitedVertices, pathSoFar, finishingAtElementCostSet); + if (!edgeIterator.hasNext()) break; + final MultiSampleEdge nextEdge = edgeIterator.next(); + pathSoFar = new Route<>(pathSoFar, nextEdge); + currentVertex = pathSoFar.getLastVertex(); + } + + final List readCosts = new ArrayList<>(readCostByRead.values()); + Collections.sort(readCosts, ReadCost.COMPARATOR); + for (final ReadCost rc : readCosts) + result.add(rc.read, alleleVersions.get(haplotype), rc.getCost()); + + for (final ReadCost rc : readCosts) { + final Double currentMax = maxAlleleLogLk.get(rc.read); + if (currentMax == null || currentMax < rc.getCost()) + maxAlleleLogLk.put(rc.read, rc.getCost()); + } + } + + 
/** + * Update the read cost based on the path cost found at a vertex. + * + * @param readCosts collection of read costs so far + * @param visitedVertices visited vertices collection. + * @param pathSoFar the haplotype path visited so far. + * @param finishingAtElementCostSet collection of path cost to process + */ + private void updateReadCosts(final Map readCosts, + final Set visitedVertices, + final Route pathSoFar, + final Set finishingAtElementCostSet) { + if (finishingAtElementCostSet != null) { + for (final ReadSegmentCost pc : finishingAtElementCostSet) { + if (!visitedVertices.contains(pc.path.getFirstVertex())) + continue; + if (!pathSoFar.isSuffix(pc.path)) + continue; + ReadCost rc = readCosts.get(pc.read); + if (rc == null) + readCosts.put(pc.read, rc = new ReadCost(pc.read,indelToMatchTransitionLog10Probability)); + rc.addCost(pc.getCost()); + } + } + } + + /** + * Likelihood penalty for unreported haplotype vs read likelihood with respect to the worst reported one. + */ + private static final int UNREPORTED_HAPLOTYPE_LIKELIHOOD_PENALTY = -3; + + /** + * Re-scales all haplotype vs read likelihoods so that for read, the best haplotype, hash likelihood 0. + * + * @param alleleVersions map between input haplotypes and output alleles. + * @param result where to change the likelihoods. + * @param mayNeedAdjustment set of read that might need adjustment. Others might be ignored. + * @param maxAlternative map from each read and the maximum alternative haplotype likelihood. 
+ */ + @SuppressWarnings("unused") + private void makeLikelihoodAdjustment(final Map alleleVersions, + final PerReadAlleleLikelihoodMap result, + final Set mayNeedAdjustment, + final Map maxAlternative) { + final Map> map = result.getLikelihoodReadMap(); + + for (final GATKSAMRecord read : mayNeedAdjustment) { + final Map existingLikelihoods = map.get(read); + if (existingLikelihoods != null) { + Allele bestAllele = null; + double worstRelativeLikelihood = 0; + double bestRelativeLikelihood = Double.NEGATIVE_INFINITY; + for (final Map.Entry entry : map.get(read).entrySet()) { + final double candidateRelativeLikelihood = entry.getValue(); + if (candidateRelativeLikelihood > bestRelativeLikelihood) { + bestAllele = entry.getKey(); + bestRelativeLikelihood = candidateRelativeLikelihood; + } + if (!Double.isInfinite(candidateRelativeLikelihood) && worstRelativeLikelihood > candidateRelativeLikelihood) + worstRelativeLikelihood = candidateRelativeLikelihood; + } + + worstRelativeLikelihood += UNREPORTED_HAPLOTYPE_LIKELIHOOD_PENALTY; + if (bestAllele == null) + throw new IllegalStateException("No best allele for read " + read.getReadName()); + final double bestLikelihood = 0.0; // the best becomes zero. + maxAlternative.put(read, bestLikelihood); + for (final Map.Entry entry : alleleVersions.entrySet()) { + final Allele a = entry.getValue(); + final Double relativeLikelihoodO = existingLikelihoods.get(a); + final double relativeLikelihood = relativeLikelihoodO == null ? worstRelativeLikelihood : relativeLikelihoodO; + final double likelihood = relativeLikelihood - bestRelativeLikelihood + bestLikelihood; + if (likelihood > 0) + throw new IllegalStateException("Likelihood larger than 1 with read " + read.getReadName()); + existingLikelihoods.put(a, likelihood); + } + } + } + } + + /** + * Makes sure that the reference allele likelihood is not too much smaller that the best alternative allele. 
+ * The justification of this constraint is explained in + * {@link PairHMMLikelihoodCalculationEngine#computeDiploidHaplotypeLikelihoods}. + * + * @param alleleVersions correspondence between input haplotypes and output alleles. + * @param result the target result map. + * @param maxAlleleLogLk for each read indicates the likelihood of the best alternative allele. + */ + private void applyGlobalReadMismappingRate(final Map alleleVersions, + final PerReadAlleleLikelihoodMap result, + final Map maxAlleleLogLk) { + if (!Double.isNaN(log10globalReadMismappingRate) && !Double.isInfinite(log10globalReadMismappingRate)) { + final Allele referenceAllele = alleleVersions.get(haplotypeGraph.getReferenceHaplotype()); + for (final Map.Entry> entry : result.getLikelihoodReadMap().entrySet()) { + final GATKSAMRecord read = entry.getKey(); + final Map likelihoods = entry.getValue(); + final Double maxLogLk = maxAlleleLogLk.get(read); + if (maxLogLk == null) continue; // BUGFIX: was 'maxAlleleLogLk == null' (the map, never null) — absent read caused an unboxing NPE below. + final Double referenceLogLk = likelihoods.get(referenceAllele); + final Double minReferenceLogLk = maxLogLk + log10globalReadMismappingRate; + if (referenceLogLk == null || referenceLogLk < minReferenceLogLk) + likelihoods.put(referenceAllele, minReferenceLogLk); + } + } + } + + /** + * Calculates path costs for a set of reads. + *

+ *

+ * The resulting map has one entry per read, where the read is the key and the value list of path-cost sets. + * Each element in that list corresponds to an event block. Each path cost in one of those sets indicate the + * likelihood (cost) of traversing a possible path across the event block using that read. + *

+ * + * @param reads reads to analyze. + * @param mayNeedAdjustment set where to add reads whose likelihood might need adjustment. + * @return never {@code null}. + */ + protected Map> calculatePathCostsByRead( + final List reads, final Set mayNeedAdjustment) { + final Map> result = new HashMap<>(reads.size()); + if (!hasVariation) + return Collections.emptyMap(); + for (final GATKSAMRecord r : reads) { + calculatePathCostsByRead(r, mayNeedAdjustment, result); + } + return result; + } + + /** + * Calculates path cost for a single read. + * + * @param read target read. + * @param mayNeedAdjustment set where to add read whose likelihood might need adjustment. + * @param result map where to add the result. + */ + private void calculatePathCostsByRead(final GATKSAMRecord read, final Set mayNeedAdjustment, + final Map> result) { + + final ReadAnchoring anchoring = new ReadAnchoring(read,haplotypeGraph); + // cannot anchor so go the tradition pair-hmm way. + hmm.loadRead(read); + if (!anchoring.isAnchoredSomewhere()) { + defaultToRegularPairHMM(anchoring, result); + nonAnchoredReads++; + return; + } + + calculateReadSegmentCosts(anchoring, hmm, result); + + if (!anchoring.isPerfectAnchoring()) danglingEndPathCosts(anchoring, hmm, result); + mayNeedAdjustment.add(read); + anchoredReads++; + } + + /** + * Calculates read vs haplotype likelihoods using the classic PairHMM approach. + *

+ *

+ * It basically compares the read with each haplotype's full path without short cuts. + *

+ * + * @param anchoring anchoring information of the read. + * @param destination where to leave the results indexed by ending veretex. + */ + private void defaultToRegularPairHMM(final ReadAnchoring anchoring, final Map> destination) { + + for (final Map.Entry entry : haplotypeGraph.getHaplotypeRouteMap().entrySet()) { + if (entry.getValue() == null) continue; + final byte[] haplotypeBases = entry.getKey().getBases(); + hmm.loadHaplotypeBases(haplotypeBases); + final double cost = hmm.calculateLocalLikelihood(0, anchoring.read.getReadLength(), 0, haplotypeBases.length, false); + final ReadSegmentCost readSegmentCost = new ReadSegmentCost(anchoring.read, entry.getValue(), cost); + addReadSegmentCost(destination, readSegmentCost); + } + } + + /** + * Add a new read-segment-cost to an ending vertex indexed map. + * @param destination where to add the read-segment-cost. + * @param cost the read-segment-cost to add. + */ + private void addReadSegmentCost(final Map> destination, final ReadSegmentCost cost) { + final MultiDeBruijnVertex endVertex = cost.path.getLastVertex(); + Set vpcSet = destination.get(endVertex); + if (vpcSet == null) + destination.put(endVertex, vpcSet = new HashSet<>(10)); + vpcSet.add(cost); + } + + /** + * Calculate the likelihood cost of path section of a read across the graph. + *

+ *

+ * Given a read, its anchors and other unique kmers mappable to the reference path we can divide the graph + * into event blocks: a set of one or more variations and the possible path across that block. + *

+ *

+ *

+ * The result value will have one element for each block. Each element is the set of all path costs (likelihoods) + * to traverse the block using all possible paths (different haplotypes). + *

+ *

+ *

+ * The current implementation has some added complexity in order to avoid a situation where the last part + * of the anchored section of the read is thrown out. We first determine the last event block boundaries and we + * make sure that we won't run over its left limit when covering for earlier event blocks. + *

+ * + * @param anchoring target read graph anchoring information. + * @param hmm the pair-hmm calculation engine. It must have been loaded with the same {@code read} already. + * @param destination where to add the costs. + */ + private void calculateReadSegmentCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm, final Map> destination) { + + final EventBlockFinder.Traversal traversal = eventBlockSearchEngine.traversal(anchoring); + + for (final EventBlock eventBlock : traversal) { + + // final Set> acrossBlockPaths = + // calculateAllPathsBetweenVertices(anchoring, + // eventBlock.getSource(), eventBlock.getSink());//eventBlock.getRoutesAcross(); + + final Set> acrossBlockPaths = eventBlock.getRoutesAcross(); + + int leftBlockBoundaryIndex = anchoring.uniqueKmerOffsets.get(eventBlock.getSource()); + int rightBlockBoundaryIndex = anchoring.uniqueKmerOffsets.get(eventBlock.getSink()); + calculateCostForPathSet(anchoring.read, acrossBlockPaths, hmm, leftBlockBoundaryIndex, rightBlockBoundaryIndex, true, false, null, null, destination); + } + } + + /** + * Calculate path cost for a set of paths across a event block. + * + * @param read the target read. + * @param acrossBlockPaths event block paths to evaluate. + * @param hmm pair-hmm engine to use to calculate likelihoods. + * @param beforeBlockReadOffset kmer offset on the read for the vertex kmer before the block. + * @param afterBlockReadOffset kmer offset on the read for the vertex kmer after the block. + * @param doClipping whether to perform any clipping in order to save cpu time. + * @param prependVertex if not null, the end cost path with be prepended with this vertex. + * @param appendVertex if not null, the end cost path will be appended with this vertex. + * @param includePathEnds whether to include or exclude the vertices at the very end or beginning of the paths. 
+ */ + private void calculateCostForPathSet( + final GATKSAMRecord read, final Set> acrossBlockPaths, + final FlexibleHMM hmm, final int beforeBlockReadOffset, final int afterBlockReadOffset, + final boolean doClipping, final boolean includePathEnds, + final MultiDeBruijnVertex prependVertex, + final MultiDeBruijnVertex appendVertex, + final Map> destination) { + + + final Set readSegmentCosts = new TreeSet<>(ReadSegmentComparator.INSTANCE); + + final int readStart = beforeBlockReadOffset + kmerSize; + final int readEnd = Math.max(readStart, afterBlockReadOffset + kmerSize - 1); + final byte[][] pathBases = new byte[acrossBlockPaths.size()][]; + final CountSet pathSizes = new CountSet(acrossBlockPaths.size()); + int nextPath = 0; + + // Complete the read segment cost with the corresponding path bases + for (final Route p : acrossBlockPaths) { + final ReadSegmentCost readSegmentCost = new ReadSegmentCost(read, p, Double.NaN); + pathBases[nextPath++] = readSegmentCost.bases = eventBlockPathBases(p, includePathEnds); + pathSizes.add(readSegmentCost.bases.length); + readSegmentCosts.add(readSegmentCost); + } + + // Add the read 'path size'. + pathSizes.add(readEnd - readStart); + + final byte[] readBases = hmm.getReadBases(); + + // Perform right clipping of bases that are common to all paths and read. + int rightClipping = !doClipping ? 0 : calculateRightClipping(readEnd, pathBases, readBases,pathSizes); + + // Calculate the costs. 
+ for (final ReadSegmentCost readSegmentCost : readSegmentCosts) { + hmm.loadHaplotypeBases(readSegmentCost.bases); + readSegmentCost.setCost(hmm.calculateLocalLikelihood(Math.max(0, readStart), readEnd - rightClipping, 0, readSegmentCost.bases.length - rightClipping, false)); + if (prependVertex != null) + readSegmentCost.path = new Route<>(prependVertex,readSegmentCost.path); + if (appendVertex != null) + readSegmentCost.path = new Route<>(readSegmentCost.path,appendVertex); + addReadSegmentCost(destination,readSegmentCost); + } + + + } + + /** + * Determines how much we can clip away from the right side of a set of path without loosing accuracy when comparing + * likelihood vs the read. + * + * @param readEnd exclusive position right after the last one of the region considered. + * @param pathBases bases of possible path in the same event block. + * @param readBases full length read bases. + * @param pathSizes path size set. + * + * @return 0 or greater. + */ + private int calculateRightClipping(final int readEnd, final byte[][] pathBases, + final byte[] readBases, final CountSet pathSizes) { + final int maxClipping = pathSizes.size() > 1 ? 0 : Math.min(pathSizes.min(), kmerSize - 1); + int rightClipping = 0; + while (rightClipping < maxClipping) { + final byte readBase = readBases[readEnd - rightClipping - 1]; + boolean dontGoFurther = false; + for (int i = 0; !dontGoFurther && i < pathBases.length; i++) + if (pathBases[i][pathBases[i].length - rightClipping - 1] != readBase) + dontGoFurther = true; + if (dontGoFurther) + break; + rightClipping++; + } + return rightClipping; + } + + /** + * Calculates a graph path bases. + *

+ *

+ * When the path starts on a source vertex, all its sequence is considered as part of the path bases. For regular + * start vertices only the suffix (last) base is considered. + *

+ * + * @param path the targeted path. + * @param includePathEnds whether the bases included in the first and last vertex of the path should be included or excluded. + * @return never {@code null} but perhaps a zero-length base array if the final requested path length is zero. + */ + //TODO this method could be moved to the Path class, but require consider how to make the API more concise. + private byte[] eventBlockPathBases(final Path path, + final boolean includePathEnds) { + // We first calculate the size of the return. + final List vertices = path.getVertices(); + final boolean pathStartsAtSource = haplotypeGraph.isSource(path.getFirstVertex()); + final int resultLength = includePathEnds + ? vertices.size() + (pathStartsAtSource ? path.getFirstVertex().getSequence().length - 1 : 0) + : vertices.size() - 2; + // Trivial empty return cases: + if (resultLength <= 0) + return new byte[0]; + final byte[] result = new byte[resultLength]; + if (result.length == 0) { + return result; + } + // General return cases: + final ListIterator it = vertices.listIterator(includePathEnds ? 0 : 1); // skip the vertex (exclusive) + for (int i = 0; i < resultLength; i++) { // i < resultLength implicitly skips the last vertex (exclusive). + final MultiDeBruijnVertex vertex = it.next(); + if (i == 0 && includePathEnds && pathStartsAtSource) { + System.arraycopy(vertex.getSequence(), 0, result, 0, kmerSize); + i = kmerSize - 1; + } else + result[i] = vertex.getSuffix(); + } + return result; + } + + /** + * Calculate the path cost of dangling ends. + *

+ *

+ * A dangling end is the section of the read that falls before the left anchor or after the right anchor. + *

+ * + * @param anchoring anchoring information of the read vs the haplotype assembly graph. + * @param hmm the PairHMM engine to use to calculate likelihoods. + * @param destination cost destination. + */ + private void danglingEndPathCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm, final Map> destination) { + if (anchoring.leftAnchorIndex > 0 || anchoring.leftAnchorIndex == 0 + && anchoring.leftAnchorVertex.hasAmbiguousSequence()) + leftDanglingEndPathCosts(anchoring, hmm,destination); + + if (anchoring.rightAnchorIndex < anchoring.read.getReadLength() - kmerSize) + rightDanglingEndPathCosts(anchoring, hmm, destination); + } + + /** + * Generates all relevant right dangling end path costs. + * + * @param anchoring the anchoring information for the read under analysis. + * @param hmm pair-hmm implementation to use to calculate likelihoods. It is assumed to be loaded with + * the same read as {@code anchoring} refers to. + * @param destination where the place the resulting read-segment-costs. + */ + private void rightDanglingEndPathCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm, + final Map> destination) { + final int readStart = anchoring.rightAnchorIndex; + final int readEnd = anchoring.read.getReadLength() - kmerSize + 1; + final Set> haplotypeRoutes = + extendsHaplotypeRoutesForwards(anchoring.rightAnchorVertex); + if (haplotypeRoutes.size() >= 2) + calculateCostForPathSet(anchoring.read, + haplotypeRoutes, hmm, readStart, readEnd, false, true,anchoring.rightAnchorVertex,null,destination); + + } + + /** + * Generates all relevant left dangling end path costs. + * + * @param anchoring the anchoring information for the read under analysis. + * @param hmm pair-hmm implementation to use to calculate likelihoods. It is assumed to be loaded with + * the same read as {@code anchoring} refers to. + * @param destination where the place the resulting read-segment-costs. 
+ */ + private void leftDanglingEndPathCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm, + final Map> destination) { + final int readStart = -kmerSize; + final int readEnd = anchoring.leftAnchorIndex; + final Set> haplotypeRoutes = + extendsHaplotypeRoutesBackwards(anchoring.leftAnchorVertex); + if (haplotypeRoutes.size() >= 2) // if there is just one haplotype route there is no relevant variation in the dangling end. + calculateCostForPathSet(anchoring.read, haplotypeRoutes, hmm, + readStart, readEnd, false, true, null, anchoring.leftAnchorVertex, destination); + } + + /** + * Construct haplotype routes prefixes to an anchor vertex. + *

+ *

+ * The output should contain a route for each haplotype that includes the input anchor vertex. + * This route would be the prefix of the haplotype that finishes at that vertex. + *

+ * + * @param anchorVertex the target anchor vertex. + * @return never {@code null}. + */ + private Set> extendsHaplotypeRoutesBackwards( + final MultiDeBruijnVertex anchorVertex) { + final Set> result = new HashSet<>(haplotypes.size()); + for (final MultiDeBruijnVertex parent : haplotypeGraph.incomingVerticesOf(anchorVertex)) + extendsHaplotypeRoutesFrom(parent, result, false); + return result; + } + + /** + * Construct haplotype routes suffix from an anchor vertex. + *

+ *

+ * The output should contain a route for each haplotype that includes the input anchor vertex. + * This route would be the suffix of the haplotype that starts at that vertex. + *

+ * + * @param anchorVertex the target anchor vertex. + * @return never {@code null}. + */ + private Set> extendsHaplotypeRoutesForwards( + final MultiDeBruijnVertex anchorVertex) { + final Set> result = new HashSet<>(haplotypes.size()); + for (final MultiDeBruijnVertex parent : haplotypeGraph.outgoingVerticesOf(anchorVertex)) + extendsHaplotypeRoutesFrom(parent, result, true); + return result; + } + + /** + * Extends from a vertex considering path furcations that are part of some valid haplotype + *

+ *

+ * In other words, it will ignore subpaths that are not a valid part of an assembled haplotype. + *

+ * + * @param start start seed vertex. + * @param result destination for found extensions. + * @param forward whether to traverse edges forward or backwards. + */ + private void extendsHaplotypeRoutesFrom(final MultiDeBruijnVertex start, final Set> result, final boolean forward) { + final Set validHaplotypeRoutes = haplotypeGraph.getEnclosingHaplotypeRoutes(start); + if (validHaplotypeRoutes.size() == 0) return; + final Deque, Set>> queue = new LinkedList<>(); + queue.add(new Pair<>(new Route<>(start, haplotypeGraph), validHaplotypeRoutes)); + while (!queue.isEmpty()) { + final Pair, Set> current = queue.remove(); + final Route path = current.getFirst(); + final MultiDeBruijnVertex vertex = forward ? path.getLastVertex() : path.getFirstVertex(); + final Set validRoutes = current.getSecond(); + for (final HaplotypeRoute hr : validRoutes) { + final MultiDeBruijnVertex routeEndVertex = forward ? hr.getLastVertex() : hr.getFirstVertex(); + if (vertex.equals(routeEndVertex)) { + result.add(path); + break; + } + } + final Set nextVertices = forward ? haplotypeGraph.outgoingVerticesOf(vertex) : + haplotypeGraph.incomingVerticesOf(vertex); + for (final MultiDeBruijnVertex candidate : nextVertices) { + extendsHaplotypeRoutesFrom$ProcessCandidateExtendingVertex(forward, queue, path, validRoutes, candidate); + } + } + } + + /** + * Check on an candidate vertice to exted a path. + * + *

+ * This method updates the traversal queue accordingly. + *

+ * + * @param forward whether the extension is forward, or backwards. + * @param queue queue with open paths yet to be explored. + * @param path path extension to evaluate. + * @param validRoutes collection of valid haplotype routes used to discard non-informative extensions. + * @param candidate the candidate extending vertex. + */ + private void extendsHaplotypeRoutesFrom$ProcessCandidateExtendingVertex( + final boolean forward, + final Deque, Set>> queue, + final Route path, + final Set validRoutes, final MultiDeBruijnVertex candidate) { + final Set parentValidHaplotypes = haplotypeGraph.getEnclosingHaplotypeRoutes(candidate); + switch (parentValidHaplotypes.size()) { + case 0: + return; + case 1: + if (validRoutes.containsAll(parentValidHaplotypes)) + queue.add(new Pair<>(forward ? new Route<>(path, candidate) : new Route<>(candidate, path), parentValidHaplotypes)); + else + return; + break; + default: + if (parentValidHaplotypes.size() == validRoutes.size() && parentValidHaplotypes.containsAll(validRoutes)) { + queue.add(new Pair<>(forward ? new Route<>(path, candidate) : new Route<>(candidate, path), parentValidHaplotypes)); + } else { + final Set newValidHaplotypeRoutes = new HashSet<>(validRoutes.size()); + for (final HaplotypeRoute hr : validRoutes) + if (parentValidHaplotypes.contains(hr)) + newValidHaplotypeRoutes.add(hr); + if (newValidHaplotypeRoutes.size() == 0) + return; + queue.add(new Pair<>(forward ? new Route<>(path, candidate) : new Route<>(candidate, path), newValidHaplotypeRoutes)); + } + } + } + + public List getHaplotypeList() { + return new ArrayList<>(haplotypeGraph.getHaplotypes()); + } + + /** + * Returns the haplotype graph associated with this instance. 
+ * @return never {@code null} + */ + public HaplotypeGraph getHaplotypeGraph() { + return haplotypeGraph; + } +} \ No newline at end of file diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java new file mode 100644 index 000000000..91e763a0d --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -0,0 +1,1169 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import net.sf.samtools.SAMFileWriter; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils; +import org.broadinstitute.sting.gatk.filters.BadMateFilter; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import 
org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.fragments.FragmentCollection; +import org.broadinstitute.sting.utils.fragments.FragmentUtils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.gvcf.GVCFWriter; +import org.broadinstitute.sting.utils.haplotype.*; +import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.*; + +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.*; + +/** + * Call SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region. Haplotypes are evaluated using an affine gap penalty Pair HMM. + * + *

Input

+ *

+ * Input bam file(s) from which to make calls + *

+ * + *

Output

+ *

+ * VCF file with raw, unrecalibrated SNP and indel calls. + *

+ * + *

Examples

+ *
+ *   java
+ *     -jar GenomeAnalysisTK.jar
+ *     -T HaplotypeCaller
+ *     -R reference/human_g1k_v37.fasta
+ *     -I sample1.bam [-I sample2.bam ...] \
+ *     --dbsnp dbSNP.vcf \
+ *     -stand_call_conf [50.0] \
+ *     -stand_emit_conf 10.0 \
+ *     [-L targets.interval_list]
+ *     -o output.raw.snps.indels.vcf
+ * 
+ * + *

Caveats

+ *
 * <ul>
 *   <li>The system is under active and continuous development. All outputs, the underlying likelihood model, and command line arguments are likely to change often.</li>
 * </ul>
 *
+ * + * @author rpoplin + * @since 8/22/11 + */ + +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) +@PartitionBy(PartitionType.LOCUS) +@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) +@ActiveRegionTraversalParameters(extension=100, maxRegion=300) +@ReadFilters({HCMappingQualityFilter.class}) +@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250) +public class HaplotypeCaller extends ActiveRegionWalker, Integer> implements AnnotatorCompatible, NanoSchedulable { + // ----------------------------------------------------------------------------------------------- + // general haplotype caller arguments + // ----------------------------------------------------------------------------------------------- + + /** + * A raw, unfiltered, highly sensitive callset in VCF format. + */ + @Output(doc="File to which variants should be written") + protected VariantContextWriter vcfWriter = null; + + @Hidden + @Advanced + @Argument(fullName="likelihoodCalculationEngine",shortName="likelihoodEngine", + doc="what likelihood calculation engine to use to calculate the relative likelihood of reads vs haplotypes",required=false) + protected LikelihoodCalculationEngine.Implementation likelihoodEngineImplementation = LikelihoodCalculationEngine.Implementation.PairHMM; + + @Hidden + @Advanced + @Argument(fullName="heterogeneousKmerSizeResolution",shortName="hksr",doc="how to solve heterogeneous kmer situations using the fast method",required=false) + protected HeterogeneousKmerSizeResolution heterogeneousKmerSizeResultion = HeterogeneousKmerSizeResolution.COMBO_MIN; + + @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false, defaultToStdout = false) + protected PrintStream graphWriter = null; + + /** + * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. 
+ * Note that the output here does not include uninformative reads so that not every input read is emitted to the bam. + * + * Turning on this mode may result in serious performance cost for the HC. It's really only appropriate to + * use in specific areas where you want to better understand why the HC is making specific calls. + * + * The reads are written out containing a HC tag (integer) that encodes which haplotype each read best matches + * according to the haplotype caller's likelihood calculation. The use of this tag is primarily intended + * to allow good coloring of reads in IGV. Simply go to Color Alignments By > Tag and enter HC to more + * easily see which reads go with these haplotype. + * + * Note that the haplotypes (called or all, depending on mode) are emitted as single reads covering the entire + * active region, coming from read HC and a special read group. + * + * Note that only reads that are actually informative about the haplotypes are emitted. By informative we mean + * that there's a meaningful difference in the likelihood of the read coming from one haplotype compared to + * its next best haplotype. + * + * The best way to visualize the output of this mode is with IGV. Tell IGV to color the alignments by tag, + * and give it the HC tag, so you can see which reads support each haplotype. Finally, you can tell IGV + * to group by sample, which will separate the potential haplotypes from the reads. All of this can be seen + * in the following screenshot: https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png + * + */ + @Advanced + @Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false, defaultToStdout = false) + protected StingSAMFileWriter bamWriter = null; + private HaplotypeBAMWriter haplotypeBAMWriter; + + /** + * The type of BAM output we want to see. 
+ */ + @Advanced + @Argument(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false) + public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES; + + /** + * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. + * dbSNP is not used in any way for the calculations themselves. + */ + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + private double log10GlobalReadMismappingRate; + + /** + * Active region trimmer reference. + */ + @ArgumentCollection + protected ActiveRegionTrimmer trimmer = new ActiveRegionTrimmer(); + + public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } + + /** + * If a call overlaps with a record from the provided comp track, the INFO field will be annotated + * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). + * Records that are filtered in the comp track will be ignored. + * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). + */ + @Advanced + @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) + public List> comps = Collections.emptyList(); + public List> getCompRodBindings() { return comps; } + + // The following are not used by the Unified Genotyper + public RodBinding getSnpEffRodBinding() { return null; } + public List> getResourceRodBindings() { return Collections.emptyList(); } + public boolean alwaysAppendDbsnpId() { return false; } + + /** + * Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations. 
+ */ + @Advanced + @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) + protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"ClippingRankSumTest", "DepthPerSampleHC"})); + + /** + * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments, + * so annotations will be excluded even if they are explicitly included with the other options. + */ + @Advanced + @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) + protected List annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); + + /** + * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. + */ + @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) + protected String[] annotationClassesToUse = { "Standard" }; + + @ArgumentCollection + private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection(); + + // ----------------------------------------------------------------------------------------------- + // arguments to control internal behavior of the read threading assembler + // ----------------------------------------------------------------------------------------------- + + @Advanced + @Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false) + protected List kmerSizes = Arrays.asList(10, 25); + + @Advanced + @Argument(fullName="dontIncreaseKmerSizesForCycles", shortName="dontIncreaseKmerSizesForCycles", doc="Should we disable the iterating over kmer sizes when graph cycles are detected?", required = false) + protected boolean dontIncreaseKmerSizesForCycles = false; + + 
@Advanced + @Argument(fullName="numPruningSamples", shortName="numPruningSamples", doc="The number of samples that must pass the minPuning factor in order for the path to be kept", required = false) + protected int numPruningSamples = 1; + + /** + * This mode is currently experimental and should only be used in the RNA-seq calling pipeline. + */ + @Advanced + @Argument(fullName="recoverDanglingHeads", shortName="recoverDanglingHeads", doc="Should we enable dangling head recovery in the read threading assembler?", required = false) + protected boolean recoverDanglingHeads = false; + + @Hidden + @Argument(fullName="dontRecoverDanglingTails", shortName="dontRecoverDanglingTails", doc="Should we disable dangling tail recovery in the read threading assembler?", required = false) + protected boolean dontRecoverDanglingTails = false; + + // ----------------------------------------------------------------------------------------------- + // general advanced arguments to control haplotype caller behavior + // ----------------------------------------------------------------------------------------------- + + /** + * The reference confidence mode makes it possible to emit a per-bp or summarized confidence estimate for a site being strictly homozygous-reference. + * See http://www.broadinstitute.org/gatk/guide/article?id=2940 for more details of how this works. + * Note that if you set -ERC GVCF, you also need to set -variant_index_type LINEAR and -variant_index_parameter 128000 (with those exact values!). + * This requirement is a temporary workaround for an issue with index compression. 
+ */ + @Advanced + @Argument(fullName="emitRefConfidence", shortName="ERC", doc="Mode for emitting experimental reference confidence scores", required = false) + protected ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE; + + public enum ReferenceConfidenceMode { + NONE, + BP_RESOLUTION, + GVCF + } + + /** + * The GQ partition intervals + * + * Should be a non-empty list of boundaries. For example, suppose this variable is + * + * [A, B, C] + * + * We would partition our hom-ref sites into the following bands: + * + * X < A + * A <= X < B + * B <= X < C + * X >= C + * + * The default bands with (1, 10, 20, 30, 40, 50) give the following GQ blocks: + * + * [0, 0] + * (0, 10] + * (10, 20] + * (20, 30] + * (30, 40] + * (40, 50] + * (50, 99] + * + * Note that in the GATK GQ values are capped at 99. + */ + @Advanced + @Argument(fullName="GVCFGQBands", shortName="GQB", doc="Emit experimental reference confidence scores", required = false) + protected List GVCFGQBands = Arrays.asList(5, 20, 60); + + /** + * This parameter determines the maximum size of an indel considered as potentially segregating in the + * reference model. It is used to eliminate reads from being indel informative at a site, and determines + * by that mechanism the certainty in the reference base. Conceptually, setting this parameter to + * X means that each informative read is consistent with any indel of size < X being present at a specific + * position in the genome, given its alignment to the reference. 
+ */ + @Advanced + @Argument(fullName="indelSizeToEliminateInRefModel", shortName="ERCIS", doc="The size of an indel to check for in the reference model", required = false) + protected int indelSizeToEliminateInRefModel = 10; + + // ----------------------------------------------------------------------------------------------- + // general advanced arguments to control haplotype caller behavior + // ----------------------------------------------------------------------------------------------- + + /** + * The minimum confidence needed for a given base for it to be used in variant calling. + */ + @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) + public byte MIN_BASE_QUALTY_SCORE = 10; + + /** + * Users should be aware that this argument can really affect the results of the variant calling and should exercise caution. + * Using a prune factor of 1 (or below) will prevent any pruning from the graph which is generally not ideal; it can make the + * calling much slower and even less accurate (because it can prevent effective merging of "tails" in the graph). Higher values + * tend to make the calling much faster, but also lowers the sensitivity of the results (because it ultimately requires higher + * depth to produce calls). + */ + @Advanced + @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with < X supporting kmers are pruned from the graph", required = false) + protected int MIN_PRUNE_FACTOR = 2; + + @Advanced + @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) + protected int gcpHMM = 10; + + /** + * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling + * when these reads occur in the region being analyzed. 
Typically, for paired end analyses, one pair of the + * read can map, but if its pair is too divergent then it may be unmapped and placed next to its mate, taking + * the mates contig and alignment start. If this flag is provided the haplotype caller will see such reads, + * and may make use of them in assembly and calling, where possible. + */ + @Hidden + @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false) + protected boolean includeUnmappedReads = false; + + @Advanced + @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false) + protected boolean USE_ALLELES_TRIGGER = false; + + @Advanced + @Argument(fullName="useFilteredReadsForAnnotations", shortName="useFilteredReadsForAnnotations", doc = "If specified, use the contamination-filtered read maps for the purposes of annotating variants", required=false) + protected boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = false; + + /** + * The phredScaledGlobalReadMismappingRate reflects the average global mismapping rate of all reads, regardless of their + * mapping quality. This term effects the probability that a read originated from the reference haplotype, regardless of + * its edit distance from the reference, in that the read could have originated from the reference haplotype but + * from another location in the genome. Suppose a read has many mismatches from the reference, say like 5, but + * has a very high mapping quality of 60. Without this parameter, the read would contribute 5 * Q30 evidence + * in favor of its 5 mismatch haplotype compared to reference, potentially enough to make a call off that single + * read for all of these events. 
With this parameter set to Q30, though, the maximum evidence against the reference + * that this (and any) read could contribute against reference is Q30. + * + * Set this term to any negative number to turn off the global mapping rate + */ + @Advanced + @Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false) + protected int phredScaledGlobalReadMismappingRate = 45; + + /** + * Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype + * considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the + * run of the haplotype caller we only take maxNumHaplotypesInPopulation paths from the graph, in order of their + * weights, no matter how many paths are possible to generate from the graph. Putting this number too low + * will result in dropping true variation because paths that include the real variant are not even considered. + */ + @Advanced + @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false) + protected int maxNumHaplotypesInPopulation = 128; + + @Advanced + @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false) + protected boolean mergeVariantsViaLD = false; + + // ----------------------------------------------------------------------------------------------- + // arguments for debugging / developing the haplotype caller + // ----------------------------------------------------------------------------------------------- + /** + * The PairHMM implementation to use for genotype likelihood calculations. 
The various implementations balance a tradeoff of accuracy and runtime. + */ + @Hidden + @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; + + @Hidden + @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) + protected String keepRG = null; + + @Hidden + @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false) + protected boolean justDetermineActiveRegions = false; + + @Hidden + @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false) + protected boolean dontGenotype = false; + + @Hidden + @Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. 
May cause fundamental problems with the assembly graph itself", required=false) + protected boolean errorCorrectKmers = false; + + @Advanced + @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) + protected boolean DEBUG; + + @Hidden + @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) + protected boolean debugGraphTransformations = false; + + @Advanced + @Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false) + protected boolean dontUseSoftClippedBases = false; + + @Hidden + @Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false) + protected boolean captureAssemblyFailureBAM = false; + + @Hidden + @Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false) + protected boolean allowCyclesInKmerGraphToGeneratePaths = false; + + @Hidden + @Argument(fullName="noFpga", shortName="noFpga", doc="If provided, disables the use of the FPGA HMM implementation", required = false) + protected boolean noFpga = false; + + // Parameters to control read error correction + @Hidden + @Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. 
May cause fundamental problems with the assembly graph itself", required=false) + protected boolean errorCorrectReads = false; + + @Hidden + @Argument(fullName="kmerLengthForReadErrorCorrection", shortName="kmerLengthForReadErrorCorrection", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false) + protected int kmerLengthForReadErrorCorrection = 25; + + @Hidden + @Argument(fullName="minObservationsForKmerToBeSolid", shortName="minObservationsForKmerToBeSolid", doc = "A k-mer must be seen at least these times for it considered to be solid", required=false) + protected int minObservationsForKmerToBeSolid = 20; + + /** + * Which PCR indel error model should we use when calculating likelihoods? If NONE is selected, then the default base + * insertion/deletion qualities will be used (or taken from the read if generated through the BaseRecalibrator). + * VERY IMPORTANT: when using PCR-free sequencing data we definitely recommend setting this argument to NONE. 
+ */ + @Advanced + @Argument(fullName = "pcr_indel_model", shortName = "pcrModel", doc = "The PCR indel model to use", required = false) + public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE; + + // ----------------------------------------------------------------------------------------------- + // done with Haplotype caller parameters + // ----------------------------------------------------------------------------------------------- + + // the UG engines + private UnifiedGenotyperEngine UG_engine = null; + private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; + + // the assembly engine + private LocalAssemblyEngine assemblyEngine = null; + + // the likelihoods engine + private LikelihoodCalculationEngine likelihoodCalculationEngine = null; + + // the genotyping engine + private GenotypingEngine genotypingEngine = null; + + // fasta reference reader to supplement the edges of the reference sequence + protected CachingIndexedFastaSequenceFile referenceReader; + + // reference base padding size + private static final int REFERENCE_PADDING = 500; + + private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument + private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument + + private byte MIN_TAIL_QUALITY; + private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; + + // the minimum length of a read we'd consider using for genotyping + private final static int MIN_READ_LENGTH = 10; + + private List samplesList = new ArrayList<>(); + + private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file + private final static Allele FAKE_ALT_ALLELE = Allele.create("", false); // used in isActive function to call into UG Engine. 
Should never appear anywhere in a VCF file + + ReferenceConfidenceModel referenceConfidenceModel = null; + + // as determined experimentally Nov-Dec 2013 + public final static GATKVCFIndexType OPTIMAL_GVCF_INDEX_TYPE = GATKVCFIndexType.LINEAR; + public final static int OPTIMAL_GVCF_INDEX_PARAMETER = 128000; + + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- + + public void initialize() { + super.initialize(); + + if (dontGenotype && emitReferenceConfidence == ReferenceConfidenceMode.GVCF) + throw new UserException("You cannot request gVCF output and do not genotype at the same time"); + + if ( emitReferenceConfidence() ) { + SCAC.STANDARD_CONFIDENCE_FOR_EMITTING = -0.0; + SCAC.STANDARD_CONFIDENCE_FOR_CALLING = -0.0; + + // also, we don't need to output several of the annotations + annotationsToExclude.add("ChromosomeCounts"); + annotationsToExclude.add("FisherStrand"); + annotationsToExclude.add("QualByDepth"); + + // but we definitely want certain other ones + annotationsToUse.add("StrandBiasBySample"); + logger.info("Standard Emitting and Calling confidence set to 0.0 for reference-model confidence output"); + } + + if ( SCAC.AFmodel == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) + throw new UserException.BadArgumentValue("pnrm", "HaplotypeCaller doesn't currently support " + SCAC.AFmodel); + + // get all of the unique sample names + Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + samplesList.addAll( samples ); + // initialize the UnifiedGenotyper Engine which is used to call into the exact model + final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user + // HC GGA mode depends critically on 
EMIT_ALL_SITES being set for the UG engine + UAC.OutputMode = SCAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES + ? UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES : UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); + + if (emitReferenceConfidence() && !UG_engine.getUAC().annotateAllSitesWithPLs) { + UG_engine.getUAC().annotateAllSitesWithPLs = true; + logger.info("All sites annotated with PLs force to true for reference-model confidence output"); + } + // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested + UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); + simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; + simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; + simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.CONTAMINATION_FRACTION = 0.0; + simpleUAC.CONTAMINATION_FRACTION_FILE = null; + simpleUAC.exactCallsLog = null; + UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); + + if( UAC.CONTAMINATION_FRACTION_FILE != null ) { + UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, samples, logger)); + } + + // initialize the output VCF header + final VariantAnnotatorEngine 
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); + + Set headerInfo = new HashSet<>(); + + // all annotation fields from VariantAnnotatorEngine + headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); + // all callers need to add these standard annotation header lines + VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true, + VCFConstants.DOWNSAMPLED_KEY, + VCFConstants.MLE_ALLELE_COUNT_KEY, + VCFConstants.MLE_ALLELE_FREQUENCY_KEY); + // all callers need to add these standard FORMAT field header lines + VCFStandardHeaderLines.addStandardFormatLines(headerInfo, true, + VCFConstants.GENOTYPE_KEY, + VCFConstants.GENOTYPE_QUALITY_KEY, + VCFConstants.DEPTH_KEY, + VCFConstants.GENOTYPE_PL_KEY); + + // FILTER fields are added unconditionally as it's not always 100% certain the circumstances + // where the filters are used. For example, in emitting all sites the lowQual field is used + headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality")); + + initializeReferenceConfidenceModel(samples, headerInfo); + + vcfWriter.writeHeader(new VCFHeader(headerInfo, samples)); + + try { + // fasta reference reader to supplement the edges of the reference sequence + referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); + } catch( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); + } + + // create and setup the assembler + assemblyEngine = new ReadThreadingAssembler(maxNumHaplotypesInPopulation, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples); + + assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); + assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR); + assemblyEngine.setDebug(DEBUG); + assemblyEngine.setDebugGraphTransformations(debugGraphTransformations); + 
assemblyEngine.setAllowCyclesInKmerGraphToGeneratePaths(allowCyclesInKmerGraphToGeneratePaths); + assemblyEngine.setRecoverDanglingTails(!dontRecoverDanglingTails); + assemblyEngine.setRecoverDanglingHeads(recoverDanglingHeads); + assemblyEngine.setMinBaseQualityToUseInAssembly(MIN_BASE_QUALTY_SCORE); + + MIN_TAIL_QUALITY = (byte)(MIN_BASE_QUALTY_SCORE - 1); + + if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter); + + // setup the likelihood calculation engine + if ( phredScaledGlobalReadMismappingRate < 0 ) phredScaledGlobalReadMismappingRate = -1; + + // configure the global mismapping rate + if ( phredScaledGlobalReadMismappingRate < 0 ) { + log10GlobalReadMismappingRate = - Double.MAX_VALUE; + } else { + log10GlobalReadMismappingRate = QualityUtils.qualToErrorProbLog10(phredScaledGlobalReadMismappingRate); + logger.info("Using global mismapping rate of " + phredScaledGlobalReadMismappingRate + " => " + log10GlobalReadMismappingRate + " in log10 likelihood units"); + } + + // create our likelihood calculation engine + likelihoodCalculationEngine = createLikelihoodCalculationEngine(); + + final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? 
new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes(); + + genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, variantMerger ); + + if ( bamWriter != null ) { + // we currently do not support multi-threaded BAM writing, so exception out + if ( getToolkit().getTotalNumberOfThreads() > 1 ) + throw new UserException.BadArgumentValue("bamout", "Currently cannot emit a BAM file from the HaplotypeCaller in multi-threaded mode."); + haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader()); + } + + trimmer.initialize(getToolkit().getGenomeLocParser(), DEBUG, + UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES,emitReferenceConfidence()); + } + + private void initializeReferenceConfidenceModel(final Set samples, final Set headerInfo) { + referenceConfidenceModel = new ReferenceConfidenceModel(getToolkit().getGenomeLocParser(), samples, getToolkit().getSAMFileHeader(), indelSizeToEliminateInRefModel); + if ( emitReferenceConfidence() ) { + if ( samples.size() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently"); + headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines()); + if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) { + // a kluge to enforce the use of this indexing strategy + if (getToolkit().getArguments().variant_index_type != OPTIMAL_GVCF_INDEX_TYPE || + getToolkit().getArguments().variant_index_parameter != OPTIMAL_GVCF_INDEX_PARAMETER) { + throw new UserException.GVCFIndexException(OPTIMAL_GVCF_INDEX_TYPE, OPTIMAL_GVCF_INDEX_PARAMETER); + } + + try { + vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands); + } catch ( IllegalArgumentException e ) { + throw new UserException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage()); + } + } + } + } + + /** + * Instantiates the appropriate likelihood calculation engine. 
+ * + * @return never {@code null}. + */ + private LikelihoodCalculationEngine createLikelihoodCalculationEngine() { + switch (likelihoodEngineImplementation) { + case PairHMM: + return new PairHMMLikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel ); + case GraphBased: + return new GraphBasedLikelihoodCalculationEngine( (byte)gcpHMM,log10GlobalReadMismappingRate,heterogeneousKmerSizeResultion,DEBUG,debugGraphTransformations); + case Random: + return new RandomLikelihoodCalculationEngine(); + default: + //Note: we do not include in the error message list as it is of no grand public interest. + throw new UserException("Unsupported likelihood calculation engine '" + likelihoodCalculationEngine + + "'. Please use one of the following instead: 'PairHMM' and 'GraphBased'."); + } + } + + //--------------------------------------------------------------------------------------------------------------- + // + // isActive + // + //--------------------------------------------------------------------------------------------------------------- + + // enable deletions in the pileup + @Override + public boolean includeReadsWithDeletionAtLoci() { return true; } + + // enable non primary and extended reads in the active region + @Override + public EnumSet desiredReadStates() { + if ( includeUnmappedReads ) { + throw new UserException.BadArgumentValue("includeUnmappedReads", "is not yet functional"); +// return EnumSet.of( +// ActiveRegionReadState.PRIMARY, +// ActiveRegionReadState.NONPRIMARY, +// ActiveRegionReadState.EXTENDED, +// ActiveRegionReadState.UNMAPPED +// ); + } else + return EnumSet.of( + ActiveRegionReadState.PRIMARY, + ActiveRegionReadState.NONPRIMARY, + ActiveRegionReadState.EXTENDED + ); + } + + @Override + @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"}) + public ActivityProfileState isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext 
context ) { + + if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + final VariantContext vcFromAllelesRod = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), false, logger, UG_engine.getUAC().alleles); + if( vcFromAllelesRod != null ) { + return new ActivityProfileState(ref.getLocus(), 1.0); + } + } + + if( USE_ALLELES_TRIGGER ) { + return new ActivityProfileState( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); + } + + if( context == null || context.getBasePileup().isEmpty() ) + // if we don't have any data, just abort early + return new ActivityProfileState(ref.getLocus(), 0.0); + + final List noCall = Collections.singletonList(Allele.NO_CALL); // used to noCall all genotypes until the exact model is applied + final Map splitContexts = AlignmentContextUtils.splitContextBySampleName(context); + final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size()); + final MathUtils.RunningAverage averageHQSoftClips = new MathUtils.RunningAverage(); + for( final Map.Entry sample : splitContexts.entrySet() ) { + final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sample.getValue().getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, averageHQSoftClips).genotypeLikelihoods; + genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() ); + } + + final List alleles = Arrays.asList(FAKE_REF_ALLELE , FAKE_ALT_ALLELE); + final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.SNP); + final double isActiveProb = vcOut == null ? 
0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ); + + return new ActivityProfileState( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileState.Type.NONE, averageHQSoftClips.mean() ); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // map + // + //--------------------------------------------------------------------------------------------------------------- + + private final static List NO_CALLS = Collections.emptyList(); + @Override + public List map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) { + if ( justDetermineActiveRegions ) + // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work + return NO_CALLS; + + if( !originalActiveRegion.isActive() ) + // Not active so nothing to do! + return referenceModelForNoVariation(originalActiveRegion, true); + + final List activeAllelesToGenotype = new ArrayList<>(); + if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + for ( final VariantContext vc : metaDataTracker.getValues(UG_engine.getUAC().alleles) ) { + if ( vc.isNotFiltered() ) { + activeAllelesToGenotype.add(vc); // do something with these VCs during GGA mode + } + } + // No alleles found in this region so nothing to do! + if ( activeAllelesToGenotype.isEmpty() ) { return referenceModelForNoVariation(originalActiveRegion, true); } + } else { + // No reads here so nothing to do! 
+ if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); } + } + + // run the local assembler, getting back a collection of information on how we should proceed + final AssemblyResultSet untrimmedAssemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype); + + final TreeSet allVariationEvents = untrimmedAssemblyResult.getVariationEvents(); + // TODO - line below might be unnecessary : it might be that assemblyResult will always have those alleles anyway + // TODO - so check and remove if that is the case: + allVariationEvents.addAll(activeAllelesToGenotype); + + final ActiveRegionTrimmer.Result trimmingResult = trimmer.trim(originalActiveRegion,allVariationEvents); + + if (!trimmingResult.isVariationPresent()) + return referenceModelForNoVariation(originalActiveRegion,false); + + final AssemblyResultSet assemblyResult = + trimmingResult.needsTrimming() ? untrimmedAssemblyResult.trimTo(trimmingResult.getCallableRegion()) : untrimmedAssemblyResult; + + final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping(); + + // filter out reads from genotyping which fail mapping quality based criteria + //TODO - why not do this before any assembly is done? Why not just once at the beginning of this method + //TODO - on the originalActiveRegion? + //TODO - if you move this up you might have to consider to change referenceModelForNoVariation + //TODO - that does also filter reads. + final Collection filteredReads = filterNonPassingReads( regionForGenotyping ); + final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); + + // abort early if something is out of the acceptable range + // TODO is this ever true at this point??? perhaps GGA. Need to check. + if( ! assemblyResult.isVariationPresent() ) + return referenceModelForNoVariation(originalActiveRegion, false); + + // For sure this is not true if gVCF is on. 
+ if (dontGenotype) return NO_CALLS; // user requested we not proceed + + + // TODO is this ever true at this point??? perhaps GGA. Need to check. + if( regionForGenotyping.size() == 0 ) { + // no reads remain after filtering so nothing else to do! + return referenceModelForNoVariation(originalActiveRegion, false); + } + + // evaluate each sample's reads against all haplotypes + //logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads"); + final List haplotypes = assemblyResult.getHaplotypeList(); + final Map> reads = splitReadsBySample( regionForGenotyping.getReads() ); + + // Calculate the likelihoods: CPU intesive part. + final Map stratifiedReadMap = + likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,reads); + + // Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there + // was a bad interaction between that selection and the marginalization that happens over each event when computing + // GLs. In particular, for samples that are heterozygous non-reference (B/C) the marginalization for B treats the + // haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included + // in the genotyping, but we lose information if we select down to a few haplotypes. [EB] + + final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine, + haplotypes, + stratifiedReadMap, + perSampleFilteredReadList, + assemblyResult.getFullReferenceWithPadding(), + assemblyResult.getPaddedReferenceLoc(), + regionForGenotyping.getLocation(), + getToolkit().getGenomeLocParser(), + metaDataTracker, + activeAllelesToGenotype, emitReferenceConfidence() ); + + // TODO -- must disable if we are doing NCT, or set the output type of ! 
presorted + if ( bamWriter != null ) { + haplotypeBAMWriter.writeReadsAlignedToHaplotypes( + haplotypes, + assemblyResult.getPaddedReferenceLoc(), + haplotypes, + calledHaplotypes.getCalledHaplotypes(), + stratifiedReadMap); + } + + if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } + + + if ( emitReferenceConfidence() ) { + if ( !containsCalls(calledHaplotypes) ) { + // no called all of the potential haplotypes + return referenceModelForNoVariation(originalActiveRegion, false); + } else { + final List result = new LinkedList<>(); + // output left-flanking non-variant section: + if (trimmingResult.hasLeftFlankingRegion()) + result.addAll(referenceModelForNoVariation(trimmingResult.nonVariantLeftFlankRegion(),false)); + // output variant containing region. + result.addAll(referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(), + calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping, + stratifiedReadMap, calledHaplotypes.getCalls())); + // output right-flanking non-variant section: + if (trimmingResult.hasRightFlankingRegion()) + result.addAll(referenceModelForNoVariation(trimmingResult.nonVariantRightFlankRegion(),false)); + return result; + } + } else { + return calledHaplotypes.getCalls(); + } + } + + private boolean containsCalls(final GenotypingEngine.CalledHaplotypes calledHaplotypes) { + final List calls = calledHaplotypes.getCalls(); + if (calls.isEmpty()) return false; + for (final VariantContext call : calls) + for (final Genotype genotype : call.getGenotypes()) + if (genotype.isCalled()) + return true; + return false; + } + + /** + * High-level function that runs the assembler on the active region reads, + * returning a data structure with the resulting information needed + * for further HC steps + * + * @param activeRegion the region we should assemble + * @param activeAllelesToGenotype additional alleles we might 
need to genotype (can be empty) + * @return the AssemblyResult describing how to proceed with genotyping + */ + protected AssemblyResultSet assembleReads(final ActiveRegion activeRegion, final List activeAllelesToGenotype) { + // Create the reference haplotype which is the bases from the reference that make up the active region + finalizeActiveRegion(activeRegion); // handle overlapping fragments, clip adapter and low qual tails + + final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING); + final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); + final Haplotype referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc); + + // Create ReadErrorCorrector object if requested - will be used within assembly engine. + ReadErrorCorrector readErrorCorrector = null; + if (errorCorrectReads) + readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, DEBUG, fullReferenceWithPadding); + + try { + final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector ); + assemblyResultSet.debugDump(logger); + return assemblyResultSet; + + } catch ( final Exception e ) { + // Capture any exception that might be thrown, and write out the assembly failure BAM if requested + if ( captureAssemblyFailureBAM ) { + final SAMFileWriter writer = ReadUtils.createSAMFileWriterWithCompression(getToolkit().getSAMFileHeader(), true, "assemblyFailure.bam", 5); + for ( final GATKSAMRecord read : activeRegion.getReads() ) { + writer.addAlignment(read); + } + writer.close(); + } + throw e; + } + } + + /** + * Helper function to create the reference haplotype out of the active region and a padded loc + * @param activeRegion the active region from which to generate the reference haplotype + 
* @param paddedReferenceLoc the GenomeLoc which includes padding and shows how big the reference haplotype should be + * @return a non-null haplotype + */ + private Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final GenomeLoc paddedReferenceLoc) { + return ReferenceConfidenceModel.createReferenceHaplotype(activeRegion, activeRegion.getActiveRegionReference(referenceReader), paddedReferenceLoc); + } + + /** + * Create a ref model result (ref model or no calls depending on mode) for an active region without any variation + * (is not active, or assembled to just ref) + * + * @param region the region to return a no-variation result + * @param needsToBeFinalized should the region be finalized before computing the ref model (should be false if already done) + * @return a list of variant contexts (can be empty) to emit for this ref region + */ + private List referenceModelForNoVariation(final ActiveRegion region, final boolean needsToBeFinalized) { + if ( emitReferenceConfidence() ) { + //TODO - why the activeRegion cannot manage its own one-time finalization and filtering? + //TODO - perhaps we can remove the last parameter of this method and the three lines below? 
+ if ( needsToBeFinalized ) + finalizeActiveRegion(region); + filterNonPassingReads(region); + + final GenomeLoc paddedLoc = region.getExtendedLoc(); + final Haplotype refHaplotype = createReferenceHaplotype(region, paddedLoc); + final List haplotypes = Collections.singletonList(refHaplotype); + return referenceConfidenceModel.calculateRefConfidence(refHaplotype, haplotypes, + paddedLoc, region, createDummyStratifiedReadMap(refHaplotype, samplesList, region), + Collections.emptyList()); + } else + return NO_CALLS; + } + + /** + * Create a context that maps each read to the reference haplotype with log10 L of 0 + * @param refHaplotype a non-null reference haplotype + * @param samples a list of all samples + * @param region the active region containing reads + * @return a map from sample -> PerReadAlleleLikelihoodMap that maps each read to ref + */ + public static Map createDummyStratifiedReadMap(final Haplotype refHaplotype, + final List samples, + final ActiveRegion region) { + final Allele refAllele = Allele.create(refHaplotype, true); + + final Map map = new LinkedHashMap<>(1); + for ( final Map.Entry> entry : splitReadsBySample(samples, region.getReads()).entrySet() ) { + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + for ( final GATKSAMRecord read : entry.getValue() ) { + likelihoodMap.add(read, refAllele, 0.0); + } + map.put(entry.getKey(), likelihoodMap); + } + + return map; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // reduce + // + //--------------------------------------------------------------------------------------------------------------- + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(List callsInRegion, Integer numCalledRegions) { + for( final VariantContext call : callsInRegion ) { + vcfWriter.add( call ); + } + return (callsInRegion.isEmpty() ? 
0 : 1) + numCalledRegions; + } + + @Override + public void onTraversalDone(Integer result) { + if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it + referenceConfidenceModel.close(); + //TODO remove the need to call close here for debugging, the likelihood output stream should be managed + //TODO (open & close) at the walker, not the engine. + //likelihoodCalculationEngine.close(); + logger.info("Ran local assembly on " + result + " active regions"); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // private helper functions + // + //--------------------------------------------------------------------------------------------------------------- + + private void finalizeActiveRegion( final ActiveRegion activeRegion ) { + if (activeRegion.isFinalized()) return; + + if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } + + // Loop through the reads hard clipping the adaptor and low quality tails + final List readsToUse = new ArrayList<>(activeRegion.getReads().size()); + for( final GATKSAMRecord myRead : activeRegion.getReads() ) { + GATKSAMRecord clippedRead; + if (errorCorrectReads) + clippedRead = ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION ); + else // default case: clip low qual ends of reads + clippedRead= ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY ); + + if ( dontUseSoftClippedBases || ! 
ReadUtils.hasWellDefinedFragmentSize(clippedRead) ) { + // remove soft clips if we cannot reliably clip off adapter sequence or if the user doesn't want to use soft clips at all + clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead); + } else { + // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches + // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't + // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion + // TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the + // TODO -- reference haplotype start must be removed + clippedRead = ReadClipper.revertSoftClippedBases(clippedRead); + } + + clippedRead = ( clippedRead.getReadUnmappedFlag() ? clippedRead : ReadClipper.hardClipAdaptorSequence( clippedRead ) ); + if( !clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0 ) { + clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() ); + if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { + //logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd()); + readsToUse.add(clippedRead); + } + } + } + + // TODO -- Performance optimization: we partition the reads by sample 4 times right now; let's unify that code. 
+ + final List downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart); + + // handle overlapping read pairs from the same fragment + cleanOverlappingReadPairs(downsampledReads); + + activeRegion.clearReads(); + activeRegion.addAll(downsampledReads); + activeRegion.setFinalized(true); + } + + private Set filterNonPassingReads( final ActiveRegion activeRegion ) { + final Set readsToRemove = new LinkedHashSet<>(); + for( final GATKSAMRecord rec : activeRegion.getReads() ) { + if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { + readsToRemove.add(rec); + } + } + activeRegion.removeAll( readsToRemove ); + return readsToRemove; + } + + private GenomeLoc getPaddedLoc( final ActiveRegion activeRegion ) { + final int padLeft = Math.max(activeRegion.getExtendedLoc().getStart()-REFERENCE_PADDING, 1); + final int padRight = Math.min(activeRegion.getExtendedLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getExtendedLoc().getContig()).getSequenceLength()); + return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getExtendedLoc().getContig(), padLeft, padRight); + } + + private Map> splitReadsBySample( final Collection reads ) { + return splitReadsBySample(samplesList, reads); + } + + public static Map> splitReadsBySample( final List samplesList, final Collection reads ) { + final Map> returnMap = new HashMap<>(); + for( final String sample : samplesList) { + List readList = returnMap.get( sample ); + if( readList == null ) { + readList = new ArrayList<>(); + returnMap.put(sample, readList); + } + } + for( final GATKSAMRecord read : reads ) { + returnMap.get(read.getReadGroup().getSample()).add(read); + } + + return returnMap; + } + + /** + * Are we emitting a reference confidence in 
some form, or not? + * @return true if we are + */ + private boolean emitReferenceConfidence(){ + return emitReferenceConfidence != ReferenceConfidenceMode.NONE; + } + + /** + * Clean up reads/bases that overlap within read pairs + * + * @param reads the list of reads to consider + */ + private void cleanOverlappingReadPairs(final List reads) { + for ( final List perSampleReadList : splitReadsBySample(reads).values() ) { + final FragmentCollection fragmentCollection = FragmentUtils.create(perSampleReadList); + for ( final List overlappingPair : fragmentCollection.getOverlappingPairs() ) + FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair); + } + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java new file mode 100644 index 000000000..cfd07da67 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java @@ -0,0 +1,467 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.variant.vcf.VCFHeaderLine; +import org.broadinstitute.variant.vcf.VCFHeaderLineType; +import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.variant.variantcontext.VariantContextUtils; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; + +import java.util.*; + +/** + * Haplotype-based resolution of variants in 2 different eval files. + * + *

+ * HaplotypeResolver is a tool that takes 2 VCF files and constructs haplotypes based on the variants inside them. + * From that, it can resolve calls that differ between the two files but actually represent the same (or similar) underlying variants. + * Records are annotated with the set and status attributes. + * + *

Input

+ *

+ * 2 variant files to resolve. + *

+ * + *

Output

+ *

+ * A single consensus VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx1g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T HaplotypeResolver \
+ *   -V:v1 input1.vcf \
+ *   -V:v2 input2.vcf \
+ *   -o output.vcf
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=-HaplotypeResolver.ACTIVE_WINDOW,stop= HaplotypeResolver.ACTIVE_WINDOW)) +public class HaplotypeResolver extends RodWalker { + + protected static final String INTERSECTION_SET = "intersection"; + protected static final String SAME_STATUS = "same"; + protected static final String SOME_ALLELES_MATCH_STATUS = "someAllelesMatch"; + protected static final String SAME_START_DIFFERENT_ALLELES_STATUS = "sameStartDifferentAlleles"; + protected static final String SAME_BY_HAPLOTYPE_STATUS = "sameByHaplotype"; + protected static final String ONE_ALLELE_SUBSET_OF_OTHER_STATUS = "OneAlleleSubsetOfOther"; + protected static final String OVERLAPPING_EVENTS_STATUS = "overlappingEvents"; + + protected final static int MAX_DISTANCE_BETWEEN_MERGED_RECORDS = 50; + protected final static int MAX_HAPLOTYPE_TO_CONSIDER = 1000; + protected final static int MAX_VARIANT_SIZE_TO_CONSIDER = 100; + protected final static int ACTIVE_WINDOW = MAX_HAPLOTYPE_TO_CONSIDER + MAX_VARIANT_SIZE_TO_CONSIDER; + + @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) + public List> variants; + + @Output(doc="File to which variants should be written") + protected VariantContextWriter baseWriter = null; + private VariantContextWriter writer; + + /** + * Set to 'null' if you don't want the set field emitted. + */ + @Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false) + protected String SET_KEY = "set"; + + /** + * Set to 'null' if you don't want the status field emitted. 
+ */ + @Argument(fullName="statusKey", shortName="statusKey", doc="Key used in the INFO key=value tag emitted describing the extent to which records match", required=false) + protected String STATUS_KEY = "status"; + + private final LinkedList queue = new LinkedList(); + private String source1, source2; + private final List sourceVCs1 = new ArrayList(); + private final List sourceVCs2 = new ArrayList(); + + + private class VCcontext { + public final Collection vcs; + public final GenomeLoc loc; + public final ReferenceContext ref; + + public VCcontext(final Collection vcs, final ReferenceContext ref) { + this.vcs = vcs; + this.loc = getToolkit().getGenomeLocParser().createGenomeLoc(vcs.iterator().next()); + this.ref = ref; + } + } + + public void initialize() { + + if ( variants.size() != 2 ) { + throw new UserException.BadArgumentValue("variant", "this tool requires exactly 2 input variant files"); + } + source1 = variants.get(0).getName(); + source2 = variants.get(1).getName(); + + if ( SET_KEY.toLowerCase().equals("null") ) + SET_KEY = null; + if ( STATUS_KEY.toLowerCase().equals("null") ) + STATUS_KEY = null; + + // for now, INFO and FORMAT fields are not propagated to the output VCF (so they aren't put into the header) + Set headerLines = new HashSet(); + if ( SET_KEY != null ) + headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record")); + if ( STATUS_KEY != null ) + headerLines.add(new VCFInfoHeaderLine(STATUS_KEY, 1, VCFHeaderLineType.String, "Extent to which records match")); + final VCFHeader vcfHeader = new VCFHeader(headerLines, Collections.emptySet()); + baseWriter.writeHeader(vcfHeader); + writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, ACTIVE_WINDOW); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + final Collection VCs = tracker.getValues(variants, context.getLocation()); + if ( 
VCs.size() == 0 ) + return 0; + + final VCcontext vc = new VCcontext(VariantContextUtils.sitesOnlyVariantContexts(VCs), ref); + + // TODO -- what should we do about filtered records? + + if ( !queue.isEmpty() ) { + + final VCcontext previous = queue.getLast(); + if ( !previous.loc.onSameContig(vc.loc) || + previous.loc.distance(vc.loc) > MAX_DISTANCE_BETWEEN_MERGED_RECORDS || + queue.getFirst().loc.distance(vc.loc) > MAX_HAPLOTYPE_TO_CONSIDER ) { + purgeQueue(); + } + } + + queue.addLast(vc); + return 0; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + if ( !queue.isEmpty() ) + purgeQueue(); + writer.close(); + } + + private void purgeQueue() { + + final ReferenceContext refContext = queue.getFirst().ref; + + // divide them up by source + while ( !queue.isEmpty() ) { + VCcontext context = queue.removeFirst(); + for ( final VariantContext vc: context.vcs ) { + if ( vc.getSource().equals(source1) ) + sourceVCs1.add(vc); + else + sourceVCs2.add(vc); + } + } + + writeAndPurgeAllEqualVariants(sourceVCs1, sourceVCs2, SAME_STATUS); + + if ( sourceVCs1.isEmpty() ) { + writeAll(sourceVCs2, source2, null); + } else if ( sourceVCs2.isEmpty() ) { + writeAll(sourceVCs1, source1, null); + } else { + resolveByHaplotype(refContext); + } + + // allow for GC of the data + sourceVCs1.clear(); + sourceVCs2.clear(); + } + + private void writeAll(final List sourceVCs, final String set, final String status) { + for ( final VariantContext vc : sourceVCs ) { + writeOne(vc, set, status); + } + } + + private void writeOne(final VariantContext vc, final String set, final String status) { + final Map attrs = new HashMap<>(); + if ( SET_KEY != null && set != null ) + attrs.put(SET_KEY, set); + if ( STATUS_KEY != null && status != null ) + attrs.put(STATUS_KEY, status); + writer.add(new VariantContextBuilder(vc).attributes(attrs).make()); + } + + private void 
writeAndPurgeAllEqualVariants(final List sourceVCs1, final List sourceVCs2, final String status) { + + int currentIndex1 = 0, currentIndex2 = 0; + int size1 = sourceVCs1.size(), size2 = sourceVCs2.size(); + VariantContext current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null); + VariantContext current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null); + + while ( current1 != null && current2 != null ) { + + final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1); + final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2); + + if ( loc1.equals(loc2) || + (loc1.getStart() == loc2.getStart() && (current1.getAlternateAlleles().size() > 1 || current2.getAlternateAlleles().size() > 1)) ) { + // test the alleles + if ( determineAndWriteOverlap(current1, current2, status) ) { + sourceVCs1.remove(currentIndex1); + sourceVCs2.remove(currentIndex2); + size1--; + size2--; + } else { + currentIndex1++; + currentIndex2++; + } + current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null); + current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null); + } else if ( loc1.isBefore(loc2) ) { + currentIndex1++; + current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null); + } else { + currentIndex2++; + current2 = (currentIndex2 < size2 ? 
sourceVCs2.get(currentIndex2): null); + } + } + } + + private boolean determineAndWriteOverlap(final VariantContext vc1, final VariantContext vc2, final String status) { + final int allelesFrom1In2 = findOverlap(vc1, vc2); + final int allelesFrom2In1 = findOverlap(vc2, vc1); + final int totalAllelesIn1 = vc1.getAlternateAlleles().size(); + final int totalAllelesIn2 = vc2.getAlternateAlleles().size(); + + final boolean allAllelesFrom1Overlap = allelesFrom1In2 == totalAllelesIn1; + final boolean allAllelesFrom2Overlap = allelesFrom2In1 == totalAllelesIn2; + + boolean thereIsOverlap = true; + + if ( allAllelesFrom1Overlap && allAllelesFrom2Overlap ) { + writeOne(vc1, INTERSECTION_SET, status); + } else if ( allAllelesFrom1Overlap ) { + writeOne(vc2, INTERSECTION_SET, source1 + "IsSubsetOf" + source2); + } else if ( allAllelesFrom2Overlap ) { + writeOne(vc1, INTERSECTION_SET, source2 + "IsSubsetOf" + source1); + } else if ( allelesFrom1In2 > 0 ) { + writeOne(vc1, INTERSECTION_SET, SOME_ALLELES_MATCH_STATUS); + } else if ( totalAllelesIn1 > 1 || totalAllelesIn2 > 1 ) { // we don't handle multi-allelics in the haplotype-based reconstruction + writeOne(vc1, INTERSECTION_SET, SAME_START_DIFFERENT_ALLELES_STATUS); + } else { + thereIsOverlap = false; + } + + return thereIsOverlap; + } + + private static int findOverlap(final VariantContext target, final VariantContext comparison) { + int overlap = 0; + for ( final Allele allele : target.getAlternateAlleles() ) { + if ( comparison.hasAlternateAllele(allele) ) + overlap++; + } + return overlap; + } + + private static final double SW_MATCH = 4.0; + private static final double SW_MISMATCH = -10.0; + private static final double SW_GAP = -25.0; + private static final double SW_GAP_EXTEND = -1.3; + private void resolveByHaplotype(final ReferenceContext refContext) { + + final byte[] source1Haplotype = generateHaplotype(sourceVCs1, refContext); + final byte[] source2Haplotype = generateHaplotype(sourceVCs2, refContext); + + final 
SWPairwiseAlignment swConsensus1 = new SWPairwiseAlignment( refContext.getBases(), source1Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); + final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( refContext.getBases(), source2Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); + + // protect against SW failures + if( swConsensus1.getCigar().toString().contains("S") || swConsensus1.getCigar().getReferenceLength() < 20 || + swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() < 20 ) { + // TODO -- handle errors appropriately + logger.debug("Bad SW alignment; aborting at " + refContext.getLocus()); + return; + } + + // order results by start position + final TreeMap source1Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype, false, 0, swConsensus1.getCigar()), refContext.getBases(), refContext.getWindow(), source1)); + final TreeMap source2Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype, false, 0, swConsensus2.getCigar()), refContext.getBases(), refContext.getWindow(), source2)); + if ( source1Map.size() == 0 || source2Map.size() == 0 ) { + // TODO -- handle errors appropriately + logger.debug("No source alleles; aborting at " + refContext.getLocus()); + return; + } + + // create lists and test for equality + final List source1Alleles = new ArrayList(source1Map.values()); + final List source2Alleles = new ArrayList(source2Map.values()); + + writeAndPurgeAllEqualVariants(source1Alleles, source2Alleles, SAME_BY_HAPLOTYPE_STATUS); + if ( source1Alleles.isEmpty() ) { + writeAll(source2Alleles, source2, null); + } else if ( source2Alleles.isEmpty() ) { + writeAll(source1Alleles, source1, null); + } else { + writeDifferences(source1Alleles, source2Alleles); + } + } + + private byte[] generateHaplotype(final List sourceVCs, final ReferenceContext refContext) { + + final StringBuilder sb = new StringBuilder(); + + final int 
startPos = refContext.getWindow().getStart(); + int currentPos = startPos; + final byte[] reference = refContext.getBases(); + + for ( final VariantContext vc : sourceVCs ) { + // add any missing reference context + int vcStart = vc.getStart(); + final int refAlleleLength = vc.getReference().length(); + if ( refAlleleLength == vc.getEnd() - vc.getStart() ) // this is a deletion (whereas for other events the padding base isn't part of the position) + vcStart++; + + while ( currentPos < vcStart ) + sb.append((char)reference[currentPos++ - startPos]); + + // add the alt allele + sb.append(vc.getAlternateAllele(0).getBaseString()); + + // skip the reference allele + currentPos += refAlleleLength; + } + // add any missing reference context + final int stopPos = refContext.getWindow().getStop(); + while ( currentPos < stopPos ) + sb.append((char)reference[currentPos++ - startPos]); + + return sb.toString().getBytes(); + } + + private void writeDifferences(final List source1Alleles, final List source2Alleles) { + int currentIndex1 = 0, currentIndex2 = 0; + final int size1 = source1Alleles.size(), size2 = source2Alleles.size(); + VariantContext current1 = source1Alleles.get(0); + VariantContext current2 = source2Alleles.get(0); + + while ( currentIndex1 < size1 || currentIndex2 < size2 ) { + if ( current1 == null ) { + writeOne(current2, source2, null); + currentIndex2++; + current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null); + } else if ( current2 == null ) { + writeOne(current1, source1, null); + currentIndex1++; + current1 = (currentIndex1 < size1 ? 
source1Alleles.get(currentIndex1): null); + } else { + + final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1); + final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2); + + if ( loc1.getStart() == loc2.getStart() || loc1.overlapsP(loc2) ) { + String status; + if ( loc1.getStart() == loc2.getStart() ) { + final String allele1 = current1.getAlternateAllele(0).getBaseString(); + final String allele2 = current2.getAlternateAllele(0).getBaseString(); + if ( allele1.indexOf(allele2) != -1 || allele2.indexOf(allele1) != -1 ) + status = ONE_ALLELE_SUBSET_OF_OTHER_STATUS; + else + status = SAME_START_DIFFERENT_ALLELES_STATUS; + } else { + status = OVERLAPPING_EVENTS_STATUS; + } + + writeOne(current1, INTERSECTION_SET, status); + currentIndex1++; + currentIndex2++; + current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null); + current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null); + } else if ( loc1.isBefore(loc2) ) { + writeOne(current1, source1, null); + currentIndex1++; + current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null); + } else { + writeOne(current2, source2, null); + currentIndex2++; + current2 = (currentIndex2 < size2 ? 
source2Alleles.get(currentIndex2): null); + } + } + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeRoute.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeRoute.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeRoute.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeRoute.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HeterogeneousKmerSizeResolution.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HeterogeneousKmerSizeResolution.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HeterogeneousKmerSizeResolution.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HeterogeneousKmerSizeResolution.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java diff --git 
a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequence.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequence.java new file mode 100644 index 000000000..102562504 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequence.java @@ -0,0 +1,452 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.haplotype.Haplotype; + +import java.lang.reflect.Array; +import java.util.*; + +/** + * Represent a sequence of kmers where any two consecutive kmers overlap in kmer length - 1 elements. 
+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.com> + */ +public class KmerSequence implements List { + private final byte[] sequence; + private final int start; + private final int size; + private final int kmerSize; + private final int rawLength; + + /** + * Creates a kmer sequence from a read's sequence. + * + * @param read the read to represent as a sequence of kmers. + * @param kmerSize the kmer size. + */ + public KmerSequence(final SAMRecord read, final int kmerSize) { + this(read.getReadBases(), kmerSize); + } + + /** + * Creates a kmer sequence from a haplotype's sequence. + * + * @param hap the haplotype to represent as a sequence of kmers. + * @param kmerSize the kmer size. + */ + @SuppressWarnings("unused") + public KmerSequence(final Haplotype hap, final int kmerSize) { + this(hap.getBases(), kmerSize); + } + + /** + * Creates a kmer sequence out of a byte sequence. + * + * @param sequence the byte array to represent as a kmer sequence. + * @param kmerSize the kmer size. + */ + public KmerSequence(final byte[] sequence, final int kmerSize) { + this(sequence,0,Math.max(0,sequence.length - kmerSize + 1),kmerSize, sequence.length); + } + + /** + * Creates a kmer sequence out of a range of a byte array + * + * @param sequence the input array. + * @param start inclusive first position of the array that maps to the first position in the first kmer. + * @param size number kmers in the output. + * @param kmerSize kmer length in bases. + * @param rawLength the of the range in bases. 
+ */ + protected KmerSequence(final byte[] sequence, final int start, final int size, final int kmerSize, final int rawLength) { + if (sequence == null) { + throw new IllegalArgumentException("start must be 0 or greater"); + } + if (rawLength > sequence.length - start) { + throw new IllegalArgumentException("the raw sequence length goes beyond the array capacity"); + } + if (size < 0) { + throw new IllegalArgumentException("the length cannot be negative"); + } + if (start < 0) { + throw new IllegalArgumentException("start must be 0 or greater"); + } + if (size > 0 && size + kmerSize - 1 > rawLength) { + throw new IllegalArgumentException( + String.format("the kmerSize (%d) + size (%d) - 1 cannot be larger than rawLength (%d)",kmerSize,size,rawLength) ); + } + this.sequence = sequence; + this.start = start; + this.size = size; + this.kmerSize = kmerSize; + this.rawLength = rawLength; + } + + public int kmerSize() { + return kmerSize; + } + + public KmerSequence subsequence(final int from, final int to) { + if (from < 0 || from > to) { + throw new IllegalArgumentException(); + } + if (to > size) { + throw new IllegalArgumentException(); + } + return new KmerSequence(sequence,this.start + from,to - from,kmerSize,rawLength - from - (size - to)); + } + + + @Override + public int size() { + return size; + } + + @Override + public boolean isEmpty() { + return size == 0; + } + + @Override + public boolean contains(final Object o) { + if (o instanceof Kmer) { + if (o instanceof MyKmer) { + final MyKmer k = (MyKmer) o; + if (k.bases == sequence && k.start >= start && k.length == kmerSize && k.start < start + size) { + return true; + } + } + final Kmer k = (Kmer) o; + if (k.length != kmerSize) { + return false; + } + for (int i = 0; i < size; i++) { + int j; + for (j = 0; j < kmerSize; j++) { + if (sequence[start + i + j] != k.bases[k.start + j]) { + break; + } + } + if (j == kmerSize) { + return true; + } + } + return false; + } else { + return false; + } + } + + @Override + 
public Iterator iterator() { + return new Iterator() { + + private int offset = 0; + + @Override + public boolean hasNext() { + return offset < size; + } + + @Override + public Kmer next() { + return new Kmer(sequence,start + offset,kmerSize); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + @Override + public Object[] toArray() { + return toArray(new Kmer[size()]); + } + + @Override + @SuppressWarnings("unchecked") + public T[] toArray(final T[] a) { + if (a == null) { + throw new IllegalArgumentException(); + } else if (!a.getClass().getComponentType().isAssignableFrom(Kmer.class)) { + throw new IllegalArgumentException(); + } else { + T[] result; + if (a.length < size) { + result = (T[]) Array.newInstance(a.getClass().getComponentType(), size); + } else { + result = a; + } + for (int i = 0; i < size; i++) { + result[i] = (T) new Kmer(sequence,start + i,kmerSize); + } + return result; + } + } + + @Override + public boolean add(final Kmer kmer) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean remove(final Object o) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean containsAll(final Collection c) { + for (final Object o : c) + if (!contains(o)) + return false; + return true; + } + + @Override + public boolean addAll(final Collection c) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean addAll(final int index, final Collection c) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean removeAll(final Collection c) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean retainAll(final Collection c) { + throw new UnsupportedOperationException(); + } + + @Override + public void clear() { + throw new UnsupportedOperationException(); + } + + @Override + public Kmer get(final int index) { + if (index < 0 || index >= size) { + throw new IllegalArgumentException(); + } + 
return new Kmer(sequence,start + index,kmerSize); + } + + @Override + public Kmer set(final int index, final Kmer element) { + throw new UnsupportedOperationException(); + } + + @Override + public void add(final int index, final Kmer element) { + throw new UnsupportedOperationException(); + } + + @Override + public Kmer remove(final int index) { + throw new UnsupportedOperationException(); + } + + @Override + public int indexOf(final Object o) { + if (o instanceof Kmer) { + final Kmer k = (Kmer) o; + if (k.length != kmerSize) { + return -1; + } + for (int i = 0; i < size; i++) { + int j; + for (j = 0; j < kmerSize; j++) { + if (sequence[start + i + j] != k.bases[k.start + j]) { + break; + } + } + if (j == kmerSize) { + return i; + } + } + return -1; + } else { + return -1; + } + } + + @Override + public int lastIndexOf(final Object o) { + if (o instanceof Kmer) { + final Kmer k = (Kmer) o; + if (k.length != kmerSize) { + return -1; + } + for (int i = size - 1; i >= 0; i--) { + int j; + for (j = kmerSize - 1; j >= 0; j--) { + if (sequence[start + i + j] != k.bases[k.start + j]) { + break; + } + } + if (j == 0) { + return i; + } + } + return -1; + } else { + return -1; + } + } + + @Override + public ListIterator listIterator() { + return new MyListIterator(0); + } + + @Override + public ListIterator listIterator(final int index) { + return new MyListIterator(index); + } + + @Override + public List subList(final int fromIndex, final int toIndex) { + return subsequence(fromIndex,toIndex); + } + + /** + * Returns the byte array representation of the kmer sequence. + * @return never {@code null}. + */ + public byte[] getBytes() { + if (start == 0 && rawLength == sequence.length) + return sequence; + else + return Arrays.copyOfRange(sequence, start, rawLength + start); + } + + /** + * Internal class that implements the {@link Kmer} more efficiently + * making reference to the sequence's own byte array. 
+ */ + protected class MyKmer extends Kmer { + + /** + * Create a new instance give the offset in the byte array. + * @param start the start base offset for the kmer. + */ + public MyKmer(final int start) { + super(sequence,start,kmerSize); + } + } + + /** + * Iterator implementation of Kmer elements. + */ + private class MyListIterator implements ListIterator { + + private int i = 0; + + /** + * Creates a iterator at certain offset in the sequence. + * @param idx the start position or kmer offset. + */ + private MyListIterator(final int idx) { + i = idx; + } + + @Override + public boolean hasNext() { + return i < size; + } + + @Override + public Kmer next() { + return new Kmer(sequence,start + i++,kmerSize); + } + + @Override + public boolean hasPrevious() { + return i > 0; + } + + @Override + public Kmer previous() { + return new Kmer(sequence,start + --i,kmerSize); + } + + @Override + public int nextIndex() { + return i; + } + + @Override + public int previousIndex() { + return i - 1; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + @Override + public void set(final Kmer kmer) { + throw new UnsupportedOperationException(); + } + + @Override + public void add(final Kmer kmer) { + throw new UnsupportedOperationException(); + } + + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequenceGraphMap.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequenceGraphMap.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequenceGraphMap.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequenceGraphMap.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java new file mode 100644 index 000000000..8dfeed987 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -0,0 +1,469 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.CigarUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.io.File; +import java.io.PrintStream; +import java.util.*; + +/** + * Abstract base class for all HaplotypeCaller assemblers + * + * User: ebanks + * Date: Mar 14, 2011 + */ +public abstract class LocalAssemblyEngine { + private final static Logger logger = Logger.getLogger(LocalAssemblyEngine.class); + + /** + * If false, we will only write out a region around the reference source + */ + private final static boolean PRINT_FULL_GRAPH_FOR_DEBUGGING = true; + public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 10; + private static final int MIN_HAPLOTYPE_REFERENCE_LENGTH = 30; + + protected final int numBestHaplotypesPerGraph; + + protected boolean debug = false; + protected boolean allowCyclesInKmerGraphToGeneratePaths = false; + protected boolean debugGraphTransformations = false; + protected boolean recoverDanglingTails = true; + protected boolean recoverDanglingHeads = true; + + protected byte minBaseQualityToUseInAssembly = DEFAULT_MIN_BASE_QUALITY_TO_USE; + protected int pruneFactor = 2; + protected boolean errorCorrectKmers = false; + + private PrintStream graphWriter = null; + + /** + * Create a new LocalAssemblyEngine with all default parameters, ready for use + 
* @param numBestHaplotypesPerGraph the number of haplotypes to generate for each assembled graph + */ + protected LocalAssemblyEngine(final int numBestHaplotypesPerGraph) { + if ( numBestHaplotypesPerGraph < 1 ) throw new IllegalArgumentException("numBestHaplotypesPerGraph should be >= 1 but got " + numBestHaplotypesPerGraph); + this.numBestHaplotypesPerGraph = numBestHaplotypesPerGraph; + } + + /** + * Main subclass function: given reads and a reference haplotype give us graphs to use for constructing + * non-reference haplotypes. + * + * @param reads the reads we're going to assemble + * @param refHaplotype the reference haplotype + * @return a non-null list of reads + */ + protected abstract List assemble(List reads, Haplotype refHaplotype, List activeAlleleHaplotypes); + + protected List assemble(List reads, Haplotype refHaplotype) { + return assemble(reads, refHaplotype, Collections.emptyList()); + } + + /** + * Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads + * @param activeRegion ActiveRegion object holding the reads which are to be used during assembly + * @param refHaplotype reference haplotype object + * @param fullReferenceWithPadding byte array holding the reference sequence with padding + * @param refLoc GenomeLoc object corresponding to the reference sequence with padding + * @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode + * @param readErrorCorrector a ReadErrorCorrector object, if read are to be corrected before assembly. Can be null if no error corrector is to be used. 
+ * @return the resulting assembly-result-set + */ + public AssemblyResultSet runLocalAssembly(final ActiveRegion activeRegion, + final Haplotype refHaplotype, + final byte[] fullReferenceWithPadding, + final GenomeLoc refLoc, + final List activeAllelesToGenotype, + final ReadErrorCorrector readErrorCorrector) { + if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); } + if( activeRegion.getExtendedLoc() == null ) { throw new IllegalArgumentException("Active region must have an extended location."); } + if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); } + if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); } + if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); } + + // create the list of artificial haplotypes that should be added to the graph for GGA mode + final List activeAlleleHaplotypes = createActiveAlleleHaplotypes(refHaplotype, activeAllelesToGenotype, activeRegion.getExtendedLoc()); + + // error-correct reads before clipping low-quality tails: some low quality bases might be good and we want to recover them + final List correctedReads; + if (readErrorCorrector != null) { + // now correct all reads in active region after filtering/downsampling + // Note that original reads in active region are NOT modified by default, since they will be used later for GL computation, + // and we only want the read-error corrected reads for graph building. 
+ readErrorCorrector.addReadsToKmers(activeRegion.getReads()); + correctedReads = new ArrayList<>(readErrorCorrector.correctReads(activeRegion.getReads())); + } else { + correctedReads = activeRegion.getReads(); + } + + final List nonRefGraphs = new LinkedList<>(); + final AssemblyResultSet resultSet = new AssemblyResultSet(); + resultSet.setRegionForGenotyping(activeRegion); + resultSet.setFullReferenceWithPadding(fullReferenceWithPadding); + resultSet.setPaddedReferenceLoc(refLoc); + final GenomeLoc activeRegionExtendedLocation = activeRegion.getExtendedLoc(); + refHaplotype.setGenomeLocation(activeRegionExtendedLocation); + resultSet.add(refHaplotype); + final Map assemblyResultByGraph = new HashMap<>(); + // create the graphs by calling our subclass assemble method + for ( final AssemblyResult result : assemble(correctedReads, refHaplotype, activeAlleleHaplotypes) ) { + if ( result.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION ) { + // do some QC on the graph + sanityCheckGraph(result.getGraph(), refHaplotype); + // add it to graphs with meaningful non-reference features + assemblyResultByGraph.put(result.getGraph(),result); + nonRefGraphs.add(result.getGraph()); + } + + } + + findBestPaths (nonRefGraphs, refHaplotype, refLoc, activeRegionExtendedLocation, assemblyResultByGraph, resultSet); + + // print the graphs if the appropriate debug option has been turned on + if ( graphWriter != null ) { printGraphs(nonRefGraphs); } + + return resultSet; + } + + /** + * Create the list of artificial GGA-mode haplotypes by injecting each of the provided alternate alleles into the reference haplotype + * @param refHaplotype the reference haplotype + * @param activeAllelesToGenotype the list of alternate alleles in VariantContexts + * @param activeRegionWindow the window containing the reference haplotype + * @return a non-null list of haplotypes + */ + private List createActiveAlleleHaplotypes(final Haplotype refHaplotype, final List 
activeAllelesToGenotype, final GenomeLoc activeRegionWindow) { + final Set returnHaplotypes = new LinkedHashSet<>(); + final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); + + for( final VariantContext compVC : activeAllelesToGenotype ) { + for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { + final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()); + if( insertedRefHaplotype != null ) { // can be null if the requested allele can't be inserted into the haplotype + returnHaplotypes.add(insertedRefHaplotype); + } + } + } + + return new ArrayList<>(returnHaplotypes); + } + + + @Ensures({"result.contains(refHaplotype)"}) + protected List findBestPaths(final List graphs, final Haplotype refHaplotype, final GenomeLoc refLoc, final GenomeLoc activeRegionWindow, + final Map assemblyResultByGraph, final AssemblyResultSet assemblyResultSet) { + // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes + final Set returnHaplotypes = new LinkedHashSet<>(); + returnHaplotypes.add( refHaplotype ); + + final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); + + for( final SeqGraph graph : graphs ) { + final SeqVertex source = graph.getReferenceSourceVertex(); + final SeqVertex sink = graph.getReferenceSinkVertex(); + if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph); + final KBestHaplotypeFinder haplotypeFinder = new KBestHaplotypeFinder(graph,source,sink); + final Iterator bestHaplotypes = haplotypeFinder.iterator(numBestHaplotypesPerGraph); + while (bestHaplotypes.hasNext()) { + final KBestHaplotype kBestHaplotype = bestHaplotypes.next(); + final Haplotype h = kBestHaplotype.haplotype(); + if( 
!returnHaplotypes.contains(h) ) { + final Cigar cigar = CigarUtils.calculateCigar(refHaplotype.getBases(),h.getBases()); + + if ( cigar == null ) { + // couldn't produce a meaningful alignment of haplotype to reference, fail quietly + continue; + } else if( cigar.isEmpty() ) { + throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() + + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength()); + } else if ( pathIsTooDivergentFromReference(cigar) || cigar.getReferenceLength() < MIN_HAPLOTYPE_REFERENCE_LENGTH ) { + // N cigar elements means that a bubble was too divergent from the reference so skip over this path + continue; + } else if( cigar.getReferenceLength() != refHaplotype.getCigar().getReferenceLength() ) { // SW failure + throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength() + + " ref = " + refHaplotype + " path " + new String(h.getBases())); + } + + h.setCigar(cigar); + h.setAlignmentStartHapwrtRef(activeRegionStart); + h.setGenomeLocation(activeRegionWindow); + returnHaplotypes.add(h); + assemblyResultSet.add(h, assemblyResultByGraph.get(graph)); + + if ( debug ) + logger.info("Adding haplotype " + h.getCigar() + " from graph with kmer " + graph.getKmerSize()); + } + } + } + + + if ( returnHaplotypes.size() < returnHaplotypes.size() ) + logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc); + + if( debug ) { + if( returnHaplotypes.size() > 1 ) { + logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); + } else { + logger.info("Found only the 
reference haplotype in the assembly graph."); + } + for( final Haplotype h : returnHaplotypes ) { + logger.info( h.toString() ); + logger.info( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() + " ref " + h.isReference()); + } + } + + return new ArrayList<>(returnHaplotypes); + + } + /** + * We use CigarOperator.N as the signal that an incomplete or too divergent bubble was found during bubble traversal + * @param c the cigar to test + * @return true if we should skip over this path + */ + @Requires("c != null") + private boolean pathIsTooDivergentFromReference( final Cigar c ) { + for( final CigarElement ce : c.getCigarElements() ) { + if( ce.getOperator().equals(CigarOperator.N) ) { + return true; + } + } + return false; + } + + /** + * Print graph to file if debugGraphTransformations is enabled + * @param graph the graph to print + * @param file the destination file + */ + protected void printDebugGraphTransform(final BaseGraph graph, final File file) { + if ( debugGraphTransformations ) { + if ( PRINT_FULL_GRAPH_FOR_DEBUGGING ) + graph.printGraph(file, pruneFactor); + else + graph.subsetToRefSource().printGraph(file, pruneFactor); + } + } + + protected AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph) { + printDebugGraphTransform(seqGraph, new File("sequenceGraph.1.dot")); + + // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive + seqGraph.zipLinearChains(); + printDebugGraphTransform(seqGraph, new File("sequenceGraph.2.zipped.dot")); + + // now go through and prune the graph, removing vertices no longer connected to the reference chain + seqGraph.removeSingletonOrphanVertices(); + seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); + + printDebugGraphTransform(seqGraph, new File("sequenceGraph.3.pruned.dot")); + seqGraph.simplifyGraph(); + printDebugGraphTransform(seqGraph, new File("sequenceGraph.4.merged.dot")); + + // The graph has degenerated 
in some way, so the reference source and/or sink cannot be id'd. Can + // happen in cases where for example the reference somehow manages to acquire a cycle, or + // where the entire assembly collapses back into the reference sequence. + if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null ) + return new AssemblyResult(AssemblyResult.Status.JUST_ASSEMBLED_REFERENCE, seqGraph); + + seqGraph.removePathsNotConnectedToRef(); + seqGraph.simplifyGraph(); + if ( seqGraph.vertexSet().size() == 1 ) { + // we've perfectly assembled into a single reference haplotype, add a empty seq vertex to stop + // the code from blowing up. + // TODO -- ref properties should really be on the vertices, not the graph itself + final SeqVertex complete = seqGraph.vertexSet().iterator().next(); + final SeqVertex dummy = new SeqVertex(""); + seqGraph.addVertex(dummy); + seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0)); + } + printDebugGraphTransform(seqGraph, new File("sequenceGraph.5.final.dot")); + return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, seqGraph); + } + + /** + * Perform general QC on the graph to make sure something hasn't gone wrong during assembly + * @param graph the graph to check + * @param refHaplotype the reference haplotype + */ + private void sanityCheckGraph(final BaseGraph graph, final Haplotype refHaplotype) { + sanityCheckReferenceGraph(graph, refHaplotype); + } + + /** + * Make sure the reference sequence is properly represented in the provided graph + * + * @param graph the graph to check + * @param refHaplotype the reference haplotype + */ + private void sanityCheckReferenceGraph(final BaseGraph graph, final Haplotype refHaplotype) { + if( graph.getReferenceSourceVertex() == null ) { + throw new IllegalStateException("All reference graphs must have a reference source vertex."); + } + if( graph.getReferenceSinkVertex() == null ) { + throw new IllegalStateException("All reference graphs must 
have a reference sink vertex."); + } + if( !Arrays.equals(graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true), refHaplotype.getBases()) ) { + throw new IllegalStateException("Mismatch between the reference haplotype and the reference assembly graph path. for graph " + graph + + " graph = " + new String(graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true)) + + " haplotype = " + new String(refHaplotype.getBases()) + ); + } + } + + /** + * Print the generated graphs to the graphWriter + * @param graphs a non-null list of graphs to print out + */ + private void printGraphs(final List graphs) { + final int writeFirstGraphWithSizeSmallerThan = 50; + + graphWriter.println("digraph assemblyGraphs {"); + for( final SeqGraph graph : graphs ) { + if ( debugGraphTransformations && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) { + logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize()); + continue; + } + + graph.printGraph(graphWriter, false, pruneFactor); + + if ( debugGraphTransformations ) + break; + } + + graphWriter.println("}"); + } + + // ----------------------------------------------------------------------------------------------- + // + // getter / setter routines for generic assembler properties + // + // ----------------------------------------------------------------------------------------------- + + public int getPruneFactor() { + return pruneFactor; + } + + public void setPruneFactor(int pruneFactor) { + this.pruneFactor = pruneFactor; + } + + public boolean shouldErrorCorrectKmers() { + return errorCorrectKmers; + } + + public void setErrorCorrectKmers(boolean errorCorrectKmers) { + this.errorCorrectKmers = errorCorrectKmers; + } + + public void setGraphWriter(PrintStream graphWriter) { + this.graphWriter = graphWriter; + } + + public byte getMinBaseQualityToUseInAssembly() { + return minBaseQualityToUseInAssembly; + } + + 
public void setMinBaseQualityToUseInAssembly(byte minBaseQualityToUseInAssembly) { + this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly; + } + + public boolean isDebug() { + return debug; + } + + public void setDebug(boolean debug) { + this.debug = debug; + } + + public boolean isAllowCyclesInKmerGraphToGeneratePaths() { + return allowCyclesInKmerGraphToGeneratePaths; + } + + public void setAllowCyclesInKmerGraphToGeneratePaths(boolean allowCyclesInKmerGraphToGeneratePaths) { + this.allowCyclesInKmerGraphToGeneratePaths = allowCyclesInKmerGraphToGeneratePaths; + } + + public boolean isDebugGraphTransformations() { + return debugGraphTransformations; + } + + public void setDebugGraphTransformations(boolean debugGraphTransformations) { + this.debugGraphTransformations = debugGraphTransformations; + } + + public boolean isRecoverDanglingTails() { + return recoverDanglingTails; + } + + public void setRecoverDanglingTails(boolean recoverDanglingTails) { + this.recoverDanglingTails = recoverDanglingTails; + } + + public boolean isRecoverDanglingHeads() { + return recoverDanglingHeads; + } + + public void setRecoverDanglingHeads(boolean recoverDanglingHeads) { + this.recoverDanglingHeads = recoverDanglingHeads; + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java new file mode 100644 index 000000000..55a1c5dba --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java @@ -0,0 +1,620 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pairhmm.*; +import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate; +import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.*; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.util.*; + +public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculationEngine { + private final static Logger logger = Logger.getLogger(PairHMMLikelihoodCalculationEngine.class); + + public static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value 
are squashed down to min possible qual + + private final byte constantGCP; + private final double log10globalReadMismappingRate; + private final boolean DEBUG; + + private final PairHMM.HMM_IMPLEMENTATION hmmType; + private final boolean noFpga; + + private final ThreadLocal pairHMMThreadLocal = new ThreadLocal() { + @Override + protected PairHMM initialValue() { + switch (hmmType) { + case EXACT: return new Log10PairHMM(true); + case ORIGINAL: return new Log10PairHMM(false); + case LOGLESS_CACHING: + if (noFpga || !CnyPairHMM.isAvailable()) + return new LoglessPairHMM(); + else + return new CnyPairHMM(); + case ARRAY_LOGLESS: + if (noFpga || !CnyPairHMM.isAvailable()) + return new ArrayLoglessPairHMM(); + else + return new CnyPairHMM(); + default: + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, LOGLESS_CACHING, and ARRAY_LOGLESS."); + } + } + }; +// Attempted to do as below, to avoid calling pairHMMThreadLocal.get() later on, but it resulted in a NullPointerException +// private final PairHMM pairHMM = pairHMMThreadLocal.get(); + + private final static boolean WRITE_LIKELIHOODS_TO_FILE = false; + private final static String LIKELIHOODS_FILENAME = "likelihoods.txt"; + private final PrintStream likelihoodsStream; + + public enum PCR_ERROR_MODEL { + /** no specialized PCR error model will be applied; if base insertion/deletion qualities are present they will be used */ + NONE, + /** a more aggressive model will be applied that sacrifices true positives in order to remove more false positives */ + AGGRESSIVE, + /** a less aggressive model will be applied that tries to maintain a high true positive rate at the expense of allowing more false positives */ + CONSERVATIVE + } + + private final PCR_ERROR_MODEL pcrErrorModel; + + /** + * The expected rate of random sequencing errors for a read originating from its true haplotype. 
+ * + * For example, if this is 0.01, then we'd expect 1 error per 100 bp. + */ + private final static double EXPECTED_ERROR_RATE_PER_BASE = 0.02; + + /** + * Create a new PairHMMLikelihoodCalculationEngine using provided parameters and hmm to do its calculations + * + * @param constantGCP the gap continuation penalty to use with the PairHMM + * @param debug should we emit debugging information during the calculation? + * @param hmmType the type of the HMM to use + * @param log10globalReadMismappingRate the global mismapping probability, in log10(prob) units. A value of + * -3 means that the chance that a read doesn't actually belong at this + * location in the genome is 1 in 1000. The effect of this parameter is + * to cap the maximum likelihood difference between the reference haplotype + * and the best alternative haplotype by -3 log units. So if the best + * haplotype is at -10 and this parameter has a value of -3 then even if the + * reference haplotype gets a score of -100 from the pairhmm it will be + * assigned a likelihood of -13. 
+ * @param noFpga disable FPGA acceleration + */ + public PairHMMLikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate, final boolean noFpga, final PCR_ERROR_MODEL pcrErrorModel ) { + this.hmmType = hmmType; + this.constantGCP = constantGCP; + this.DEBUG = debug; + this.log10globalReadMismappingRate = log10globalReadMismappingRate; + this.noFpga = noFpga; + this.pcrErrorModel = pcrErrorModel; + + initializePCRErrorModel(); + + if ( WRITE_LIKELIHOODS_TO_FILE ) { + try { + likelihoodsStream = new PrintStream(new FileOutputStream(new File(LIKELIHOODS_FILENAME))); + } catch ( FileNotFoundException e ) { + throw new RuntimeException(e); + } + } else { + likelihoodsStream = null; + } + } + + public void close() { + if ( likelihoodsStream != null ) likelihoodsStream.close(); + } + + private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){ + if ( WRITE_LIKELIHOODS_TO_FILE ) { + likelihoodsStream.printf("%s %s %s %s %s %s %f%n", + haplotype.getBaseString(), + new String(processedRead.getReadBases() ), + SAMUtils.phredToFastq(processedRead.getBaseQualities() ), + SAMUtils.phredToFastq(processedRead.getBaseInsertionQualities() ), + SAMUtils.phredToFastq(processedRead.getBaseDeletionQualities() ), + SAMUtils.phredToFastq(constantGCP), + log10l); + } + } + + private Map createAlleleMap(List haplotypes){ + final int numHaplotypes = haplotypes.size(); + final Map alleleMap = new LinkedHashMap<>(numHaplotypes); + for ( final Haplotype haplotype : haplotypes ) { + final Allele allele = Allele.create(haplotype, true); + alleleMap.put(allele, haplotype); + } + return alleleMap; + } + + private Map fillGCPArrays(List reads){ + final Map GCPArrayMap = new LinkedHashMap<>(); + for (GATKSAMRecord read: reads){ + byte [] GCPArray = new byte[read.getReadBases().length]; + Arrays.fill( GCPArray, constantGCP ); // Is there a 
way to derive empirical estimates for this from the data? + GCPArrayMap.put(read, GCPArray); + } + return GCPArrayMap; + } + + private void capMinimumReadQualities(GATKSAMRecord read, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals) { + for( int kkk = 0; kkk < readQuals.length; kkk++ ) { + readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG + readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); + readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] ); + readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] ); + } + } + + /** + * Pre-processing of the reads to be evaluated at the current location from the current sample. + * We apply the PCR Error Model, and cap the minimum base, insertion, and deletion qualities of each read. + * Modified copies of reads are packed into a new list, while original reads are retained for downstream use + * + * @param reads The original list of unmodified reads + * @return processedReads. 
A new list of reads, in the same order, whose qualities have been altered by PCR error model and minimal quality thresholding + */ + private List modifyReadQualities(final List reads) { + List processedReads = new LinkedList<>(); + for ( GATKSAMRecord read : reads ) { + + final byte[] readBases = read.getReadBases(); + + // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read + final byte[] readQuals = read.getBaseQualities().clone(); + final byte[] readInsQuals = read.getBaseInsertionQualities().clone(); + final byte[] readDelQuals = read.getBaseDeletionQualities().clone(); + + applyPCRErrorModel(readBases, readInsQuals, readDelQuals); + capMinimumReadQualities(read, readQuals, readInsQuals, readDelQuals); + + // Create a new copy of the read and sets its base qualities to the modified versions. + // Pack this into a new list for return + final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, readInsQuals, readDelQuals); + processedReads.add(processedRead); + } + return processedReads; + } + + /** + * Post-processing of the read/allele likelihoods. + * + * We send quality-capped reads to the pairHMM for evaluation, and it returns a map containing these capped reads. + * We wish to return a map containing the original, unmodified reads. + * + * At the same time, we want to effectively set a lower cap on the reference score, based on the global mis-mapping rate. + * This protects us from the case where the assembly has produced haplotypes + * that are very divergent from reference, but are supported by only one read. In effect + * we capping how badly scoring the reference can be for any read by the chance that the read + * itself just doesn't belong here + * + * @param perReadAlleleLikelihoodMap the original map returned by the PairHMM. 
Contains the processed reads, the haplotype Alleles, and their log10ls + * @param reads Our original, unmodified reads + * @param processedReads Reads whose minimum base,insertion,deletion qualities have been capped; these were actually used to derive log10ls + * @param alleleHaplotypeMap The map associating the Allele and Haplotype versions of each haplotype + * + * @return processedReadAlleleLikelihoodMap; a new PRALM containing the original reads, and their haplotype log10ls including capped reference log10ls + */ + private PerReadAlleleLikelihoodMap capReferenceHaplotypeLikelihoods(PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, List reads, List processedReads, Map alleleHaplotypeMap){ + + // a new read/allele map, to contain the uncapped reads, haplotypes, and potentially the capped reference log10ls + final PerReadAlleleLikelihoodMap processedReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); + + Allele refAllele = null; + final int numReads = reads.size(); + for (int readIndex = 0; readIndex < numReads; readIndex++) { + + // Get the original and quality-modified read from their respective lists + // Note that this requires both lists to have reads in the same order + final GATKSAMRecord originalRead = reads.get(readIndex); + final GATKSAMRecord processedRead = processedReads.get(readIndex); + + // keep track of the reference likelihood and the best non-ref likelihood + double refLog10l = Double.NEGATIVE_INFINITY; + double bestNonReflog10L = Double.NEGATIVE_INFINITY; + + for ( Allele allele : alleleHaplotypeMap.keySet() ) { + final double log10l = perReadAlleleLikelihoodMap.getLikelihoodAssociatedWithReadAndAllele(processedRead, allele); + final Haplotype haplotype = alleleHaplotypeMap.get(allele); + if ( haplotype.isNonReference() ) + bestNonReflog10L = Math.max(bestNonReflog10L, log10l); + else { + refAllele = allele; + refLog10l = log10l; + } + writeDebugLikelihoods(processedRead, haplotype, log10l); + + // add the ORIGINAL (non-capped) 
read to the final map, along with the current haplotype and associated log10l + processedReadAlleleLikelihoodMap.add(originalRead, allele, log10l); + } + + // ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global + // mismapping rate. This protects us from the case where the assembly has produced haplotypes + // that are very divergent from reference, but are supported by only one read. In effect + // we capping how badly scoring the reference can be for any read by the chance that the read + // itself just doesn't belong here + final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate; + if ( refLog10l < (worstRefLog10Allowed) ) { + processedReadAlleleLikelihoodMap.add(originalRead, refAllele, worstRefLog10Allowed); + } + } + return processedReadAlleleLikelihoodMap; + } + + /** + * Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate + * + * After calling this routine the PairHMM will be configured to best evaluate all reads in the samples + * against the set of haplotypes + * + * @param haplotypes a non-null list of haplotypes + * @param perSampleReadList a mapping from sample -> reads + */ + private void initializePairHMM(final List haplotypes, final Map> perSampleReadList) { + int X_METRIC_LENGTH = 0; + for( final Map.Entry> sample : perSampleReadList.entrySet() ) { + for( final GATKSAMRecord read : sample.getValue() ) { + final int readLength = read.getReadLength(); + if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; } + } + } + int Y_METRIC_LENGTH = 0; + for( final Haplotype h : haplotypes ) { + final int haplotypeLength = h.getBases().length; + if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; } + } + + // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases + pairHMMThreadLocal.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); + } + + + @Override + 
public Map computeReadLikelihoods( final AssemblyResultSet assemblyResultSet, final Map> perSampleReadList ) { + + final List haplotypes = assemblyResultSet.getHaplotypeList(); + // configure the HMM + initializePairHMM(haplotypes, perSampleReadList); + + // Add likelihoods for each sample's reads to our stratifiedReadMap + final Map stratifiedReadMap = new LinkedHashMap<>(); + for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { + // evaluate the likelihood of the reads given those haplotypes + final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); + + map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); + stratifiedReadMap.put(sampleEntry.getKey(), map); + } + + return stratifiedReadMap; + } + + + public Map computeReadLikelihoods( final List haplotypes, final Map> perSampleReadList ) { + + // Add likelihoods for each sample's reads to our stratifiedReadMap + final Map stratifiedReadMap = new LinkedHashMap<>(); + for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { + // evaluate the likelihood of the reads given those haplotypes + final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); + + map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); + stratifiedReadMap.put(sampleEntry.getKey(), map); + } + + return stratifiedReadMap; + } + + private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List haplotypes, final List reads) { + + // Modify the read qualities by applying the PCR error model and capping the minimum base,insertion,deletion qualities + List processedReads = modifyReadQualities(reads); + + // Get alleles corresponding to our haplotypees + Map alleleHaplotypeMap = createAlleleMap(haplotypes); + + // Get an array containing the constantGCP for each read in our modified read list + Map GCPArrayMap = fillGCPArrays(processedReads); + + // Run the PairHMM to calculate the log10 likelihood of each (processed) reads' 
arising from each haplotype + PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = pairHMMThreadLocal.get().computeLikelihoods(processedReads, alleleHaplotypeMap, GCPArrayMap); + + // Generate a new map containing the original, unmodified reads, and with minimal reference haplotype log10ls determined from the global mis-mapping rate + + return capReferenceHaplotypeLikelihoods(perReadAlleleLikelihoodMap, reads, processedReads, alleleHaplotypeMap); + } + + @Requires({"alleleOrdering.size() > 0"}) + @Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"}) + public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, + final Map stratifiedReadMap, + final List alleleOrdering, + final boolean normalize ) { + return computeDiploidHaplotypeLikelihoods(Collections.singleton(sample), stratifiedReadMap, alleleOrdering, normalize); + } + + @Requires({"alleleOrdering.size() > 0"}) + @Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"}) + public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, + final Map stratifiedReadMap, + final List alleleOrdering, + final boolean normalize) { + + final int numHaplotypes = alleleOrdering.size(); + final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); + } + + // compute the diploid haplotype likelihoods + for( int iii = 0; iii < numHaplotypes; iii++ ) { + final Allele iii_allele = alleleOrdering.get(iii); + for( int jjj = 0; jjj <= iii; jjj++ ) { + final Allele jjj_allele = alleleOrdering.get(jjj); + double haplotypeLikelihood = 0.0; + for( final String sample : samples ) { + for( final Map.Entry> entry : stratifiedReadMap.get(sample).getLikelihoodReadMap().entrySet() ) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + // First term is approximated 
by Jacobian log with table lookup. + haplotypeLikelihood += ( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + MathUtils.LOG_ONE_HALF ); + } + } + haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood; + } + } + + // normalize the diploid likelihoods matrix + return normalize ? normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ) : haplotypeLikelihoodMatrix; + } + + @Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"}) + @Ensures({"result.length == result[0].length", "result.length == likelihoodMatrix.length"}) + protected static double[][] normalizeDiploidLikelihoodMatrixFromLog10( final double[][] likelihoodMatrix ) { + final int numHaplotypes = likelihoodMatrix.length; + double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2]; + int index = 0; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ){ + genotypeLikelihoods[index++] = likelihoodMatrix[iii][jjj]; + } + } + genotypeLikelihoods = MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true); + index = 0; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ){ + likelihoodMatrix[iii][jjj] = genotypeLikelihoods[index++]; + } + } + return likelihoodMatrix; + } + + // -------------------------------------------------------------------------------- + // + // System to compute the best N haplotypes for genotyping + // + // -------------------------------------------------------------------------------- +// +// /** +// * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele +// * @param map an annoying map object that moves us between the allele and haplotype representation +// * @param haplotypeAsAllele the allele version of the haplotype +// * @return the haplotype version, with its score incremented by 1 if its non-reference +// */ +// private Haplotype 
updateSelectHaplotype(final Map map, final Allele haplotypeAsAllele) { +// final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic +// if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value +// return h; +// } +// +// /** +// * Take the best N haplotypes and return them as a list +// * +// * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample +// * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing +// * order of score (so higher score haplotypes are preferred). The N we take is determined by +// * +// * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation) +// * +// * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is +// * bounded by maxNumHaplotypesInPopulation as that number can grow without bound +// * +// * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1 +// * @param nSamples the number of samples used to select the haplotypes +// * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples +// * @return a list of N or fewer haplotypes, with the reference haplotype first +// */ +// private List selectBestHaplotypesAccordingToScore(final Set selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) { +// final List selectedHaplotypesList = new ArrayList<>(selectedHaplotypes); +// Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator()); +// final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1; +// final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation); +// final List bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? 
selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep); +// if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list"); +// return bestHaplotypes; +// } +// +// /** +// * Select the best haplotypes for genotyping the samples in stratifiedReadMap +// * +// * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely +// * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for +// * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get +// * one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation +// * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the +// * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference. +// * +// * @param haplotypes a list of all haplotypes we're considering +// * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype +// * @param maxNumHaplotypesInPopulation the max. 
number of haplotypes we can select from haplotypes +// * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation +// */ +// public List selectBestHaplotypesFromEachSample(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { +// if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes); +// +// if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes +// +// // all of the haplotypes that at least one sample called as one of the most likely +// final Set selectedHaplotypes = new HashSet<>(); +// selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected +// +// // our annoying map from allele -> haplotype +// final Map allele2Haplotype = new HashMap<>(); +// for ( final Haplotype h : haplotypes ) { +// h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes +// allele2Haplotype.put(Allele.create(h, h.isReference()), h); +// } +// +// // for each sample, compute the most likely pair of haplotypes +// for ( final Map.Entry entry : stratifiedReadMap.entrySet() ) { +// // get the two most likely haplotypes under a diploid model for this sample +// final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles(); +// +// if ( mla != null ) { // there was something to evaluate in this sample +// // note that there must be at least 2 haplotypes +// final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele()); +// final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele()); +// +//// if ( DEBUG ) { +//// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey()); +//// } +// +// // add these two haplotypes to the set of 
haplotypes that have been selected +// selectedHaplotypes.add(best); +// selectedHaplotypes.add(second); +// +// // we've already selected all of our haplotypes, and we don't need to prune them down +// if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation ) +// break; +// } +// } +// +// // take the best N haplotypes forward, in order of the number of samples that choose them +// final int nSamples = stratifiedReadMap.size(); +// final List bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation); +// +// if ( DEBUG ) { +// logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples."); +// for ( final Haplotype h : bestHaplotypes ) { +// logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype")); +// } +// } +// return bestHaplotypes; +// } +// +// /** +// * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found +// * @param haplotypes non-null list of haplotypes +// * @return the reference haplotype +// */ +// private static Haplotype findReferenceHaplotype( final List haplotypes ) { +// for( final Haplotype h : haplotypes ) { +// if( h.isReference() ) return h; +// } +// throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" 
); +// } + + // -------------------------------------------------------------------------------- + // + // Experimental attempts at PCR error rate modeling + // + // -------------------------------------------------------------------------------- + + protected static final int MAX_STR_UNIT_LENGTH = 8; + protected static final int MAX_REPEAT_LENGTH = 20; + protected static final int MIN_ADJUSTED_QSCORE = 10; + protected static final double INITIAL_QSCORE = 40.0; + + private byte[] pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH * MAX_STR_UNIT_LENGTH + 1]; + private final RepeatCovariate repeatCovariate = new RepeatLengthCovariate(); + + private void initializePCRErrorModel() { + if ( pcrErrorModel == PCR_ERROR_MODEL.NONE ) + return; + + repeatCovariate.initialize(MAX_STR_UNIT_LENGTH, MAX_REPEAT_LENGTH); + + pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH + 1]; + + final double rateFactor = pcrErrorModel == PCR_ERROR_MODEL.AGGRESSIVE ? 2.0 : 3.0; + + for( int iii = 0; iii <= MAX_REPEAT_LENGTH; iii++ ) + pcrIndelErrorModelCache[iii] = getErrorModelAdjustedQual(iii, rateFactor); + } + + protected static byte getErrorModelAdjustedQual(final int repeatLength, final double rateFactor) { + return (byte) Math.max(MIN_ADJUSTED_QSCORE, MathUtils.fastRound( INITIAL_QSCORE - Math.exp(((double) repeatLength) / (rateFactor * Math.PI)) + 1.0 )); + } + + protected void applyPCRErrorModel( final byte[] readBases, final byte[] readInsQuals, final byte[] readDelQuals ) { + if ( pcrErrorModel == PCR_ERROR_MODEL.NONE ) + return; + + for ( int iii = 1; iii < readBases.length; iii++ ) { + final int repeatLength = repeatCovariate.findTandemRepeatUnits(readBases, iii-1).getSecond(); + readInsQuals[iii-1] = (byte) Math.min(0xff & readInsQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]); + readDelQuals[iii-1] = (byte) Math.min(0xff & readDelQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]); + } + } +} diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadAnchoring.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadAnchoring.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadAnchoring.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadAnchoring.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java new file mode 100644 index 000000000..d5f62a6a3 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java @@ -0,0 +1,112 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.Comparator; + +/** + * A pair read-likelihood (cost). + */ +public class ReadCost { + public final GATKSAMRecord read; + + /** + * Holds the cost value. Public for convenience, please use with care. + */ + private double cost; + + /** + * Create a new read cost object provided the read and the gap extension penalty. + * + * @param r the read. + * @param initialCost the initial cost for the read before any read-segment alignment. + * + * @throws NullPointerException if {@code r} is {@code null}. + * @throws IllegalArgumentException if {@code initialCost} is not a valid likelihood. 
+ */ + public ReadCost(final GATKSAMRecord r, final double initialCost) { + if (r == null) throw new NullPointerException(); + if (Double.isNaN(initialCost) || Double.isInfinite(initialCost) || initialCost > 0) + throw new IllegalArgumentException("initial cost must be a finite 0 or negative value (" + initialCost + ")"); + read = r; + cost = initialCost; + } + + + /** + * Comparator used to sort ReadCosts + */ + public static final Comparator COMPARATOR = new Comparator() { + @Override + public int compare(final ReadCost o1, final ReadCost o2) { + final String s1 = o1.read.getReadName() + (o1.read.getReadPairedFlag() ? (o1.read.getFirstOfPairFlag() ? "/1" : "/2") : ""); + final String s2 = o2.read.getReadName() + (o2.read.getReadPairedFlag() ? (o2.read.getFirstOfPairFlag() ? "/1" : "/2") : ""); + return s1.compareTo(s2); + } + }; + + + /** + * Add to the cost. + * @param value value to add. + */ + public void addCost(final double value) { + if (cost + value > 0) + throw new IllegalArgumentException("value brings cost over 0. Current cost " + cost + " value " + value); + cost += value; + } + + /** + * Return cost. + * @return 0 or less. + */ + public double getCost() { + return cost; + } + +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrector.java new file mode 100644 index 000000000..a48ac9ee0 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrector.java @@ -0,0 +1,522 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.*; + +/** + * Utility class that error-corrects reads. + * Main idea: An error in a read will appear as a bubble in a k-mer (de Bruijn) graph and such bubble will have very low multiplicity. + * Hence, read errors will appear as "sparse" kmers with very little support. + * Historically, the most common approach to error-correct reads before assembly has been to first compute the kmer spectrum of the reads, + * defined as the kmer composition of a set of reads along with the multiplicity of each kmer. + * First-generation correctors like the Euler corrector (Pevzner 2001) mapped low frequency kmers (kmers appearing say below N times) + * into high frequency ones that lied within a certain Hamming or edit distance. 
+ * This is doable, but has some drawbacks: + * - Kmers used for error correction become tied to kmers used for graph building. + * - Hence, large kmers (desireable for graph building because they can resolve repeats better) are a hindrance for error correction, + * because they are seen less often. + * - After error correction, there is no guarantee that a sequence of kmers corresponds to an "actual" read. + * + * An error-corrected set of reads also makes a much smoother graph without the need to resolving so many bubbles. + * + * Idea hence is to correct reads based on their kmer content, but in a context independent from graph building. + * In order to do this, the following steps are taken: + * - The k-mer spectrum of a set of reads in computed. However, we are at freedom to choose the most convenient k-mer size (typicially around + * read length /2). + * - We partition the set of observed k-mers into "solid" kmers which have multiplicity > M, and "insolid" ones otherwise (Pevzner 2001). + * + * - Main idea of the algorithm is to try to substitute a sequence of bases in a read by a sequence better supported by kmers. + * - For each "unsolid" kmer observed in reads, we try to find a "solid" kmer within a maximum Hamming distance. + * - If such solid kmer exists, then this unsolid kmer is "correctable", otherwise, uncorrectable. + * - For each read, then: + * -- Walk through read and visit all kmers. + * -- If kmer is solid, continue to next kmer. + * -- If not, and if it's correctable (i.e. there exists a mapping from an unsolid kmer to a solid kmer within a given Hamming distance), + * add the bases and offsets corresponding to differing positions between unsolid and solid kmer to correction list. + * -- At the end, each base in read will have a list of corrections associated with it. We can then choose to correct or not. + * If read has only consistent corrections, then we can correct base to common base in corrections. 
+ * + * TODO: + * todo Q: WHAT QUALITY TO USE?? + * todo how do we deal with mate pairs? + * + * + + + */ +public class ReadErrorCorrector { + private final static Logger logger = Logger.getLogger(ReadErrorCorrector.class); + /** + * A map of for each kmer to its num occurrences in addKmers + */ + KMerCounter countsByKMer; + + Map kmerCorrectionMap = new HashMap<>(); + Map> kmerDifferingBases = new HashMap<>(); + private final int kmerLength; + private final boolean debug; + private final boolean trimLowQualityBases; + private final byte minTailQuality; + private final int maxMismatchesToCorrect; + private final byte qualityOfCorrectedBases; + private final int maxObservationsForKmerToBeCorrectable; + private final int maxHomopolymerLengthInRegion; + private final int minObservationsForKmerToBeSolid; + + // default values, for debugging + private final static boolean doInplaceErrorCorrection = false; // currently not used, since we want corrected reads to be used only for assembly + private final static int MAX_MISMATCHES_TO_CORRECT = 2; + private final static byte QUALITY_OF_CORRECTED_BASES = 30; // what's a reasonable value here? 
+ private final static int MAX_OBSERVATIONS_FOR_KMER_TO_BE_CORRECTABLE = 1; + private final static boolean TRIM_LOW_QUAL_TAILS = false; + private final static boolean DONT_CORRECT_IN_LONG_HOMOPOLYMERS = false; + private final static int MAX_HOMOPOLYMER_THRESHOLD = 12; + + // debug counter structure + private final ReadErrorCorrectionStats readErrorCorrectionStats = new ReadErrorCorrectionStats(); + + /** + * Create a new kmer corrector + * + * @param kmerLength the length of kmers we'll be counting to error correct, must be >= 1 + * @param maxMismatchesToCorrect e >= 0 + * @param qualityOfCorrectedBases Bases to be corrected will be assigned this quality + */ + public ReadErrorCorrector(final int kmerLength, + final int maxMismatchesToCorrect, + final int maxObservationsForKmerToBeCorrectable, + final byte qualityOfCorrectedBases, + final int minObservationsForKmerToBeSolid, + final boolean trimLowQualityBases, + final byte minTailQuality, + final boolean debug, + final byte[] fullReferenceWithPadding) { + if ( kmerLength < 1 ) throw new IllegalArgumentException("kmerLength must be > 0 but got " + kmerLength); + if ( maxMismatchesToCorrect < 1 ) + throw new IllegalArgumentException("maxMismatchesToCorrect must be >= 1 but got " + maxMismatchesToCorrect); + if ( qualityOfCorrectedBases < 2 || qualityOfCorrectedBases > QualityUtils.MAX_REASONABLE_Q_SCORE) + throw new IllegalArgumentException("qualityOfCorrectedBases must be >= 2 and <= MAX_REASONABLE_Q_SCORE but got " + qualityOfCorrectedBases); + + countsByKMer = new KMerCounter(kmerLength); + this.kmerLength = kmerLength; + this.maxMismatchesToCorrect = maxMismatchesToCorrect; + this.qualityOfCorrectedBases = qualityOfCorrectedBases; + this.minObservationsForKmerToBeSolid = minObservationsForKmerToBeSolid; + this.trimLowQualityBases = trimLowQualityBases; + this.minTailQuality = minTailQuality; + this.debug = debug; + this.maxObservationsForKmerToBeCorrectable = maxObservationsForKmerToBeCorrectable; + + // when 
region has long homopolymers, we may want not to correct reads, since assessment is complicated, + // so we may decide to skip error correction in these regions + maxHomopolymerLengthInRegion = computeMaxHLen(fullReferenceWithPadding); + } + + /** + * Simple constructor with sensible defaults + * @param kmerLength K-mer length for error correction (not necessarily the same as for assembly graph) + * @param minTailQuality Minimum tail quality: remaining bases with Q's below this value are hard-clipped after correction + * @param debug Output debug information + */ + public ReadErrorCorrector(final int kmerLength, final byte minTailQuality, final int minObservationsForKmerToBeSolid, final boolean debug,final byte[] fullReferenceWithPadding) { + this(kmerLength, MAX_MISMATCHES_TO_CORRECT, MAX_OBSERVATIONS_FOR_KMER_TO_BE_CORRECTABLE, QUALITY_OF_CORRECTED_BASES, minObservationsForKmerToBeSolid, TRIM_LOW_QUAL_TAILS, minTailQuality, debug,fullReferenceWithPadding); + } + + /** + * Main entry routine to add all kmers in a read to the read map counter + * @param read Read to add bases + */ + @Requires("read != null") + protected void addReadKmers(final GATKSAMRecord read) { + if (DONT_CORRECT_IN_LONG_HOMOPOLYMERS && maxHomopolymerLengthInRegion > MAX_HOMOPOLYMER_THRESHOLD) + return; + + final byte[] readBases = read.getReadBases(); + for (int offset = 0; offset <= readBases.length-kmerLength; offset++ ) { + countsByKMer.addKmer(new Kmer(readBases,offset,kmerLength),1); + + } + } + + /** + * Correct a collection of reads based on stored k-mer counts + * @param reads + */ + public final List correctReads(final Collection reads) { + + final List correctedReads = new ArrayList<>(reads.size()); + if (DONT_CORRECT_IN_LONG_HOMOPOLYMERS && maxHomopolymerLengthInRegion > MAX_HOMOPOLYMER_THRESHOLD) { + // just copy reads into output and exit + correctedReads.addAll(reads); + } + else { + computeKmerCorrectionMap(); + for (final GATKSAMRecord read: reads) { + final GATKSAMRecord 
correctedRead = correctRead(read); + if (trimLowQualityBases) + correctedReads.add(ReadClipper.hardClipLowQualEnds(correctedRead, minTailQuality)); + else + correctedReads.add(correctedRead); + } + if (debug) { + logger.info("Number of corrected bases:"+readErrorCorrectionStats.numBasesCorrected); + logger.info("Number of corrected reads:"+readErrorCorrectionStats.numReadsCorrected); + logger.info("Number of skipped reads:"+readErrorCorrectionStats.numReadsUncorrected); + logger.info("Number of solid kmers:"+readErrorCorrectionStats.numSolidKmers); + logger.info("Number of corrected kmers:"+readErrorCorrectionStats.numCorrectedKmers); + logger.info("Number of uncorrectable kmers:"+readErrorCorrectionStats.numUncorrectableKmers); + } + } + return correctedReads; + } + + + /** + * Do actual read correction based on k-mer map. First, loop through stored k-mers to get a list of possible corrections + * for each position in the read. Then correct read based on all possible consistent corrections. 
+ * @param inputRead Read to correct + * @return Corrected read (can be same reference as input if doInplaceErrorCorrection is set) + */ + @Requires("inputRead != null") + private GATKSAMRecord correctRead(final GATKSAMRecord inputRead) { + // do actual correction + boolean corrected = false; + final byte[] correctedBases = inputRead.getReadBases(); + final byte[] correctedQuals = inputRead.getBaseQualities(); + + // array to store list of possible corrections for read + final CorrectionSet correctionSet = buildCorrectionMap(correctedBases); + + for (int offset = 0; offset < correctedBases.length; offset++) { + final Byte b = correctionSet.getConsensusCorrection(offset); + if (b != null && b != correctedBases[offset]) { + correctedBases[offset] = b; + correctedQuals[offset] = qualityOfCorrectedBases; + corrected = true; + } + readErrorCorrectionStats.numBasesCorrected++; + } + + if (corrected) { + readErrorCorrectionStats.numReadsCorrected++; + if (doInplaceErrorCorrection) { + inputRead.setReadBases(correctedBases); + inputRead.setBaseQualities(correctedQuals); + return inputRead; + } + else { + GATKSAMRecord correctedRead = new GATKSAMRecord(inputRead); + + // do the actual correction + // todo - do we need to clone anything else from read? + correctedRead.setBaseQualities(inputRead.getBaseQualities()); + correctedRead.setIsStrandless(inputRead.isStrandless()); + correctedRead.setReadBases(inputRead.getReadBases()); + correctedRead.setReadString(inputRead.getReadString()); + correctedRead.setReadGroup(inputRead.getReadGroup()); + return correctedRead; + } + } + else { + readErrorCorrectionStats.numReadsUncorrected++; + return inputRead; + } + } + + /** + * Build correction map for each of the bases in read. + * For each of the constituent kmers in read: + * a) See whether the kmer has been mapped to a corrected kmer. + * b) If so, get list of differing positions and corresponding bases. + * c) Add then list of new bases to index in correction list. 
+ * Correction list is of read size, and holds a list of bases to correct. + * @param correctedBases Bases to attempt to correct + * @return CorrectionSet object. + */ + @Requires("correctedBases != null") + private CorrectionSet buildCorrectionMap(final byte[] correctedBases) { + // array to store list of possible corrections for read + final CorrectionSet correctionSet = new CorrectionSet(correctedBases.length); + + for (int offset = 0; offset <= correctedBases.length-kmerLength; offset++ ) { + final Kmer kmer = new Kmer(correctedBases,offset,kmerLength); + final Kmer newKmer = kmerCorrectionMap.get(kmer); + if (newKmer != null && !newKmer.equals(kmer)){ + final Pair differingPositions = kmerDifferingBases.get(kmer); + final int[] differingIndeces = differingPositions.first; + final byte[] differingBases = differingPositions.second; + + for (int k=0; k < differingIndeces.length; k++) { + // get list of differing positions for corrected kmer + // for each of these, add correction candidate to correction set + correctionSet.add(offset + differingIndeces[k],differingBases[k]); + } + } + } + return correctionSet; + } + + + /** + * Top-level entry point that adds a collection of reads to our kmer list. + * For each read in list, its constituent kmers will be logged in our kmer table. + * @param reads + */ + @Requires("reads != null") + public void addReadsToKmers(final Collection reads) { + for (final GATKSAMRecord read: reads) + addReadKmers(read); + + if (debug) + for ( final KMerCounter.CountedKmer countedKmer: countsByKMer.getCountedKmers() ) + logger.info(String.format("%s\t%d\n", countedKmer.kmer, countedKmer.count)); + } + + + /** + * For each kmer we've seen, do the following: + * a) If kmer count > threshold1, this kmer is good, so correction map will be to itself. + * b) If kmer count <= threshold2, this kmer is bad. + * In that case, loop through all other kmers. If kmer is good, compute distance, and get minimal distance. 
+ * If such distance is < some threshold, map to this kmer, and record differing positions and bases. + * + */ + private void computeKmerCorrectionMap() { + for (final KMerCounter.CountedKmer storedKmer : countsByKMer.getCountedKmers()) { + if (storedKmer.getCount() >= minObservationsForKmerToBeSolid) { + // this kmer is good: map to itself + kmerCorrectionMap.put(storedKmer.getKmer(),storedKmer.getKmer()); + kmerDifferingBases.put(storedKmer.getKmer(),new Pair<>(new int[0],new byte[0])); // dummy empty array + readErrorCorrectionStats.numSolidKmers++; + } + else if (storedKmer.getCount() <= maxObservationsForKmerToBeCorrectable) { + // loop now thru all other kmers to find nearest neighbor + final Pair> nearestNeighbor = findNearestNeighbor(storedKmer.getKmer(),countsByKMer,maxMismatchesToCorrect); + + // check if nearest neighbor lies in a close vicinity. If so, log the new bases and the correction map + if (nearestNeighbor != null) { // ok, found close neighbor + kmerCorrectionMap.put(storedKmer.getKmer(), nearestNeighbor.first); + kmerDifferingBases.put(storedKmer.getKmer(), nearestNeighbor.second); + readErrorCorrectionStats.numCorrectedKmers++; +// if (debug) +// logger.info("Original kmer:"+storedKmer + "\tCorrected kmer:"+nearestNeighbor.first+"\tDistance:"+dist); + } + else + readErrorCorrectionStats.numUncorrectableKmers++; + + } + } + } + + /** + * Finds nearest neighbor of a given k-mer, among a list of counted K-mers, up to a given distance. + * If many k-mers share same closest distance, an arbitrary k-mer is picked + * @param kmer K-mer of interest + * @param countsByKMer KMerCounter storing set of counted k-mers (may include kmer of interest) + * @param maxDistance Maximum distance to search + * @return Pair of values: closest K-mer in Hamming distance and list of differing bases. 
+ * If no neighbor can be found up to given distance, returns null + */ + @Requires({"kmer != null", "countsByKMer != null","maxDistance >= 1"}) + private Pair> findNearestNeighbor(final Kmer kmer, + final KMerCounter countsByKMer, + final int maxDistance) { + int minimumDistance = Integer.MAX_VALUE; + Kmer closestKmer = null; + + final int[] differingIndeces = new int[maxDistance+1]; + final byte[] differingBases = new byte[maxDistance+1]; + + final int[] closestDifferingIndices = new int[maxDistance+1]; + final byte[] closestDifferingBases = new byte[maxDistance+1]; + + for (final KMerCounter.CountedKmer candidateKmer : countsByKMer.getCountedKmers()) { + // skip if candidate set includes test kmer + if (candidateKmer.getKmer().equals(kmer)) + continue; + + final int hammingDistance = kmer.getDifferingPositions(candidateKmer.getKmer(), maxDistance, differingIndeces, differingBases); + if (hammingDistance < 0) // can't compare kmer? skip + continue; + + if (hammingDistance < minimumDistance) { + minimumDistance = hammingDistance; + closestKmer = candidateKmer.getKmer(); + System.arraycopy(differingBases,0,closestDifferingBases,0,differingBases.length); + System.arraycopy(differingIndeces,0,closestDifferingIndices,0,differingIndeces.length); + } + } + return new Pair<>(closestKmer, new Pair<>(closestDifferingIndices,closestDifferingBases)); + } + + + /** + * experimental function to compute max homopolymer length in a given reference context + * @param fullReferenceWithPadding Reference context of interest + * @return Max homopolymer length in region + */ + @Requires("fullReferenceWithPadding != null") + private static int computeMaxHLen(final byte[] fullReferenceWithPadding) { + + int leftRun = 1; + int maxRun = 1; + for ( int i = 1; i < fullReferenceWithPadding.length; i++) { + if ( fullReferenceWithPadding[i] == fullReferenceWithPadding[i-1] ) + leftRun++; + else + leftRun = 1; + } + if (leftRun > maxRun) + maxRun = leftRun; + + + return maxRun; + } + + private 
static final class ReadErrorCorrectionStats { + public int numReadsCorrected; + public int numReadsUncorrected; + public int numBasesCorrected; + public int numSolidKmers; + public int numUncorrectableKmers; + public int numCorrectedKmers; + } + + /** + * Wrapper utility class that holds, for each position in read, a list of bytes representing candidate corrections. + * So, a read ACAGT where the middle A has found to be errorful might look like: + * 0: {} + * 1: {} + * 2: {'C','C','C'} + * 3: {} + * 4: {} + * + * It's up to the method getConsensusCorrection() to decide how to use the correction sets for each position. + * By default, only strict consensus is allowed right now. + * + */ + protected static class CorrectionSet { + private final int size; + private ArrayList> corrections; + + /** + * Main class constructor. + * @param size Size of correction set, needs to be set equal to the read being corrected + */ + public CorrectionSet(final int size) { + this.size = size; + corrections = new ArrayList<>(size); + for (int k=0; k < size; k++) + corrections.add(k,new ArrayList()); + } + + /** + * Add a base to this correction set at a particular offset, measured from the start of the read + * @param offset Offset from start of read + * @param base base to be added to list of corrections at this offset + */ + public void add(final int offset, final byte base) { + if (offset >= size || offset < 0) + throw new IllegalStateException("Bad entry into CorrectionSet: offset > size"); + if (!BaseUtils.isRegularBase(base)) + return; // no irregular base correction + + final List storedBytes = corrections.get(offset); + storedBytes.add(base); + } + + /** + * Get list of corrections for a particular offset + * @param offset Offset of interest + * @return List of bases representing possible corrections at this offset + */ + public List get(final int offset) { + if (offset >= size || offset < 0) + throw new IllegalArgumentException("Illegal call of CorrectionSet.get(): offset 
must be < size"); + return corrections.get(offset); + } + + /** + * Get consensus correction for a particular offset. In this implementation, it just boils down to seeing if + * byte list associated with offset has identical values. If so, return this base, otherwise return null. + * @param offset + * @return Consensus base, or null if no consensus possible. + */ + public Byte getConsensusCorrection(final int offset) { + if (offset >= size || offset < 0) + throw new IllegalArgumentException("Illegal call of CorrectionSet.getConsensusCorrection(): offset must be < size"); + final List storedBytes = corrections.get(offset); + if (storedBytes.isEmpty()) + return null; + + // todo - is there a cheaper/nicer way to compare if all elements in list are identical?? + final byte lastBase = storedBytes.remove(storedBytes.size()-1); + for (final Byte b: storedBytes) { + // strict correction rule: all bases must match + if (b != lastBase) + return null; + } + + // all bytes then are equal: + return lastBase; + + } + + + + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentComparator.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentComparator.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentComparator.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentComparator.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java new file mode 100644 index 000000000..b5544f1a2 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java @@ -0,0 +1,120 @@ +/* +* By downloading the 
PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.concurrent.atomic.AtomicLong; + +/** + * Path cost indicate the cost (alignment likelihood) of traversing a section of the graph using a segement of a read. + * + *

A path can be a whole haplotype path as well as just a smaller haplotype segment

. + * + *

We would generate many of these objects for each read. The final likelihood of a read vs each haplotype + * would be the summation of the path-cost of that read along the corresponding haplotype path.

+ */ +class ReadSegmentCost { + + public Route path; + public GATKSAMRecord read; + + /** + * Holds the cost value. It public and non-final for convenience. + */ + private double cost; + + /** + * Caches the path bases (the haplotype segment bases). + */ + protected byte[] bases; + + /** + * Construct a new path cost. + * @param read the corresponding read. + * @param path the corresponding path. + * @param cost initial cost estimate. Might be updated later. + */ + @Requires("route != null") + public ReadSegmentCost(final GATKSAMRecord read, + final Route path, double cost) { + this.read = read; + this.path = path; + setCost(cost); + } + + public double getCost() { + return cost; + } + + public void setCost(final double value) { + cost = value; + } + + /** + * Used to generate unique identifiers for path cost object. + */ + private static final AtomicLong pathCostUniqueIdGenerator = new AtomicLong(); + + /** + * Holds the path cost unique identifier. + */ + private Long uniqueId; + + /** + * Returns the this path-cost unique identifier. 
+ * @return + */ + public long uniqueId() { + if (uniqueId == null) + uniqueId = pathCostUniqueIdGenerator.incrementAndGet(); + return uniqueId; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java new file mode 100644 index 000000000..5ef310498 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java @@ -0,0 +1,514 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; +import org.broadinstitute.sting.utils.haplotypeBAMWriter.ReadDestination; +import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFHeaderLine; +import org.broadinstitute.variant.vcf.VCFSimpleHeaderLine; + +import java.io.File; +import java.util.*; + +/** + * Code for estimating the reference confidence + * + * This code can estimate the probability that the data for a single sample is consistent with a + * well-determined REF/REF diploid genotype. 
+ * + * User: depristo + * Date: 6/21/13 + * Time: 12:52 PM + */ +public class ReferenceConfidenceModel { + + //public final static String INDEL_INFORMATIVE_DEPTH = "CD"; // temporarily taking this extra genotype level information out for now + public final static String ALTERNATE_ALLELE_STRING = "ALT"; // arbitrary alternate allele + + private final GenomeLocParser genomeLocParser; + private final Set samples; + private final SAMFileHeader header; // TODO -- really shouldn't depend on this + private final int indelInformativeDepthIndelSize; + + private final static boolean WRITE_DEBUGGING_BAM = false; + private final SAMFileWriter debuggingWriter; + + private final static byte REF_MODEL_DELETION_QUAL = (byte) 30; + + /** + * Create a new ReferenceConfidenceModel + * + * @param genomeLocParser how we create genome locs + * @param samples the list of all samples we'll be considering with this model + * @param header the SAMFileHeader describing the read information (used for debugging) + * @param indelInformativeDepthIndelSize the max size of indels to consider when calculating indel informative depths + */ + public ReferenceConfidenceModel(final GenomeLocParser genomeLocParser, + final Set samples, + final SAMFileHeader header, + final int indelInformativeDepthIndelSize) { + if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null"); + if ( samples == null ) throw new IllegalArgumentException("samples cannot be null"); + if ( samples.isEmpty() ) throw new IllegalArgumentException("samples cannot be empty"); + if ( header == null ) throw new IllegalArgumentException("header cannot be empty"); + if ( indelInformativeDepthIndelSize < 0) throw new IllegalArgumentException("indelInformativeDepthIndelSize must be >= 1 but got " + indelInformativeDepthIndelSize); + + this.genomeLocParser = genomeLocParser; + this.samples = samples; + this.header = header; + this.indelInformativeDepthIndelSize = indelInformativeDepthIndelSize; + + 
if ( WRITE_DEBUGGING_BAM ) { + final SAMFileWriterFactory factory = new SAMFileWriterFactory(); + factory.setCreateIndex(true); + debuggingWriter = factory.makeBAMWriter(header, false, new File("refCalc.bam")); + } else { + debuggingWriter = null; + } + + initializeIndelPLCache(); + } + + /** + * Get the VCF header lines to include when emitting reference confidence values via calculateRefConfidence + * @return a non-null set of VCFHeaderLines + */ + public Set getVCFHeaderLines() { + final Set headerLines = new LinkedHashSet<>(); + // TODO - do we need a new kind of VCF Header subclass for specifying arbitrary alternate alleles? + headerLines.add(new VCFSimpleHeaderLine(ALTERNATE_ALLELE_STRING, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location")); + //headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize)); + return headerLines; + } + + /** + * Close down this reference model, closing down any debugging information opened during execution + */ + public void close() { + if ( debuggingWriter != null ) debuggingWriter.close(); + } + + + /** + * Calculate the reference confidence for a single sample given the its read data + * + * Returns a list of variant contexts, one for each position in the activeregion.getLoc(), each containing + * detailed information about the certainty that the sample is hom-ref for each base in the region. + * + * + * + * @param refHaplotype the reference haplotype, used to get the reference bases across activeRegion.getLoc() + * @param calledHaplotypes a list of haplotypes that segregate in this region, for realignment of the reads in the + * stratifiedReadMap, corresponding to each reads best haplotype. Must contain the refHaplotype. 
+ * @param paddedReferenceLoc the location of refHaplotype (which might be larger than activeRegion.getLoc()) + * @param activeRegion the active region we want to get the reference confidence over + * @param stratifiedReadMap a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes + * @param variantCalls calls made in this region. The return result will contain any variant call in this list in the + * correct order by genomic position, and any variant in this list will stop us emitting a ref confidence + * under any position it covers (for snps and insertions that is 1 bp, but for deletions its the entire ref span) + * @return an ordered list of variant contexts that spans activeRegion.getLoc() and includes both reference confidence + * contexts as well as calls from variantCalls if any were provided + */ + public List calculateRefConfidence(final Haplotype refHaplotype, + final Collection calledHaplotypes, + final GenomeLoc paddedReferenceLoc, + final ActiveRegion activeRegion, + final Map stratifiedReadMap, + final List variantCalls) { + if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); + if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); + if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype"); + if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null"); + if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null"); + if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); + if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); + if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw 
new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion location size " + activeRegion.getLocation().size() + " are different"); + + final GenomeLoc refSpan = activeRegion.getLocation(); + final List refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, activeRegion, refSpan, stratifiedReadMap); + final byte[] ref = refHaplotype.getBases(); + final List results = new ArrayList<>(refSpan.size()); + final String sampleName = stratifiedReadMap.keySet().iterator().next(); + + final int globalRefOffset = refSpan.getStart() - activeRegion.getExtendedLoc().getStart(); + for ( final ReadBackedPileup pileup : refPileups ) { + final GenomeLoc curPos = pileup.getLocation(); + final int offset = curPos.getStart() - refSpan.getStart(); + + final VariantContext overlappingSite = getOverlappingVariantContext(curPos, variantCalls); + if ( overlappingSite != null ) { + // we have some overlapping site, add it to the list of positions + if ( overlappingSite.getStart() == curPos.getStart() ) + results.add(overlappingSite); + } else { + // otherwise emit a reference confidence variant context + final int refOffset = offset + globalRefOffset; + final byte refBase = ref[refOffset]; + final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, (byte)6, null); + homRefCalc.capByHomRefLikelihood(); + + final Allele refAllele = Allele.create(refBase, true); + final List refSiteAlleles = Arrays.asList(refAllele, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final VariantContextBuilder vcb = new VariantContextBuilder("HC", curPos.getContig(), curPos.getStart(), curPos.getStart(), refSiteAlleles); + final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Arrays.asList(refAllele, refAllele)); + gb.AD(homRefCalc.AD_Ref_Any); + gb.DP(homRefCalc.getDP()); + + // genotype likelihood calculation + final GenotypeLikelihoods snpGLs = 
GenotypeLikelihoods.fromLog10Likelihoods(homRefCalc.genotypeLikelihoods); + final int nIndelInformativeReads = calcNIndelInformativeReads(pileup, refOffset, ref, indelInformativeDepthIndelSize); + final GenotypeLikelihoods indelGLs = getIndelPLs(nIndelInformativeReads); + + // now that we have the SNP and indel GLs, we take the one with the least confidence, + // as this is the most conservative estimate of our certainty that we are hom-ref. + // For example, if the SNP PLs are 0,10,100 and the indel PLs are 0,100,1000 + // we are very certain that there's no indel here, but the SNP confidence imply that we are + // far less confident that the ref base is actually the only thing here. So we take 0,10,100 + // as our GLs for the site. + final GenotypeLikelihoods leastConfidenceGLs = getGLwithWorstGQ(indelGLs, snpGLs); + + gb.GQ((int) (-10 * leastConfidenceGLs.getLog10GQ(GenotypeType.HOM_REF))); + gb.PL(leastConfidenceGLs.getAsPLs()); + //gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads); + + vcb.genotypes(gb.make()); + results.add(vcb.make()); +// logger.info(" => VariantContext " + vcb.make()); + } + } + + return results; + } + + /** + * Get the GenotypeLikelihoods with the least strong corresponding GQ value + * @param gl1 first to consider (cannot be null) + * @param gl2 second to consider (cannot be null) + * @return gl1 or gl2, whichever has the worst GQ + */ + protected final GenotypeLikelihoods getGLwithWorstGQ(final GenotypeLikelihoods gl1, final GenotypeLikelihoods gl2) { + return gl1.getLog10GQ(GenotypeType.HOM_REF) > gl2.getLog10GQ(GenotypeType.HOM_REF) ? 
gl1 : gl2; + } + + /** + * Get indel PLs corresponding to seeing N nIndelInformativeReads at this site + * + * @param nInformativeReads the number of reads that inform us about being ref without an indel at this site + * @return non-null GenotypeLikelihoods given N + */ + protected final GenotypeLikelihoods getIndelPLs(final int nInformativeReads) { + return indelPLCache[nInformativeReads > MAX_N_INDEL_INFORMATIVE_READS ? MAX_N_INDEL_INFORMATIVE_READS : nInformativeReads]; + } + + protected static final int MAX_N_INDEL_INFORMATIVE_READS = 40; // more than this is overkill because GQs are capped at 99 anyway + private static final GenotypeLikelihoods[] indelPLCache = new GenotypeLikelihoods[MAX_N_INDEL_INFORMATIVE_READS + 1]; + private static final double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp + + private void initializeIndelPLCache() { + for( int nInformativeReads = 0; nInformativeReads <= MAX_N_INDEL_INFORMATIVE_READS; nInformativeReads++ ) { + final double homRef = 0.0; + final double het = MathUtils.LOG_ONE_HALF * nInformativeReads; + final double homVar = INDEL_ERROR_RATE * nInformativeReads; + indelPLCache[nInformativeReads] = GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar}); + } + } + + /** + * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. 
alt + * + * @param pileup the read backed pileup containing the data we want to evaluate + * @param refBase the reference base at this pileup position + * @param minBaseQual the min base quality for a read in the pileup at the pileup position to be included in the calculation + * @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips + * @return a RefVsAnyResult genotype call + */ + public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) { + final RefVsAnyResult result = new RefVsAnyResult(); + + for( final PileupElement p : pileup ) { + final byte qual = (p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual()); + if( p.isDeletion() || qual > minBaseQual ) { + int AA = 0; final int AB = 1; int BB = 2; + if( p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { + AA = 2; + BB = 0; + if( hqSoftClips != null && p.isNextToSoftClip() ) { + hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28)); + } + result.AD_Ref_Any[1]++; + } else { + result.AD_Ref_Any[0]++; + } + result.genotypeLikelihoods[AA] += QualityUtils.qualToProbLog10(qual); + result.genotypeLikelihoods[AB] += MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF ); + result.genotypeLikelihoods[BB] += QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD; + } + } + + return result; + } + + /** + * Get a list of pileups that span the entire active region span, in order, one for each position + */ + private List getPileupsOverReference(final Haplotype refHaplotype, + final Collection calledHaplotypes, + final GenomeLoc 
paddedReferenceLoc, + final ActiveRegion activeRegion, + final GenomeLoc activeRegionSpan, + final Map stratifiedReadMap) { + + if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); + if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); + if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype"); + if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null"); + if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null"); + if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); + if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); + + List realignedReads; + + if( calledHaplotypes.size() == 1 ) { // only contains ref haplotype so an optimization is to just trust the alignments to the reference haplotype as provided by the aligner + realignedReads = activeRegion.getReads(); + } else { + final ReadDestination.ToList realignedReadsDest = new ReadDestination.ToList(header, "FOO"); + final HaplotypeBAMWriter writer = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, realignedReadsDest); + writer.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves + writer.setOnlyRealignInformativeReads(true); + writer.writeReadsAlignedToHaplotypes(calledHaplotypes, paddedReferenceLoc, stratifiedReadMap); + realignedReads = ReadUtils.sortReadsByCoordinate(realignedReadsDest.getReads()); + } + + if ( debuggingWriter != null ) + for ( final GATKSAMRecord read : realignedReads ) + debuggingWriter.addAlignment(read); + + final LocusIteratorByState libs = new 
LocusIteratorByState(realignedReads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING, + true, genomeLocParser, samples, false); + + final List pileups = new LinkedList<>(); + final int startPos = activeRegionSpan.getStart(); + AlignmentContext next = libs.advanceToLocus(startPos, true); + for ( int curPos = startPos; curPos <= activeRegionSpan.getStop(); curPos++ ) { + if ( next != null && next.getLocation().getStart() == curPos ) { + pileups.add(next.getBasePileup()); + next = libs.hasNext() ? libs.next() : null; + } else { + // no data, so we create empty pileups + pileups.add(new ReadBackedPileupImpl(genomeLocParser.createGenomeLoc(activeRegionSpan.getContig(), curPos))); + } + } + + return pileups; + } + + /** + * Return the rightmost variant context in maybeOverlapping that overlaps curPos + * + * @param curPos non-null genome loc + * @param maybeOverlapping a collection of variant contexts that might overlap curPos + * @return a VariantContext, or null if none overlaps + */ + protected final VariantContext getOverlappingVariantContext(final GenomeLoc curPos, final Collection maybeOverlapping) { + VariantContext overlaps = null; + for ( final VariantContext vc : maybeOverlapping ) { + if ( genomeLocParser.createGenomeLoc(vc).overlapsP(curPos) ) { + if ( overlaps == null || vc.getStart() > overlaps.getStart() ) { + overlaps = vc; + } + } + } + return overlaps; + } + + /** + * Compute the sum of mismatching base qualities for readBases aligned to refBases at readStart / refStart + * assuming no insertions or deletions in the read w.r.t. 
the reference + * + * @param readBases non-null bases of the read + * @param readQuals non-null quals of the read + * @param readStart the starting position of the read (i.e., that aligns it to a position in the reference) + * @param refBases the reference bases + * @param refStart the offset into refBases that aligns to the readStart position in readBases + * @param maxSum if the sum goes over this value, return immediately + * @return the sum of quality scores for readBases that mismatch their corresponding ref bases + */ + protected final int sumMismatchingQualities(final byte[] readBases, + final byte[] readQuals, + final int readStart, + final byte[] refBases, + final int refStart, + final int maxSum) { + final int n = Math.min(readBases.length - readStart, refBases.length - refStart); + int sum = 0; + + for ( int i = 0; i < n; i++ ) { + final byte readBase = readBases[readStart + i]; + final byte refBase = refBases[refStart + i]; + if ( readBase != refBase ) { + sum += readQuals[readStart + i]; + if ( sum > maxSum ) // abort early + return sum; + } + } + + return sum; + } + + /** + * Compute whether a read is informative to eliminate an indel of size <= maxIndelSize segregating at readStart/refStart + * + * @param readBases non-null bases of the read + * @param readQuals non-null quals of the read + * @param readStart the starting position of the read (i.e., that aligns it to a position in the reference) + * @param refBases the reference bases + * @param refStart the offset into refBases that aligns to the readStart position in readBases + * @param maxIndelSize the max indel size to consider for the read to be informative + * @return true if read can eliminate the possibility that there's an indel of size <= maxIndelSize segregating at refStart + */ + protected boolean isReadInformativeAboutIndelsOfSize(final byte[] readBases, + final byte[] readQuals, + final int readStart, + final byte[] refBases, + final int refStart, + final int maxIndelSize) { + // fast 
exit when n bases left < maxIndelSize + if( readBases.length - readStart < maxIndelSize || refBases.length - refStart < maxIndelSize ) { + return false; + } + + final int baselineMMSum = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart, Integer.MAX_VALUE); + + // consider each indel size up to max in term, checking if an indel that deletes either the ref bases (deletion + // or read bases (insertion) would fit as well as the origin baseline sum of mismatching quality scores + for ( int indelSize = 1; indelSize <= maxIndelSize; indelSize++ ) { + for ( final boolean checkInsertion : Arrays.asList(true, false) ) { + final int readI, refI; + if ( checkInsertion ) { + readI = readStart + indelSize; + refI = refStart; + } else { + readI = readStart; + refI = refStart + indelSize; + } + + final int score = sumMismatchingQualities(readBases, readQuals, readI, refBases, refI, baselineMMSum); + if ( score <= baselineMMSum ) + return false; + } + } + + return true; + } + + /** + * Calculate the number of indel informative reads at pileup + * + * @param pileup a pileup + * @param pileupOffsetIntoRef the position of the pileup in the reference + * @param ref the ref bases + * @param maxIndelSize maximum indel size to consider in the informativeness calculation + * @return an integer >= 0 + */ + protected final int calcNIndelInformativeReads(final ReadBackedPileup pileup, final int pileupOffsetIntoRef, final byte[] ref, final int maxIndelSize) { + int nInformative = 0; + for ( final PileupElement p : pileup ) { + final GATKSAMRecord read = p.getRead(); + final int offset = p.getOffset(); + + // doesn't count as evidence + if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() || p.isDeletion() ) + continue; + + // todo -- this code really should handle CIGARs directly instead of relying on the above tests + if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize) ) { + 
nInformative++; + if( nInformative > MAX_N_INDEL_INFORMATIVE_READS ) { + return MAX_N_INDEL_INFORMATIVE_READS; + } + } + } + return nInformative; + } + + /** + * Create a reference haplotype for an active region + * + * @param activeRegion the active region + * @param refBases the ref bases + * @param paddedReferenceLoc the location spanning of the refBases -- can be longer than activeRegion.getLocation() + * @return a reference haplotype + */ + public static Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final byte[] refBases, final GenomeLoc paddedReferenceLoc) { + final Haplotype refHaplotype = new Haplotype(refBases, true); + final int alignmentStart = activeRegion.getExtendedLoc().getStart() - paddedReferenceLoc.getStart(); + if ( alignmentStart < 0 ) throw new IllegalStateException("Bad alignment start in createReferenceHaplotype " + alignmentStart); + refHaplotype.setAlignmentStartHapwrtRef(alignmentStart); + final Cigar c = new Cigar(); + c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M)); + refHaplotype.setCigar(c); + return refHaplotype; + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/AggregatedSubHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/AggregatedSubHaplotypeFinder.java new file mode 100644 index 000000000..8fba6c9d5 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/AggregatedSubHaplotypeFinder.java @@ -0,0 +1,194 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.PriorityQueue; + +/** + * K-best sub-haplotype finder that selects the best solutions out of a collection of sub-haplotype finders. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder { + + /** + * Collection of subFinders that provided the actual solutions. + */ + private final Collection subFinders; + + /** + * Flag indicating whether the sub-finders have been processed or not. + */ + private boolean processedSubFinders = false; + + /** + * Holds the number of k-best solution that this finder would ever return. + */ + private int count = 0; + + /** + * Holds the best {@code i} paths to the sink so far calculated where {@code i+1} is the length of this list. + * + *

As more results are requested the array will grow. All positions and solutions are + * calculated up to {@code i}

. + */ + private ArrayList rankedSubHaplotype; + + /** + * Priority queue with next best haplotype solution from each sub-finder; previous ones are + * already part {@link #rankedSubHaplotype}. + */ + private PriorityQueue nextBestSubHaplotypes; + + /** + * Creates a new aggregated sub-haplotype finder given its sub-finders. + * @param finders set of sub-finders. + */ + public AggregatedSubHaplotypeFinder(final Collection finders) { + if (finders == null) throw new IllegalArgumentException("finder collection cannot be null"); + this.subFinders = finders; + } + + @Override + public int getCount() { + processSubFindersIfNeeded(); + return count; + } + + private void processSubFindersIfNeeded() { + if (processedSubFinders) return; + + long count = 0; + nextBestSubHaplotypes = new PriorityQueue<>(subFinders.size()); + for (final KBestSubHaplotypeFinder finder : subFinders) { + final int finderCount = finder.getCount(); + if (finderCount == 0) continue; + count += finderCount; + nextBestSubHaplotypes.add(new MyKBestHaplotypeResult(finder,0)); + } + + this.count = (int) Math.min(Integer.MAX_VALUE,count); + + rankedSubHaplotype = new ArrayList<>(10); + processedSubFinders = true; + } + + @Override + public KBestHaplotype getKBest(int k) { + if (k < 0) throw new IllegalArgumentException("k cannot be negative"); + processSubFindersIfNeeded(); + if (k >= count) throw new IllegalArgumentException("k cannot be equal or greater than the count"); + if (k < rankedSubHaplotype.size()) + return rankedSubHaplotype.get(k); + + rankedSubHaplotype.ensureCapacity(k+1); + for (int i = rankedSubHaplotype.size(); i <= k; i++) { + // since k < possibleHaplotypeCount is guarantee no to be empty. 
+ if (nextBestSubHaplotypes.isEmpty()) + throw new IllegalStateException("what the heck " + k + " " + count); + final MyKBestHaplotypeResult nextResult = nextBestSubHaplotypes.remove(); + nextResult.rank = i; + rankedSubHaplotype.add(nextResult); + final int subRank = nextResult.result.rank(); + + // if there is no further solution from the same child we cannot add another solution from that child. + if (subRank + 1 >= nextResult.subFinder.getCount()) + continue; + nextBestSubHaplotypes.add(new MyKBestHaplotypeResult(nextResult.subFinder, subRank + 1)); + } + return rankedSubHaplotype.get(k); + } + + /** + * Custom implementation of {@link KBestHaplotype} to encapsulate sub-finder results. + */ + private class MyKBestHaplotypeResult extends KBestHaplotype { + + private KBestSubHaplotypeFinder subFinder; + + private final KBestHaplotype result; + + private int rank; + + private MyKBestHaplotypeResult(final KBestSubHaplotypeFinder finder, final int rank) { + this.subFinder = finder; + this.result = finder.getKBest(rank); + this.rank = -1; + } + + @Override + public SeqGraph graph() { + return result.graph(); + } + + @Override + public int score() { + return result.score(); + } + + @Override + public boolean isReference() { + return result.isReference(); + } + + @Override + public int rank() { + return rank; + } + + @Override + protected SeqVertex head() { + return result.head(); + } + + @Override + protected KBestHaplotype tail() { + return result.tail(); + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java diff --git 
a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java new file mode 100644 index 000000000..36216bdd2 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java @@ -0,0 +1,715 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; +import org.jgrapht.EdgeFactory; +import org.jgrapht.graph.DefaultDirectedGraph; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.util.*; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 2/6/13 + */ +@Invariant("!this.isAllowingMultipleEdges()") +public class BaseGraph extends DefaultDirectedGraph { + protected final static Logger logger = Logger.getLogger(BaseGraph.class); + protected final int kmerSize; + + /** + * Construct a TestGraph with kmerSize + * @param kmerSize + */ + public BaseGraph(final int kmerSize, final EdgeFactory edgeFactory) { + super(edgeFactory); + + if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize); + this.kmerSize = kmerSize; + } + + /** + * How big of a kmer did we use to create this graph? 
+ * @return + */ + public int getKmerSize() { + return kmerSize; + } + + /** + * @param v the vertex to test + * @return true if this vertex is a reference node (meaning that it appears on the reference path in the graph) + */ + public boolean isReferenceNode( final V v ) { + if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } + + for ( final BaseEdge e : edgesOf(v) ) { + if ( e.isRef() ) { return true; } + } + + // edge case: if the graph only has one node then it's a ref node, otherwise it's not + return (vertexSet().size() == 1); + } + + /** + * @param v the vertex to test + * @return true if this vertex is a source node (in degree == 0) + */ + public boolean isSource( final V v ) { + if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } + return inDegreeOf(v) == 0; + } + + /** + * @param v the vertex to test + * @return true if this vertex is a sink node (out degree == 0) + */ + public boolean isSink( final V v ) { + if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } + return outDegreeOf(v) == 0; + } + + /** + * Get the set of source vertices of this graph + * @return a non-null set + */ + public Set getSources() { + final Set set = new LinkedHashSet(); + for ( final V v : vertexSet() ) + if ( isSource(v) ) + set.add(v); + return set; + } + + /** + * Get the set of sink vertices of this graph + * @return a non-null set + */ + public Set getSinks() { + final Set set = new LinkedHashSet(); + for ( final V v : vertexSet() ) + if ( isSink(v) ) + set.add(v); + return set; + } + + /** + * Pull out the additional sequence implied by traversing this node in the graph + * @param v the vertex from which to pull out the additional base sequence + * @return non-null byte array + */ + @Ensures({"result != null"}) + public byte[] getAdditionalSequence( final V v ) { + if( v == null ) { throw new IllegalArgumentException("Attempting to pull sequence from a 
null vertex."); } + return v.getAdditionalSequence(isSource(v)); + } + + /** + * @param v the vertex to test + * @return true if this vertex is a reference source + */ + public boolean isRefSource( final V v ) { + if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } + + // confirm that no incoming edges are reference edges + for ( final E edgeToTest : incomingEdgesOf(v) ) { + if ( edgeToTest.isRef() ) { return false; } + } + + // confirm that there is an outgoing reference edge + for ( final E edgeToTest : outgoingEdgesOf(v) ) { + if ( edgeToTest.isRef() ) { return true; } + } + + // edge case: if the graph only has one node then it's a ref sink, otherwise it's not + return (vertexSet().size() == 1); + } + + /** + * @param v the vertex to test + * @return true if this vertex is a reference sink + */ + public boolean isRefSink( final V v ) { + if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } + + // confirm that no outgoing edges are reference edges + for ( final E edgeToTest : outgoingEdgesOf(v) ) { + if ( edgeToTest.isRef() ) { return false; } + } + + // confirm that there is an incoming reference edge + for ( final E edgeToTest : incomingEdgesOf(v) ) { + if ( edgeToTest.isRef() ) { return true; } + } + + // edge case: if the graph only has one node then it's a ref source, otherwise it's not + return (vertexSet().size() == 1); + } + + /** + * @return the reference source vertex pulled from the graph, can be null if it doesn't exist in the graph + */ + public V getReferenceSourceVertex( ) { + for( final V v : vertexSet() ) { + if( isRefSource(v) ) { + return v; + } + } + return null; + } + + /** + * @return the reference sink vertex pulled from the graph, can be null if it doesn't exist in the graph + */ + public V getReferenceSinkVertex( ) { + for( final V v : vertexSet() ) { + if( isRefSink(v) ) { + return v; + } + } + return null; + } + + /** + * Traverse the graph and get the 
next reference vertex if it exists + * @param v the current vertex, can be null + * @return the next reference vertex if it exists + */ + public V getNextReferenceVertex( final V v ) { + if( v == null ) { return null; } + for( final E edgeToTest : outgoingEdgesOf(v) ) { + if( edgeToTest.isRef() ) { + return getEdgeTarget(edgeToTest); + } + } + return null; + } + + /** + * Traverse the graph and get the previous reference vertex if it exists + * @param v the current vertex, can be null + * @return the previous reference vertex if it exists + */ + public V getPrevReferenceVertex( final V v ) { + if( v == null ) { return null; } + for( final E edgeToTest : incomingEdgesOf(v) ) { + if( isReferenceNode(getEdgeSource(edgeToTest)) ) { + return getEdgeSource(edgeToTest); + } + } + return null; + } + + /** + * Does a reference path exist between the two vertices? + * @param fromVertex from this vertex, can be null + * @param toVertex to this vertex, can be null + * @return true if a reference path exists in the graph between the two vertices + */ + public boolean referencePathExists(final V fromVertex, final V toVertex) { + V v = fromVertex; + if( v == null ) { + return false; + } + v = getNextReferenceVertex(v); + if( v == null ) { + return false; + } + while( !v.equals(toVertex) ) { + v = getNextReferenceVertex(v); + if( v == null ) { + return false; + } + } + return true; + } + + /** + * Walk along the reference path in the graph and pull out the corresponding bases + * @param fromVertex starting vertex + * @param toVertex ending vertex + * @param includeStart should the starting vertex be included in the path + * @param includeStop should the ending vertex be included in the path + * @return byte[] array holding the reference bases, this can be null if there are no nodes between the starting and ending vertex (insertions for example) + */ + public byte[] getReferenceBytes( final V fromVertex, final V toVertex, final boolean includeStart, final boolean includeStop ) { + 
if( fromVertex == null ) { throw new IllegalArgumentException("Starting vertex in requested path cannot be null."); } + if( toVertex == null ) { throw new IllegalArgumentException("From vertex in requested path cannot be null."); } + + byte[] bytes = null; + V v = fromVertex; + if( includeStart ) { + bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v)); + } + v = getNextReferenceVertex(v); // advance along the reference path + while( v != null && !v.equals(toVertex) ) { + bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v)); + v = getNextReferenceVertex(v); // advance along the reference path + } + if( includeStop && v != null && v.equals(toVertex)) { + bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v)); + } + return bytes; + } + + /** + * Convenience function to add multiple vertices to the graph at once + * @param vertices one or more vertices to add + */ + public void addVertices(final V... vertices) { + for ( final V v : vertices ) + addVertex(v); + } + + /** + * Convenience function to add multiple vertices to the graph at once + * @param vertices one or more vertices to add + */ + public void addVertices(final Collection vertices) { + for ( final V v : vertices ) + addVertex(v); + } + + /** + * Convenience function to add multiple edges to the graph + * @param start the first vertex to connect + * @param remaining all additional vertices to connect + */ + public void addEdges(final V start, final V... remaining) { + V prev = start; + for ( final V next : remaining ) { + addEdge(prev, next); + prev = next; + } + } + + /** + * Convenience function to add multiple edges to the graph + * @param start the first vertex to connect + * @param remaining all additional vertices to connect + */ + public void addEdges(final E template, final V start, final V... remaining) { + V prev = start; + for ( final V next : remaining ) { + addEdge(prev, next, (E)(template.copy())); // TODO -- is there a better way to do this? 
+ prev = next; + } + } + + /** + * Get the set of vertices connected by outgoing edges of V + * @param v a non-null vertex + * @return a set of vertices connected by outgoing edges from v + */ + public Set outgoingVerticesOf(final V v) { + final Set s = new LinkedHashSet(); + for ( final E e : outgoingEdgesOf(v) ) { + s.add(getEdgeTarget(e)); + } + return s; + } + + /** + * Get the set of vertices connected to v by incoming edges + * @param v a non-null vertex + * @return a set of vertices {X} connected X -> v + */ + public Set incomingVerticesOf(final V v) { + final Set s = new LinkedHashSet(); + for ( final E e : incomingEdgesOf(v) ) { + s.add(getEdgeSource(e)); + } + return s; + } + + /** + * Get the set of vertices connected to v by incoming or outgoing edges + * @param v a non-null vertex + * @return a set of vertices {X} connected X -> v or v -> Y + */ + public Set neighboringVerticesOf(final V v) { + final Set s = incomingVerticesOf(v); + s.addAll(outgoingVerticesOf(v)); + return s; + } + + /** + * Print out the graph in the dot language for visualization + * @param destination File to write to + */ + public void printGraph(final File destination, final int pruneFactor) { + PrintStream stream = null; + + try { + stream = new PrintStream(new FileOutputStream(destination)); + printGraph(stream, true, pruneFactor); + } catch ( FileNotFoundException e ) { + throw new RuntimeException(e); + } finally { + if ( stream != null ) stream.close(); + } + } + + public void printGraph(final PrintStream graphWriter, final boolean writeHeader, final int pruneFactor) { + if ( writeHeader ) + graphWriter.println("digraph assemblyGraphs {"); + + for( final E edge : edgeSet() ) { + graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? 
"style=dotted,color=grey," : "") + "label=\"" + edge.getDotLabel() + "\"];"); + if( edge.isRef() ) { + graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); + } + } + + for( final V v : vertexSet() ) { +// graphWriter.println("\t" + v.toString() + " [label=\"" + v + "\",shape=box]"); + graphWriter.println("\t" + v.toString() + " [label=\"" + new String(getAdditionalSequence(v)) + v.additionalInfo() + "\",shape=box]"); + } + + if ( writeHeader ) + graphWriter.println("}"); + } + + /** + * Remove edges that are connected before the reference source and after the reference sink + * + * Also removes all vertices that are orphaned by this process + */ + public void cleanNonRefPaths() { + if( getReferenceSourceVertex() == null || getReferenceSinkVertex() == null ) { + return; + } + + // Remove non-ref edges connected before and after the reference path + final Set edgesToCheck = new HashSet(); + edgesToCheck.addAll(incomingEdgesOf(getReferenceSourceVertex())); + while( !edgesToCheck.isEmpty() ) { + final E e = edgesToCheck.iterator().next(); + if( !e.isRef() ) { + edgesToCheck.addAll( incomingEdgesOf(getEdgeSource(e)) ); + removeEdge(e); + } + edgesToCheck.remove(e); + } + + edgesToCheck.addAll(outgoingEdgesOf(getReferenceSinkVertex())); + while( !edgesToCheck.isEmpty() ) { + final E e = edgesToCheck.iterator().next(); + if( !e.isRef() ) { + edgesToCheck.addAll( outgoingEdgesOf(getEdgeTarget(e)) ); + removeEdge(e); + } + edgesToCheck.remove(e); + } + + removeSingletonOrphanVertices(); + } + + /** + * Prune all chains from this graph where any edge in the path has multiplicity < pruneFactor + * + * @see LowWeightChainPruner for more information + * + * @param pruneFactor all edges with multiplicity < this factor that aren't ref edges will be removed + */ + public void pruneLowWeightChains( final int pruneFactor ) { + final LowWeightChainPruner pruner = new LowWeightChainPruner<>(pruneFactor); + 
pruner.pruneLowWeightChains(this); + } + + /** + * Remove all vertices in the graph that have in and out degree of 0 + */ + public void removeSingletonOrphanVertices() { + // Run through the graph and clean up singular orphaned nodes + final List verticesToRemove = new LinkedList<>(); + for( final V v : vertexSet() ) { + if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 && !isRefSource(v) ) { + verticesToRemove.add(v); + } + } + removeAllVertices(verticesToRemove); + } + + /** + * Remove all vertices on the graph that cannot be accessed by following any edge, + * regardless of its direction, from the reference source vertex + */ + public void removeVerticesNotConnectedToRefRegardlessOfEdgeDirection() { + final HashSet toRemove = new HashSet<>(vertexSet()); + + final V refV = getReferenceSourceVertex(); + if ( refV != null ) { + for ( final V v : new BaseGraphIterator<>(this, refV, true, true) ) { + toRemove.remove(v); + } + } + + removeAllVertices(toRemove); + } + + /** + * Remove all vertices in the graph that aren't on a path from the reference source vertex to the reference sink vertex + * + * More aggressive reference pruning algorithm than removeVerticesNotConnectedToRefRegardlessOfEdgeDirection, + * as it requires vertices to not only be connected by a series of directed edges but also prunes away + * paths that do not also meet eventually with the reference sink vertex + */ + public void removePathsNotConnectedToRef() { + if ( getReferenceSourceVertex() == null || getReferenceSinkVertex() == null ) { + throw new IllegalStateException("Graph must have ref source and sink vertices"); + } + + // get the set of vertices we can reach by going forward from the ref source + final Set onPathFromRefSource = new HashSet<>(vertexSet().size()); + for ( final V v : new BaseGraphIterator<>(this, getReferenceSourceVertex(), false, true) ) { + onPathFromRefSource.add(v); + } + + // get the set of vertices we can reach by going backward from the ref sink + final Set 
onPathFromRefSink = new HashSet<>(vertexSet().size()); + for ( final V v : new BaseGraphIterator<>(this, getReferenceSinkVertex(), true, false) ) { + onPathFromRefSink.add(v); + } + + // we want to remove anything that's not in both the sink and source sets + final Set verticesToRemove = new HashSet<>(vertexSet()); + onPathFromRefSource.retainAll(onPathFromRefSink); + verticesToRemove.removeAll(onPathFromRefSource); + removeAllVertices(verticesToRemove); + + // simple sanity checks that this algorithm is working. + if ( getSinks().size() > 1 ) { + throw new IllegalStateException("Should have eliminated all but the reference sink, but found " + getSinks()); + } + + if ( getSources().size() > 1 ) { + throw new IllegalStateException("Should have eliminated all but the reference source, but found " + getSources()); + } + } + + /** + * Semi-lenient comparison of two graphs, truing true if g1 and g2 have similar structure + * + * By similar this means that both graphs have the same number of vertices, where each vertex can find + * a vertex in the other graph that's seqEqual to it. A similar constraint applies to the edges, + * where all edges in g1 must have a corresponding edge in g2 where both source and target vertices are + * seqEqual + * + * @param g1 the first graph to compare + * @param g2 the second graph to compare + * @param the type of the nodes in those graphs + * @return true if g1 and g2 are equals + */ + public static boolean graphEquals(final BaseGraph g1, BaseGraph g2) { + final Set vertices1 = g1.vertexSet(); + final Set vertices2 = g2.vertexSet(); + final Set edges1 = g1.edgeSet(); + final Set edges2 = g2.edgeSet(); + + if ( vertices1.size() != vertices2.size() || edges1.size() != edges2.size() ) + return false; + + for ( final T v1 : vertices1 ) { + boolean found = false; + for ( final T v2 : vertices2 ) + found = found || v1.getSequenceString().equals(v2.getSequenceString()); + if ( ! 
found ) return false; + } + + for( final E e1 : g1.edgeSet() ) { + boolean found = false; + for( E e2 : g2.edgeSet() ) { + if( g1.seqEquals(e1, e2, g2) ) { found = true; break; } + } + if( !found ) { return false; } + } + for( final E e2 : g2.edgeSet() ) { + boolean found = false; + for( E e1 : g1.edgeSet() ) { + if( g2.seqEquals(e2, e1, g1) ) { found = true; break; } + } + if( !found ) { return false; } + } + return true; + } + + // For use when comparing edges across graphs! + private boolean seqEquals( final E edge1, final E edge2, final BaseGraph graph2 ) { + return (this.getEdgeSource(edge1).seqEquals(graph2.getEdgeSource(edge2))) && (this.getEdgeTarget(edge1).seqEquals(graph2.getEdgeTarget(edge2))); + } + + + /** + * Get the incoming edge of v. Requires that there be only one such edge or throws an error + * @param v our vertex + * @return the single incoming edge to v, or null if none exists + */ + public E incomingEdgeOf(final V v) { + return getSingletonEdge(incomingEdgesOf(v)); + } + + /** + * Get the outgoing edge of v. Requires that there be only one such edge or throws an error + * @param v our vertex + * @return the single outgoing edge from v, or null if none exists + */ + public E outgoingEdgeOf(final V v) { + return getSingletonEdge(outgoingEdgesOf(v)); + } + + /** + * Helper function that gets the a single edge from edges, null if edges is empty, or + * throws an error is edges has more than 1 element + * @param edges a set of edges + * @return a edge + */ + @Requires("edges != null") + private E getSingletonEdge(final Collection edges) { + if ( edges.size() > 1 ) throw new IllegalArgumentException("Cannot get a single incoming edge for a vertex with multiple incoming edges " + edges); + return edges.isEmpty() ? 
null : edges.iterator().next(); + } + + /** + * Add edge between source -> target if none exists, or add e to an already existing one if present + * + * @param source source vertex + * @param target vertex + * @param e edge to add + */ + public void addOrUpdateEdge(final V source, final V target, final E e) { + final E prev = getEdge(source, target); + if ( prev != null ) { + prev.add(e); + } else { + addEdge(source, target, e); + } + } + + @Override + public String toString() { + return "BaseGraph{" + + "kmerSize=" + kmerSize + + '}'; + } + + /** + * Get the set of vertices within distance edges of source, regardless of edge direction + * + * @param source the source vertex to consider + * @param distance the distance + * @return a set of vertices within distance of source + */ + protected Set verticesWithinDistance(final V source, final int distance) { + if ( distance == 0 ) + return Collections.singleton(source); + + final Set found = new HashSet<>(); + found.add(source); + for ( final V v : neighboringVerticesOf(source) ) { + found.addAll(verticesWithinDistance(v, distance - 1)); + } + + return found; + } + + /** + * Get a graph containing only the vertices within distance edges of target + * @param target a vertex in graph + * @param distance the max distance + * @return a non-null graph + */ + public BaseGraph subsetToNeighbors(final V target, final int distance) { + if ( target == null ) throw new IllegalArgumentException("Target cannot be null"); + if ( ! 
containsVertex(target) ) throw new IllegalArgumentException("Graph doesn't contain vertex " + target); + if ( distance < 0 ) throw new IllegalArgumentException("Distance must be >= 0 but got " + distance); + + + final Set toKeep = verticesWithinDistance(target, distance); + final Set toRemove = new HashSet<>(vertexSet()); + toRemove.removeAll(toKeep); + + final BaseGraph result = (BaseGraph)clone(); + result.removeAllVertices(toRemove); + + return result; + } + + /** + * Get a subgraph of graph that contains only vertices within 10 edges of the ref source vertex + * @return a non-null subgraph of this graph + */ + public BaseGraph subsetToRefSource() { + return subsetToNeighbors(getReferenceSourceVertex(), 10); + } + + /** + * Checks whether the graph contains all the vertices in a collection. + * + * @param vertices the vertices to check. + * + * @throws IllegalArgumentException if {@code vertices} is {@code null}. + * + * @return {@code true} if all the vertices in the input collection are present in this graph. + * Also if the input collection is empty. Otherwise it returns {@code false}. 
+ */ + public boolean containsAllVertices(final Collection vertices) { + if (vertices == null) throw new IllegalArgumentException("the input vertices collection cannot be null"); + for (final V vertex : vertices) + if (!containsVertex(vertex)) return false; + return true; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphIterator.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphIterator.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphIterator.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphIterator.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java new file mode 100644 index 000000000..cf95f6a5a --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java @@ -0,0 +1,117 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import com.google.java.contract.Ensures; + +/** + * simple node class for storing kmer sequences + * + * User: ebanks, mdepristo + * Date: Mar 23, 2011 + */ +public class DeBruijnVertex extends BaseVertex { + private final static byte[][] sufficesAsByteArray = new byte[256][]; + static { + for ( int i = 0; i < sufficesAsByteArray.length; i++ ) + sufficesAsByteArray[i] = new byte[]{(byte)(i & 0xFF)}; + } + + public DeBruijnVertex( final byte[] sequence ) { + super(sequence); + } + + /** + * Get the kmer size for this DeBruijnVertex + * @return integer >= 1 + */ + @Ensures("result >= 1") + public int getKmerSize() { + return sequence.length; + } + + /** + * Get the string representation of the suffix of this DeBruijnVertex + * @return a non-null non-empty string + */ + @Ensures({"result != null", "result.length() >= 1"}) + public String getSuffixString() { + return new String(getSuffixAsArray()); + } + + /** + * Get the suffix byte of this DeBruijnVertex + * + * The suffix byte is simply the last byte of the kmer sequence, so if this is holding sequence ACT + * getSuffix would return T + * + * @return a byte + */ + public byte getSuffix() { + return sequence[getKmerSize() - 1]; + } + + /** + * Optimized version that returns a byte[] for the single byte suffix of this graph without allocating memory. + * + * Should not be modified + * + * @return a byte[] that contains 1 byte == getSuffix() + */ + @Ensures({"result != null", "result.length == 1", "result[0] == getSuffix()"}) + private byte[] getSuffixAsArray() { + return sufficesAsByteArray[getSuffix()]; + } + + /** + * {@inheritDoc} + */ + @Override + public byte[] getAdditionalSequence(boolean source) { + return source ? 
super.getAdditionalSequence(source) : getSuffixAsArray(); + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeadEndKBestSubHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeadEndKBestSubHaplotypeFinder.java new file mode 100644 index 000000000..ae270ed7b --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeadEndKBestSubHaplotypeFinder.java @@ -0,0 +1,80 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +/** + * Represents a trivial k-best sub haplotype finder with no solutions. + * + *

+ * <p>To be used at vertices that do not have any valid path to the requested sink vertices.</p>

+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +final class DeadEndKBestSubHaplotypeFinder implements KBestSubHaplotypeFinder { + + /** + * Sole instance of this class. + */ + public static DeadEndKBestSubHaplotypeFinder INSTANCE = new DeadEndKBestSubHaplotypeFinder(); + + /** + * Prevents instantiation of more than one instance; please use {@link #INSTANCE}. + */ + protected DeadEndKBestSubHaplotypeFinder() { + } + + @Override + public int getCount() { + return 0; + } + + @Override + public KBestHaplotype getKBest(int k) { + if (k < 0) + throw new IllegalArgumentException("k cannot be negative"); + else + throw new IllegalArgumentException("k cannot be equal or greater to the haplotype count"); + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java new file mode 100644 index 000000000..0e50ec02b --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java @@ -0,0 +1,147 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +/** + * Trivial k-best sub-haplotype finder where the source and sink vertex are the same one. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +class EmptyPathHaplotypeFinderNode implements KBestSubHaplotypeFinder { + + /** + * Caches the only solution returned by this finder. + */ + private final KBestHaplotype singleHaplotypePath; + + /** + * Constructs a new empty k-best haplotype finder. + * + * @param graph the search graph. + * @param vertex the source and sink vertex of the only solution returned by this finder. + */ + public EmptyPathHaplotypeFinderNode(final SeqGraph graph, final SeqVertex vertex) { + singleHaplotypePath = new MyBestHaplotypePath(graph,vertex); + } + + @Override + public int getCount() { + return 1; + } + + @Override + public KBestHaplotype getKBest(int k) { + if (k < 0) + throw new IllegalArgumentException("k cannot be negative"); + if (k > 0) + throw new IllegalArgumentException("k cannot greater than the possible haplotype count"); + return singleHaplotypePath; + } + + /** + * Custom extension of {@link KBestHaplotype} that implements the single solution behaviour. 
+ */ + private class MyBestHaplotypePath extends KBestHaplotype { + + /** + * The solution's only vertex. + */ + private final SeqVertex vertex; + + /** + * The search graph. + */ + private final SeqGraph graph; + + /** + * Whether the vertex is a reference vertex. + * + *

+ * <p>Initialize lazily.</p>

+ */ + private Boolean isReference; + + /** + * Constructs a new empty k-best haplotype solution. + * + * @param graph the search graph. + * @param vertex the source and sink vertex of the only solution returned by the outer finder. + */ + public MyBestHaplotypePath(final SeqGraph graph, final SeqVertex vertex) { + this.vertex = vertex; + this.graph = graph; + } + + @Override + public SeqGraph graph() { + return graph; + } + + @Override + public int score() { + return 0; + } + + @Override + public int rank() { + return 0; + } + + @Override + protected SeqVertex head() { + return vertex; + } + + @Override + protected KBestHaplotype tail() { + return null; + } + + @Override + public boolean isReference() { + return (isReference != null) ? isReference: (isReference = graph.isReferenceNode(vertex)); + } + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotype.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotype.java new file mode 100644 index 000000000..ca22f17ec --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotype.java @@ -0,0 +1,171 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made 
between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.utils.haplotype.Haplotype; + +/** + * Represents a result from a K-best haplotype search. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public abstract class KBestHaplotype implements Comparable { + + /** + * Returns the original graph searched. + * + * @return never {@code null} + */ + public abstract SeqGraph graph(); + + /** + * Returns the result haplotype score. + * + *

+ * <p>Currently, the score is the multiplicity total sum of edges along the haplotype path.</p>

+ * + * @return 0 or greater. + */ + public abstract int score(); + + /** + * Indicates whether this result is the reference haplotype. + * + * @return {@code true} if it is the reference haplotype, {@code false} otherwise. + */ + public abstract boolean isReference(); + + /** + * The rank of this solution within the list of solutions that resulted from the same search. + * + *

+ * <p>0 would correspond to the best solution, 1 to the second best, and so on.</p>

+ * + * @return 0 or greater. + */ + public abstract int rank(); + + private byte[] bases; + + private Haplotype haplotype; + + private Path path; + + /** + * Returns the result haplotype bases. + * + * @return never {@code null}. + */ + public byte[] bases() { + if (bases != null) return bases; + final KBestHaplotype tail = tail(); + final SeqVertex head = head(); + if (tail == null) + bases = head.getSequence(); + else { + final byte[] tailBases = tail.bases(); + final byte[] headBases = head.getSequence(); + final int length = tailBases.length + headBases.length; + bases = new byte[length]; + System.arraycopy(headBases,0,bases,0,headBases.length); + System.arraycopy(tailBases,0,bases,headBases.length,tailBases.length); + } + return bases; + } + + /** + * Returns the solution haplotype. + * + * @return never {@code null}. + */ + public Haplotype haplotype() { + if (haplotype != null) return haplotype; + haplotype = new Haplotype(bases(),isReference()); + haplotype.setScore(score()); + return haplotype; + } + + /** + * Returns the path across the original graph that correspond to the solution haplotype. + * + * @return never {@code null}, although perhaps a zero-length path (only one vertex). + */ + public Path path() { + if (path != null) return path; + final KBestHaplotype tail = tail(); + if (tail == null) + path = new Path<>(head(),graph()); + else { + final Path tailPath = tail.path(); + path = new Path<>(graph().getEdge(head(),tailPath.getFirstVertex()),tailPath); + } + return path; + } + + /** + * Compares k-best haplotypes based on the score where the one with larger score comes first (descending orther). + * + * @param other the other haplotype to compare to. + * @return {@code -1} if the current score is larger than {@code other}'s, {@code 0} if they are the same, {@code 1} + * if {@code other}'s score is larger. 
+ */ + public int compareTo(final KBestHaplotype other) { + if (other == null) throw new IllegalArgumentException("the other object cannot be null"); + return - 1 * (score() - other.score()); + } + + /** + * The first vertex on the haplotype path. + * + * @return never {@code null}. + */ + protected abstract SeqVertex head(); + + /** + * Returns the sub-haplotype from the second vertex involved in the haplotype until the end. + * + * @return {@code null} if there are no more vertices in the solution path a part from the one returned by {@link #head}. + */ + protected abstract KBestHaplotype tail(); +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinder.java new file mode 100644 index 000000000..f27cca12c --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinder.java @@ -0,0 +1,352 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.jgrapht.alg.CycleDetector; + +import java.util.*; + +/** + * Efficient algorithm to obtain the list of best haplotypes given the {@link SeqGraph} instance. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class KBestHaplotypeFinder extends AbstractList implements Iterable { + + /** + * The search graph. + */ + private final SeqGraph graph; + + /** + * Map of sub-haplotype finder by their source vertex. + */ + protected Map finderByVertex; + + /** + * Possible haplotype sink vertices. + */ + protected Set sinks; + + /** + * Possible haplotype source vertices. + */ + protected Set sources; + + /** + * The top finder. + * + *

If there is only a single source vertex, its finder is the top finder. However when there + * is more than one possible source, we create a composite finder that alternates between individual source vertices + * for their best haplotypes.

+ */ + private final KBestSubHaplotypeFinder topFinder; + + /** + * Constructs a new best haplotypes finder. + * + * @param graph the seq-graph to search. + * @param source the source vertex for all haplotypes. + * @param sink sink vertices for all haplotypes. + * + * @throws IllegalArgumentException if
    + *
  • any of {@code graph}, {@code source} or {@code sink} is {@code null} or
  • + *
  • either {@code source} or {@code sink} is not a vertex in {@code graph}.
  • + *
+ */ + public KBestHaplotypeFinder(final SeqGraph graph, final SeqVertex source, final SeqVertex sink) { + this(graph,Collections.singleton(source),Collections.singleton(sink)); + } + + /** + * Constructs a new best haplotypes finder. + * + * @param graph the seq-graph to search. + * @param sources source vertices for all haplotypes. + * @param sinks sink vertices for all haplotypes. + * + * @throws IllegalArgumentException if
    + *
  • any of {@code graph}, {@code sources} or {@code sinks} is {@code null} or
  • + *
  • any of {@code sources}' or any {@code sinks}' member is not a vertex in {@code graph}.
  • + *
+ */ + public KBestHaplotypeFinder(final SeqGraph graph, final Set sources, final Set sinks) { + if (graph == null) throw new IllegalArgumentException("graph cannot be null"); + if (sources == null) throw new IllegalArgumentException("source cannot be null"); + if (sinks == null) throw new IllegalArgumentException("sink cannot be null"); + if (!graph.containsAllVertices(sources)) throw new IllegalArgumentException("source does not belong to the graph"); + if (!graph.containsAllVertices(sinks)) throw new IllegalArgumentException("sink does not belong to the graph"); + + //TODO dealing with cycles here due to a bug in some of the graph transformations that produces cycles. + //TODO Once that is solve, the if-else below should be substituted by a throw if there is any cycles, + //TODO just the line commented out below if you want to trade early-bug-fail for speed. + //this.graph = graph; + this.graph = new CycleDetector<>(graph).detectCycles() ? removeCycles(graph,sources,sinks) : graph; + + finderByVertex = new HashMap<>(this.graph.vertexSet().size()); + this.sinks = sinks; + this.sources = sources; + if (sinks.size() == 0 || sources.size() == 0) + topFinder = DeadEndKBestSubHaplotypeFinder.INSTANCE; + else if (sources.size() == 1) + topFinder = createVertexFinder(sources.iterator().next()); + else + topFinder = createAggregatedFinder(); + } + + /** + * Constructs a new best haplotype finder. + *

+ * It will consider all source and sink vertices when looking for haplotypes. + *

+ * + * @param graph the seq-graph to search for the best haplotypes. + */ + public KBestHaplotypeFinder(SeqGraph graph) { + this(graph,graph.getSources(),graph.getSinks()); + } + + /** + * Creates an aggregated recursive finder to try all possible source vertices. + * + * @return never {@code null}. + */ + private KBestSubHaplotypeFinder createAggregatedFinder() { + final List sourceFinders = new ArrayList<>(sources.size()); + for (final SeqVertex source : sources) + sourceFinders.add(createVertexFinder(source)); + return new AggregatedSubHaplotypeFinder(sourceFinders); + } + + /** + * Removes edges that produces cycles and also dead vertices that do not lead to any sink vertex. + * + * @param original graph to modify. + * @param sources considered source vertices. + * @param sinks considered sink vertices. + * @return never {@code null}. + */ + private static SeqGraph removeCycles(final SeqGraph original, final Set sources, final Set sinks) { + final Set edgesToRemove = new HashSet<>(original.edgeSet().size()); + final Set vertexToRemove = new HashSet<>(original.vertexSet().size()); + + boolean foundSomePath = false; + for (final SeqVertex source : sources) + foundSomePath = findGuiltyVerticesAndEdgesToRemoveCycles(original, source, sinks, edgesToRemove, + vertexToRemove, new HashSet(original.vertexSet().size())) | foundSomePath; + + if (!foundSomePath) + throw new IllegalStateException("could not find any path from the source vertex to the sink vertex after removing cycles: " + + Arrays.toString(sources.toArray()) + " => " + Arrays.toString(sinks.toArray())); + + if (edgesToRemove.isEmpty() && vertexToRemove.isEmpty()) + throw new IllegalStateException("cannot find a way to remove the cycles"); + + final SeqGraph result = (SeqGraph) original.clone(); + result.removeAllEdges(edgesToRemove); + result.removeAllVertices(vertexToRemove); + return result; + } + + /** + * Recursive call that looks for edges and vertices that need to be removed to get rid of cycles. 
+ * + * @param graph the original graph. + * @param currentVertex current search vertex. + * @param sinks considered sink vertices. + * @param edgesToRemove collection of edges that need to be removed in order to get rid of cycles. + * @param verticesToRemove collection of vertices that can be removed. + * @param parentVertices collection of vertices that preceded the {@code currentVertex}; i.e. the it can be + * reached from those vertices using edges existing in {@code graph}. + * + * @return {@code true} to indicate that the some sink vertex is reachable by {@code currentVertex}, + * {@code false} otherwise. + */ + private static boolean findGuiltyVerticesAndEdgesToRemoveCycles(final SeqGraph graph, + final SeqVertex currentVertex, + final Set sinks, + final Set edgesToRemove, + final Set verticesToRemove, + final Set parentVertices) { + if (sinks.contains(currentVertex)) return true; + + final Set outgoingEdges = graph.outgoingEdgesOf(currentVertex); + boolean reachesSink = false; + parentVertices.add(currentVertex); + + for (final BaseEdge edge : outgoingEdges) { + final SeqVertex child = graph.getEdgeTarget(edge); + if (parentVertices.contains(child)) + edgesToRemove.add(edge); + else { + final boolean childReachSink = findGuiltyVerticesAndEdgesToRemoveCycles(graph, child, sinks, + edgesToRemove, verticesToRemove, parentVertices); + reachesSink = reachesSink || childReachSink; + } + } + parentVertices.remove(currentVertex); + if (!reachesSink) verticesToRemove.add(currentVertex); + return reachesSink; + } + + @Override + public KBestHaplotype get(int index) { + if (index < 0 || index >= size()) + throw new IndexOutOfBoundsException(); + return topFinder.getKBest(index); + } + + @Override + public Iterator iterator() { + return new Iterator() { + private int nextK = 0; + private final int maxK = topFinder.getCount(); + + + @Override + public boolean hasNext() { + return nextK < maxK; + } + + @Override + public KBestHaplotype next() { + if (nextK >= maxK) throw 
new NoSuchElementException(); + return topFinder.getKBest(nextK++); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + @Override + public int size() { + return topFinder.getCount(); + } + + /** + * Returns an iterator on the first k best haplotypes. + *

+ * It might return fewer than k haplotypes if the total number of possible haplotypes is smaller. + *

+ * + * @param k the maximum number of haplotypes to return. + * @return never {@code null}, but perhaps a iterator that return no haplotype. + */ + public Iterator iterator(final int k) { + + return new Iterator() { + private int nextK = 0; + private final int maxK = Math.min(size(), k); + + @Override + public boolean hasNext() { + return nextK < maxK; + } + + @Override + public KBestHaplotype next() { + if (nextK >= maxK) throw new NoSuchElementException(); + return topFinder.getKBest(nextK++); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + /** + * Creates a finder from a vertex. + * + * @param source the source vertex for the finder. + * + * @return never {@code null}, perhaps a finder that returns no haplotypes though. + */ + protected KBestSubHaplotypeFinder createVertexFinder(final SeqVertex source) { + KBestSubHaplotypeFinder node = finderByVertex.get(source); + if (node == null) { + if (sinks.contains(source)) + node = new EmptyPathHaplotypeFinderNode(graph,source); + else { + final Set outgoingEdges = graph.outgoingEdgesOf(source); + if (outgoingEdges.isEmpty()) + node = DeadEndKBestSubHaplotypeFinder.INSTANCE; + else { + final Map undeadChildren = createChildrenFinders(outgoingEdges); + node = undeadChildren.isEmpty() ? DeadEndKBestSubHaplotypeFinder.INSTANCE : + new RecursiveSubHaplotypeFinder(source,undeadChildren); + } + } + finderByVertex.put(source, node); + } + return node; + } + + /** + * Creates finder for target vertices of a collection of edges. + *

+ * This peculiar signature is convenient for when we want to create finders for the children of a vertex. + *

+ * + * @param baseEdges target collection of edges. + * + * @return never {@code null}, perhaps an empty map if there is no children with valid paths to any sink for this + * finder. + */ + private Map createChildrenFinders(Set baseEdges) { + final Map result = new LinkedHashMap<>(baseEdges.size()); + for (final BaseEdge edge : baseEdges) { + final KBestSubHaplotypeFinder targetFinder = createVertexFinder(graph.getEdgeTarget(edge)); + if (targetFinder.getCount() == 0) continue; + result.put(edge, targetFinder); + } + return result; + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestSubHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestSubHaplotypeFinder.java new file mode 100644 index 000000000..9c185b52c --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestSubHaplotypeFinder.java @@ -0,0 +1,71 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +/** + * Common interface for K-Best sub-haplotype finders. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +interface KBestSubHaplotypeFinder { + + /** + * Returns the total number of possible sub-haplotypes. + * @return 0 or greater. + */ + public abstract int getCount(); + + /** + * Return the k-best sub-haplotype solution. + * + * + * @param k the requested solution rank. + * @throws IllegalArgumentException if {@code k} is outside bounds [0 .. {@link #getCount()}). + * + * @return never {@code null}. + */ + public abstract KBestHaplotype getKBest(int k); +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KmerSearchableGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KmerSearchableGraph.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KmerSearchableGraph.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KmerSearchableGraph.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdge.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdge.java similarity index 100% rename from 
protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdge.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdge.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java new file mode 100644 index 000000000..e6f460d1a --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java @@ -0,0 +1,371 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import com.google.java.contract.Ensures; +import net.sf.samtools.Cigar; +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.utils.sam.CigarUtils; + +import java.util.*; + +/** + * A path thought a BaseGraph + * + * class to keep track of paths + * + * User: depristo + * Date: 3/19/13 + * Time: 2:34 PM + * + */ +public class Path { + + // the last vertex seen in the path + protected final T lastVertex; + + // the list of edges comprising the path + private Set edgesAsSet = null; + protected final ArrayList edgesInOrder; + + // the scores for the path + protected final int totalScore; + + // the graph from which this path originated + protected final BaseGraph graph; + + /** + * Create a new Path containing no edges and starting at initialVertex + * @param initialVertex the starting vertex of the path + * @param graph the graph this path will follow through + */ + public Path(final T initialVertex, final BaseGraph graph) { + if ( initialVertex == null ) throw new IllegalArgumentException("initialVertex cannot be null"); + if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); + if ( ! graph.containsVertex(initialVertex) ) throw new IllegalArgumentException("Vertex " + initialVertex + " must be part of graph " + graph); + + lastVertex = initialVertex; + edgesInOrder = new ArrayList<>(0); + totalScore = 0; + this.graph = graph; + } + + /** + * Convenience constructor for testing that creates a path through vertices in graph + */ + protected static Path makePath(final List vertices, final BaseGraph graph) { + Path path = new Path(vertices.get(0), graph); + for ( int i = 1; i < vertices.size(); i++ ) + path = new Path(path, graph.getEdge(path.lastVertex, vertices.get(i))); + return path; + } + + /** + * Create a new path with the same field values. + * + * @param p the template path. 
+ * + * @throws NullPointerException if {@code p} is {@code null}. + */ + protected Path(final Path p) { + this.edgesInOrder = p.edgesInOrder; + this.lastVertex = p.lastVertex; + this.edgesAsSet = p.edgesAsSet; + this.totalScore = p.totalScore; + this.graph = p.graph; + } + + /** + * Create a new Path extending p with edge + * + * @param p the path to extend. + * @param edge the edge to extend path with. + * + * @throws IllegalArgumentException if {@code p} or {@code edge} are {@code null}, or {@code edge} is + * not part of {@code p}'s graph, or {@code edge} does not have as a source the last vertex in {@code p}. + */ + public Path(final Path p, final E edge) { + if ( p == null ) throw new IllegalArgumentException("Path cannot be null"); + if ( edge == null ) throw new IllegalArgumentException("Edge cannot be null"); + if ( ! p.graph.containsEdge(edge) ) throw new IllegalArgumentException("Graph must contain edge " + edge + " but it doesn't"); + if ( ! p.graph.getEdgeSource(edge).equals(p.lastVertex) ) { throw new IllegalStateException("Edges added to path must be contiguous."); } + + graph = p.graph; + lastVertex = p.graph.getEdgeTarget(edge); + edgesInOrder = new ArrayList<>(p.length() + 1); + edgesInOrder.addAll(p.edgesInOrder); + edgesInOrder.add(edge); + totalScore = p.totalScore + edge.getMultiplicity(); + } + + /** + * Length of the path in edges. + * + * @return {@code 0} or greater. + */ + public int length() { + return edgesInOrder.size(); + } + + /** + * Prepend a path with an edge. + * + * @param edge the extending edge. + * @param p the original path. + * + * @throws IllegalArgumentException if {@code p} or {@code edge} are {@code null}, or {@code edge} is + * not part of {@code p}'s graph, or {@code edge} does not have as a target the first vertex in {@code p}. 
+ */ + public Path(final E edge, final Path p) { + if ( p == null ) throw new IllegalArgumentException("Path cannot be null"); + if ( edge == null ) throw new IllegalArgumentException("Edge cannot be null"); + if ( ! p.graph.containsEdge(edge) ) throw new IllegalArgumentException("Graph must contain edge " + edge + " but it doesn't"); + if ( ! p.graph.getEdgeTarget(edge).equals(p.getFirstVertex())) { throw new IllegalStateException("Edges added to path must be contiguous."); } + graph = p.graph; + lastVertex = p.lastVertex; + edgesInOrder = new ArrayList<>(p.length() + 1); + edgesInOrder.add(edge); + edgesInOrder.addAll(p.getEdges()); + totalScore = p.totalScore + edge.getMultiplicity(); + } + + /** + * Get the collection of edges leaving the last vertex of this path + * @return a non-null collection + */ + public Collection getOutgoingEdgesOfLastVertex() { + return getGraph().outgoingEdgesOf(getLastVertex()); + } + + /** + * Does this path contain the given edge + * @param edge the given edge to test + * @return true if the edge is found in this path + */ + public boolean containsEdge( final E edge ) { + if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); } + if ( edgesInOrder.isEmpty() ) return false; + + // initialize contains cache if necessary + if ( edgesAsSet == null ) edgesAsSet = new HashSet(edgesInOrder); + return edgesAsSet.contains(edge); + } + + /** + * Does this path contain the given vertex? + * + * @param v a non-null vertex + * @return true if v occurs within this path, false otherwise + */ + public boolean containsVertex(final T v) { + if ( v == null ) throw new IllegalArgumentException("Vertex cannot be null"); + + // TODO -- warning this is expensive. Need to do vertex caching + return getVertices().contains(v); + } + + /** + * Checks whether a given path is a suffix of this path. + * + * @param other the path to compare against. 
+ * @throws IllegalArgumentException if other is null, or the come from + * different graphs. + * @return true if other is a suffix of this path. + */ + public boolean isSuffix(final Path other) { + if ( other == null ) throw new IllegalArgumentException("path cannot be null"); + if (other.getGraph() != this.getGraph()) throw new IllegalArgumentException("the other path most belong to the same path"); + if (!lastVertex.equals(other.lastVertex)) + return false; + final ListIterator myIt = edgesInOrder.listIterator(edgesInOrder.size()); + final ListIterator otherIt = other.edgesInOrder.listIterator(other.edgesInOrder.size()); + while (myIt.hasPrevious() && otherIt.hasPrevious()) + if (otherIt.previous() != myIt.previous()) + return false; + return !otherIt.hasPrevious(); + } + + /** + * Check that two paths have the same edges and total score + * @param path the other path we might be the same as + * @return true if this and path are the same + */ + protected boolean pathsAreTheSame(Path path) { + return totalScore == path.totalScore && edgesInOrder.equals(path.edgesInOrder); + } + + @Override + public String toString() { + final StringBuilder b = new StringBuilder("Path{score=" + totalScore + ", path="); + boolean first = true; + for ( final T v : getVertices() ) { + if ( first ) + first = false; + else + b.append(" -> "); + b.append(v.getSequenceString()); + } + b.append('}'); + return b.toString(); + } + + /** + * Get the graph of this path + * @return a non-null graph + */ + @Ensures("result != null") + public BaseGraph getGraph() { + return graph; + } + + /** + * Get the edges of this path in order + * @return a non-null list of edges + */ + @Ensures("result != null") + public List getEdges() { return edgesInOrder; } + + /** + * Get the list of vertices in this path in order defined by the edges of the path + * @return a non-null, non-empty list of vertices + */ + @Ensures({"result != null", "!result.isEmpty()"}) + public List getVertices() { + if ( 
getEdges().isEmpty() ) + return Collections.singletonList(lastVertex); + else { + final LinkedList vertices = new LinkedList(); + boolean first = true; + for ( final E e : getEdges() ) { + if ( first ) { + vertices.add(graph.getEdgeSource(e)); + first = false; + } + vertices.add(graph.getEdgeTarget(e)); + } + return vertices; + } + } + + /** + * Get the total score of this path (bigger is better) + * @return a positive integer + */ + @Ensures("result >= 0") + public int getScore() { return totalScore; } + + /** + * Get the final vertex of the path + * @return a non-null vertex + */ + @Ensures("result != null") + public T getLastVertex() { return lastVertex; } + + /** + * Get the first vertex in this path + * @return a non-null vertex + */ + public T getFirstVertex() { + if (edgesInOrder.size() == 0) { + return lastVertex; + } else { + return getGraph().getEdgeSource(edgesInOrder.get(0)); + } + } + + /** + * The base sequence for this path. Pull the full sequence for source nodes and then the suffix for all subsequent nodes + * @return non-null sequence of bases corresponding to this path + */ + @Ensures({"result != null"}) + public byte[] getBases() { + if( getEdges().isEmpty() ) { return graph.getAdditionalSequence(lastVertex); } + + byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edgesInOrder.get(0))); + for( final E e : edgesInOrder ) { + bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e))); + } + return bases; + } + + /** + * Calculate the cigar elements for this path against the reference sequence + * + * @param refSeq the reference sequence that all of the bases in this path should align to + * @return a Cigar mapping this path to refSeq, or null if no reasonable alignment could be found + */ + public Cigar calculateCigar(final byte[] refSeq) { + return CigarUtils.calculateCigar(refSeq,getBases()); + } + + /** + * Tests that this and other have the same score and vertices in the same order with the same seq + * 
@param other the other path to consider. Cannot be null + * @return true if this and path are equal, false otherwise + */ + public boolean equalScoreAndSequence(final Path other) { + if ( other == null ) throw new IllegalArgumentException("other cannot be null"); + return getScore() == other.getScore() && equalSequence(other); + } + + /** + * Tests that this and other have the same vertices in the same order with the same seq + * @param other the other path to consider. Cannot be null + * @return true if this and path are equal, false otherwise + */ + public boolean equalSequence(final Path other) { + final List mine = getVertices(); + final List yours = other.getVertices(); + if ( mine.size() == yours.size() ) { // hehehe + for ( int i = 0; i < mine.size(); i++ ) + if ( ! mine.get(i).seqEquals(yours.get(i)) ) + return false; + } + return true; + } + +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RecursiveSubHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RecursiveSubHaplotypeFinder.java new file mode 100644 index 000000000..0fbbfdc64 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RecursiveSubHaplotypeFinder.java @@ -0,0 +1,174 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Map; + +/** +* General recursive sub-haplotype finder. +*

+* Provides the k-best sub-haplotypes from a vertex, given a map between outgoing edges and their target finders +*

+*

+* This is done efficiently by keeping a priority-queue of best sub-haplotype solutions and pulling them on demand +* as needed. +*

+*

+* Solutions are cached for repeated retrieval so that we save compute at vertices that share sub-haplotypes +* (share descendant vertices). This aspect is controlled by {@link KBestSubHaplotypeFinder} that instantiates +* a unique {@link KBestSubHaplotypeFinder} for each vertex in the graph that belongs to a valid path +* between the source and sink node. +*

+* +* @author Valentin Ruano-Rubio <valentin@broadinstitute.org> +*/ +class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder { + + /** + * Creates a recursive sub-haplotype finder give the target graph, first vertex and all possible outgoing edges + * with the corresponding sub-sub-haplotype finders. + * + *

For efficiency's sake, it will not verify the content of {@code children} map; i.e. that indeed all keys + * are outgoing edges from {@code vertex} on {@code graph} and that the value sub-haplotype resolver has as + * the first vertex the adjacent vertex through that key edge.

+ * + * @param vertex first vertex for all sub-haplotype solutions provided by this finder + * @param children map from outgoing edge to the corresponding sub-sub-haplotype finder. + */ + public RecursiveSubHaplotypeFinder(final SeqVertex vertex, + final Map children) { + super(createChildFinderCollection(vertex, children)); + } + + private static Collection createChildFinderCollection(final SeqVertex vertex, final Map finders) { + if (finders == null) throw new IllegalArgumentException("the edge to child map cannot be null"); + final Collection result = new ArrayList<>(finders.size()); + for (final Map.Entry e : finders.entrySet()) + result.add(new EdgeSubHaplotypeFinder(vertex,e.getKey(), e.getValue())); + return result; + } + + private static class EdgeSubHaplotypeFinder implements KBestSubHaplotypeFinder { + + private final KBestSubHaplotypeFinder childFinder; + + private final SeqVertex vertex; + + private final BaseEdge edge; + + private EdgeSubHaplotypeFinder(final SeqVertex vertex, final BaseEdge edge, final KBestSubHaplotypeFinder childFinder) { + this.childFinder = childFinder; + this.edge = edge; + this.vertex = vertex; + } + + @Override + public int getCount() { + return childFinder.getCount(); + } + + @Override + public KBestHaplotype getKBest(int k) { + return new ChildKBestSubHaplotype(vertex,edge,childFinder.getKBest(k)); + } + } + + /** + * Custom extension of the {@link KBestHaplotype} used for solutions generated by this class. + * + *

+ * These work by delegating to the encapsulated solution from the outgoing edge's finder, adding + * the edge score and prefixing this outer finder's + * source vertex. + *

+ */ + private static class ChildKBestSubHaplotype extends KBestHaplotype { + + private final int score; + private final KBestHaplotype child; + private final SeqVertex vertex; + private final boolean isReference; + + public ChildKBestSubHaplotype(final SeqVertex vertex, final BaseEdge edge, final KBestHaplotype child) { + this.score = edge.getMultiplicity() + child.score(); + this.vertex = vertex; + this.child = child; + this.isReference = edge.isRef() && child.isReference(); + } + + @Override + public SeqGraph graph() { + return child.graph(); + } + + @Override + public int score() { + return score; + } + + @Override + public int rank() { + return child.rank(); + } + + @Override + protected SeqVertex head() { + return vertex; + } + + @Override + protected KBestHaplotype tail() { + return child; + } + + @Override + public boolean isReference() { + return isReference; + } + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Route.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Route.java new file mode 100644 index 000000000..4eeb18eb6 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Route.java @@ -0,0 +1,285 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + + +import java.util.List; +import java.util.ListIterator; + +/** + * Represents a route or path through a graph. + *

+ * In contrast with a {@link Path}, a route keeps track of the + * path taken at furcations in order to speed up some path comparisons like the + * one implemented by {@link #isSuffix}. + *

+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class Route extends Path { + + protected final Route previousRouteWithLastVertexThatIsForkOrJoin; + protected final boolean lastVertexIsForkOrJoin; + + /** + * Create a zero length route with a start in a particular vertex: + * + * @param initialVertex the first vertex of the route. + * @param graph the new route's graph. + * + * @throws IllegalArgumentException if {@code initialVertex} or {@code graph} are {@code null}. + * or if {@code initialVertex} does not belong to {@code graph}. + */ + public Route(final V initialVertex, final BaseGraph graph) { + super(initialVertex, graph); + previousRouteWithLastVertexThatIsForkOrJoin = null; + lastVertexIsForkOrJoin = graph.inDegreeOf(initialVertex) > 1; + } + + @Override + public boolean equals(final Object other) { + if (other == null) return false; + if (other == this) return true; + if (! (other instanceof Route)) return false; + @SuppressWarnings("unchecked") + final Route otherRoute = (Route) other; + return otherRoute.length() == this.length() && isSuffix(otherRoute); + } + + /** + * Extends a route into a new instance. + * + * @param prefix the route to extend. + * @param nextVertex the vertex to extend the route to. + * + * @throws IllegalArgumentException if {@code prefix} is {@code null} or {@code nextVertex} is {@code null} + * or {@code nextVertex} does not belong to {@code prefix}'s graph or there is no edge that in the graph + * that would connect {@code prefix}'s last vertex with {@code nextVertex} directly. + */ + public Route(final Route prefix, final V nextVertex) { + this(prefix,resolveSuffixEdge(prefix,nextVertex)); + } + + + /** + * Extends a route into a new instance. + * + * @param prevVertex the vertex to extend the route to. + * @param suffix the route to extend. 
+ * + * @throws IllegalArgumentException if {@code suffix} is {@code null} or {@code prevVertex} is {@code null} + * or {@code prevVertex} does not belong to {@code suffix}'s graph or there is no edge that in the graph + * that would connect {@code suffix}'s first vertex with {@code prevVertex} directly. + */ + public Route(final V prevVertex, final Route suffix) { + this(resolvePrefixEdge(prevVertex, suffix),suffix); + } + + /** + * Resolves the prefix edge as required by {@link Route(V,Route)}. + */ + private static E resolvePrefixEdge(final V prevVertex, final Route suffix) { + if (prevVertex == null) throw new NullPointerException(); + if (!suffix.getGraph().containsVertex(prevVertex)) throw new IllegalArgumentException(); + final E result = suffix.getGraph().getEdge(prevVertex,suffix.getFirstVertex()); + if (result == null) + throw new IllegalArgumentException("there is no such edge in the graph"); + return result; + } + + /** + * Resolves the suffix edge as required by {@link Route(Route,V)} + */ + private static E resolveSuffixEdge(final Route prefix, final V nextVertex) { + if (nextVertex == null) throw new IllegalArgumentException(); + if (!prefix.getGraph().containsVertex(nextVertex)) throw new IllegalArgumentException(); + final E result = prefix.getGraph().getEdge(prefix.getLastVertex(),nextVertex); + if (result == null) + throw new IllegalArgumentException("there is no such edge in the graph"); + return result; + } + + /** + * Extends a route by prefixing an edge. + * + * @param initialEdge the extending edge. + * @param suffix the original path. + * + * @throws IllegalArgumentException if {@code suffix} or {@code initialEdge} are {@code null}, or {@code initialEdge} is + * not part of {@code suffix}'s graph, or {@code initialEdge} does not have as a target the first vertex in {@code suffix}. 
+ */ + public Route(final E initialEdge, final Route suffix) { + super(initialEdge,suffix); + final V firstVertex = getFirstVertex(); + if(suffix.length() == 0) { + lastVertexIsForkOrJoin = suffix.lastVertexIsForkOrJoin || graph.outDegreeOf(firstVertex) > 1; + previousRouteWithLastVertexThatIsForkOrJoin = graph.inDegreeOf(firstVertex) > 1 ? new Route<>(firstVertex,graph) : null; + } else { + lastVertexIsForkOrJoin = suffix.lastVertexIsForkOrJoin; + if (suffix.previousRouteWithLastVertexThatIsForkOrJoin != null) + previousRouteWithLastVertexThatIsForkOrJoin = new Route<>(initialEdge,suffix.previousRouteWithLastVertexThatIsForkOrJoin); + else + previousRouteWithLastVertexThatIsForkOrJoin = graph.outDegreeOf(firstVertex) > 1 ? + new Route<>(new Route<>(firstVertex,graph),edgesInOrder.get(0)) : + graph.inDegreeOf(firstVertex) > 1 ? new Route<>(firstVertex,graph) : null; + } + } + + /** + * Create copy of an existing route. + * @param route the route to copy + * + * @throws NullPointerException if {@code route} is {@code null}. + */ + protected Route(final Route route) { + super(route); + lastVertexIsForkOrJoin = route.lastVertexIsForkOrJoin; + previousRouteWithLastVertexThatIsForkOrJoin = route.previousRouteWithLastVertexThatIsForkOrJoin; + } + + /** + * Create a new Route extending another one with an edge + * + * @param route the route to extend. + * @param edge the edge to extend the route with. + * + * @throws IllegalArgumentException if {@code route} or {@code edge} are {@code null}, or {@code edge} is + * not part of {@code route}'s graph, or {@code edge} does not have as a source the last vertex in {@code route}. + */ + public Route(final Route route, final E edge) { + super(route, edge); + lastVertexIsForkOrJoin = graph.outDegreeOf(route.lastVertex) > 1 || graph.inDegreeOf(lastVertex) > 1; + previousRouteWithLastVertexThatIsForkOrJoin = route.lastVertexIsForkOrJoin ? 
route : route.previousRouteWithLastVertexThatIsForkOrJoin; + } + + @Override + public boolean isSuffix(final Path other) { + if (other == this) + return true; + else if (other == null) + throw new IllegalArgumentException("other path must not be null"); + else if (getGraph() != other.getGraph()) + throw new IllegalArgumentException("other path must be part of the same graph"); + else if (other instanceof Route) + return isRouteSuffix((Route)other); + else + return super.isSuffix(other); + } + + @Override + public String toString() { + return super.toString().replace("Path{", "Route{"); + } + + /** + * Faster version when comparing with a route. + */ + protected boolean isRouteSuffix(final Route other) { + if (other.getGraph() != this.getGraph()) + throw new IllegalArgumentException("you cannot compare routes on different graphs"); + else if (lastVertex != other.lastVertex) // obvious case. + return false; + else if (this.previousRouteWithLastVertexThatIsForkOrJoin == null + && other.previousRouteWithLastVertexThatIsForkOrJoin != null) // I am shorter or different path for sure. + return false; + else if (this.edgesInOrder.size() < other.edgesInOrder.size()) // I am shorter regardless of path, no way Jose! 
+ return false; + else if (this.previousRouteWithLastVertexThatIsForkOrJoin == null || other.previousRouteWithLastVertexThatIsForkOrJoin == null) { + final ListIterator myEdges = edgesInOrder.listIterator(edgesInOrder.size()); + final ListIterator otherEdges = other.edgesInOrder.listIterator(other.edgesInOrder.size()); + while (otherEdges.hasPrevious()) + if (myEdges.previous() != otherEdges.previous()) + return false; + return true; + } else + return (other.previousRouteWithLastVertexThatIsForkOrJoin == this.previousRouteWithLastVertexThatIsForkOrJoin) + || (previousRouteWithLastVertexThatIsForkOrJoin.lastVertex == other.previousRouteWithLastVertexThatIsForkOrJoin.lastVertex + && previousRouteWithLastVertexThatIsForkOrJoin.isRouteSuffix(other.previousRouteWithLastVertexThatIsForkOrJoin)); + } + + /** + * Checks whether the last vertex in the route is a fork or a joining vertex. + * @return {@code true} iff so. + */ + public boolean lastVertexIsForkOrJoin() { + return lastVertexIsForkOrJoin; + } + + /** + * Returns the longest prefix route that has as a last vertex a join or furcation vertex. + * + * @return never {@code null}. + */ + public Route getPrefixRouteWithLastVertexThatIsForkOrJoin() { + return previousRouteWithLastVertexThatIsForkOrJoin; + } + + + + /** + * Splice out the first few vertices of the route. + * + * @param length how many vertices to splice out + * @return a new route without those spliced vertices. + * + * @throws IllegalArgumentException if {@code length} is equal to the route's length or greater or if it is negative. + * Notice that non-vertex route are no legal routes. 
+ */ + public Route splicePrefix(final int length) { + if (length == 0) + return this; + if (length >= length()) + throw new IllegalArgumentException("prefix slicing to long"); + if (length < 0) + throw new IllegalArgumentException("prefix cannot be negative"); + + final List resultEdges = getEdges().subList(length,length()); + Route result = new Route<>(graph.getEdgeSource(resultEdges.get(0)),graph); + for (final E edge : resultEdges) + result = new Route<>(result,edge); + return result; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteFinder.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteFinder.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteFinder.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/TestGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/TestGraph.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/TestGraph.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/TestGraph.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/VertexOrder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/VertexOrder.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/VertexOrder.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/VertexOrder.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraph.java new file mode 100644 index 000000000..c696c50ae --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraph.java @@ -0,0 +1,526 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +import com.google.java.contract.Ensures; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.smithwaterman.*; +import org.jgrapht.EdgeFactory; + +import java.util.*; + +public abstract class DanglingChainMergingGraph extends BaseGraph { + + private static final int MAX_CIGAR_COMPLEXITY = 3; + private static final int MIN_DANGLING_TAIL_LENGTH = 5; // SNP + 3 stabilizing nodes + the LCA + private static final int MAXIMUM_MISMATCHES_IN_DANGLING_HEAD_MERGE = 1; + + protected boolean alreadyBuilt; + + /** + * Create a new ReadThreadingAssembler using kmerSize for matching + * @param kmerSize must be >= 1 + */ + protected DanglingChainMergingGraph(final int kmerSize, final EdgeFactory edgeFactory) { + super(kmerSize, edgeFactory); + } + + /** + * Edge factory that encapsulates the numPruningSamples assembly parameter + */ + protected static class MyEdgeFactory implements EdgeFactory { + final int numPruningSamples; + + public MyEdgeFactory(int numPruningSamples) { + this.numPruningSamples = numPruningSamples; + } + + @Override + public MultiSampleEdge createEdge(final MultiDeBruijnVertex sourceVertex, final MultiDeBruijnVertex targetVertex) { + return new MultiSampleEdge(false, 1, numPruningSamples); + } + + public MultiSampleEdge createEdge(final boolean isRef, final int multiplicity) { + return new MultiSampleEdge(isRef, multiplicity, numPruningSamples); + } + + } + + /** + * Class to keep track of the important dangling chain merging data + */ + protected static final class DanglingChainMergeHelper { + final List danglingPath, referencePath; + final byte[] danglingPathString, referencePathString; + final Cigar cigar; + + public DanglingChainMergeHelper(final 
List danglingPath, + final List referencePath, + final byte[] danglingPathString, + final byte[] referencePathString, + final Cigar cigar) { + this.danglingPath = danglingPath; + this.referencePath = referencePath; + this.danglingPathString = danglingPathString; + this.referencePathString = referencePathString; + this.cigar = cigar; + } + } + + /** + * Try to recover dangling tails + * + * @param pruneFactor the prune factor to use in ignoring chain pieces + */ + public void recoverDanglingTails(final int pruneFactor) { + if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built"); + + int attempted = 0; + int nRecovered = 0; + for ( final MultiDeBruijnVertex v : vertexSet() ) { + if ( outDegreeOf(v) == 0 && ! isRefSink(v) ) { + attempted++; + nRecovered += recoverDanglingTail(v, pruneFactor); + } + } + + logger.debug("Recovered " + nRecovered + " of " + attempted + " dangling tails"); + } + + /** + * Try to recover dangling heads + * + * @param pruneFactor the prune factor to use in ignoring chain pieces + */ + public void recoverDanglingHeads(final int pruneFactor) { + if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingHeads requires the graph be already built"); + + // we need to build a list of dangling heads because that process can modify the graph (and otherwise generate + // a ConcurrentModificationException if we do it while iterating over the vertexes) + final List danglingHeads = new ArrayList<>(); + + int attempted = 0; + int nRecovered = 0; + for ( final MultiDeBruijnVertex v : vertexSet() ) { + if ( inDegreeOf(v) == 0 && ! 
isRefSource(v) ) + danglingHeads.add(v); + } + + // now we can try to recover the dangling heads + for ( final MultiDeBruijnVertex v : danglingHeads ) { + attempted++; + nRecovered += recoverDanglingHead(v, pruneFactor); + } + + logger.debug("Recovered " + nRecovered + " of " + attempted + " dangling heads"); + } + + /** + * Attempt to attach vertex with out-degree == 0 to the graph + * + * @param vertex the vertex to recover + * @param pruneFactor the prune factor to use in ignoring chain pieces + * @return 1 if we successfully recovered the vertex and 0 otherwise + */ + protected int recoverDanglingTail(final MultiDeBruijnVertex vertex, final int pruneFactor) { + if ( outDegreeOf(vertex) != 0 ) throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0"); + + // generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths + final DanglingChainMergeHelper danglingTailMergeResult = generateCigarAgainstDownwardsReferencePath(vertex, pruneFactor); + + // if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path + if ( danglingTailMergeResult == null || ! 
cigarIsOkayToMerge(danglingTailMergeResult.cigar, false, true) ) + return 0; + + // merge + return mergeDanglingTail(danglingTailMergeResult); + } + + /** + * Attempt to attach vertex with in-degree == 0, or a vertex on its path, to the graph + * + * @param vertex the vertex to recover + * @param pruneFactor the prune factor to use in ignoring chain pieces + * @return 1 if we successfully recovered a vertex and 0 otherwise + */ + protected int recoverDanglingHead(final MultiDeBruijnVertex vertex, final int pruneFactor) { + if ( inDegreeOf(vertex) != 0 ) throw new IllegalStateException("Attempting to recover a dangling head for " + vertex + " but it has in-degree > 0"); + + // generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths + final DanglingChainMergeHelper danglingHeadMergeResult = generateCigarAgainstUpwardsReferencePath(vertex, pruneFactor); + + // if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path + if ( danglingHeadMergeResult == null || ! 
cigarIsOkayToMerge(danglingHeadMergeResult.cigar, true, false) ) + return 0; + + // merge + return mergeDanglingHead(danglingHeadMergeResult); + } + + /** + * Determine whether the provided cigar is okay to merge into the reference path + * + * @param cigar the cigar to analyze + * @param requireFirstElementM if true, require that the first cigar element be an M operator in order for it to be okay + * @param requireLastElementM if true, require that the last cigar element be an M operator in order for it to be okay + * @return true if it's okay to merge, false otherwise + */ + protected boolean cigarIsOkayToMerge(final Cigar cigar, final boolean requireFirstElementM, final boolean requireLastElementM) { + + final List elements = cigar.getCigarElements(); + final int numElements = elements.size(); + + // don't allow more than a couple of different ops + if ( numElements == 0 || numElements > MAX_CIGAR_COMPLEXITY ) + return false; + + // the first element must be an M + if ( requireFirstElementM && elements.get(0).getOperator() != CigarOperator.M ) + return false; + + // the last element must be an M + if ( requireLastElementM && elements.get(numElements - 1).getOperator() != CigarOperator.M ) + return false; + + // TODO -- do we want to check whether the Ms mismatch too much also? 
+ + return true; + } + + /** + * Actually merge the dangling tail if possible + * + * @param danglingTailMergeResult the result from generating a Cigar for the dangling tail against the reference + * @return 1 if merge was successful, 0 otherwise + */ + protected int mergeDanglingTail(final DanglingChainMergeHelper danglingTailMergeResult) { + + final List elements = danglingTailMergeResult.cigar.getCigarElements(); + final CigarElement lastElement = elements.get(elements.size() - 1); + if ( lastElement.getOperator() != CigarOperator.M ) + throw new IllegalArgumentException("The last Cigar element must be an M"); + + final int lastRefIndex = danglingTailMergeResult.cigar.getReferenceLength() - 1; + final int matchingSuffix = Math.min(GraphUtils.longestSuffixMatch(danglingTailMergeResult.referencePathString, danglingTailMergeResult.danglingPathString, lastRefIndex), lastElement.getLength()); + if ( matchingSuffix == 0 ) + return 0; + + final int altIndexToMerge = Math.max(danglingTailMergeResult.cigar.getReadLength() - matchingSuffix - 1, 0); + + // there is an important edge condition that we need to handle here: Smith-Waterman correctly calculates that there is a + // deletion, that deletion is left-aligned such that the LCA node is part of that deletion, and the rest of the dangling + // tail is a perfect match to the suffix of the reference path. In this case we need to push the reference index to merge + // down one position so that we don't incorrectly cut a base off of the deletion. + final boolean firstElementIsDeletion = elements.get(0).getOperator() == CigarOperator.D; + final boolean mustHandleLeadingDeletionCase = firstElementIsDeletion && (elements.get(0).getLength() + matchingSuffix == lastRefIndex + 1); + final int refIndexToMerge = lastRefIndex - matchingSuffix + 1 + (mustHandleLeadingDeletionCase ? 
1 : 0); + + // another edge condition occurs here: if Smith-Waterman places the whole tail into an insertion then it will try to + // merge back to the LCA, which results in a cycle in the graph. So we do not want to merge in such a case. + if ( refIndexToMerge == 0 ) + return 0; + + // it's safe to merge now + addEdge(danglingTailMergeResult.danglingPath.get(altIndexToMerge), danglingTailMergeResult.referencePath.get(refIndexToMerge), ((MyEdgeFactory)getEdgeFactory()).createEdge(false, 1)); + + return 1; + } + + /** + * Actually merge the dangling head if possible + * + * @param danglingHeadMergeResult the result from generating a Cigar for the dangling head against the reference + * @return 1 if merge was successful, 0 otherwise + */ + protected int mergeDanglingHead(final DanglingChainMergeHelper danglingHeadMergeResult) { + + final List elements = danglingHeadMergeResult.cigar.getCigarElements(); + final CigarElement firstElement = elements.get(0); + if ( firstElement.getOperator() != CigarOperator.M ) + throw new IllegalArgumentException("The first Cigar element must be an M"); + + final int indexesToMerge = bestPrefixMatch(danglingHeadMergeResult.referencePathString, danglingHeadMergeResult.danglingPathString, firstElement.getLength()); + if ( indexesToMerge <= 0 ) + return 0; + + // we can't push back the reference path + if ( indexesToMerge >= danglingHeadMergeResult.referencePath.size() - 1 ) + return 0; + + // but we can manipulate the dangling path if we need to + if ( indexesToMerge >= danglingHeadMergeResult.danglingPath.size() && + ! 
extendDanglingPathAgainstReference(danglingHeadMergeResult, indexesToMerge - danglingHeadMergeResult.danglingPath.size() + 2) ) + return 0; + + addEdge(danglingHeadMergeResult.referencePath.get(indexesToMerge+1), danglingHeadMergeResult.danglingPath.get(indexesToMerge), ((MyEdgeFactory)getEdgeFactory()).createEdge(false, 1)); + + return 1; + } + + /** + * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the + * provided vertex is the sink) and the reference path. + * + * @param vertex the sink of the dangling chain + * @param pruneFactor the prune factor to use in ignoring chain pieces + * @return a SmithWaterman object which can be null if no proper alignment could be generated + */ + protected DanglingChainMergeHelper generateCigarAgainstDownwardsReferencePath(final MultiDeBruijnVertex vertex, final int pruneFactor) { + + // find the lowest common ancestor path between vertex and the reference sink if available + final List altPath = findPathUpwardsToLowestCommonAncestorOfReference(vertex, pruneFactor); + if ( altPath == null || isRefSource(altPath.get(0)) || altPath.size() < MIN_DANGLING_TAIL_LENGTH ) + return null; + + // now get the reference path from the LCA + final List refPath = getReferencePath(altPath.get(0), TraversalDirection.downwards); + + // create the Smith-Waterman strings to use + final byte[] refBases = getBasesForPath(refPath, false); + final byte[] altBases = getBasesForPath(altPath, false); + + // run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting) + final SmithWaterman alignment = new SWPairwiseAlignment(refBases, altBases, SWParameterSet.STANDARD_NGS, SWPairwiseAlignment.OVERHANG_STRATEGY.LEADING_INDEL); + return new DanglingChainMergeHelper(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar())); + } + + /** + * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path 
(where the + * provided vertex is the source) and the reference path. + * + * @param vertex the source of the dangling head + * @param pruneFactor the prune factor to use in ignoring chain pieces + * @return a SmithWaterman object which can be null if no proper alignment could be generated + */ + protected DanglingChainMergeHelper generateCigarAgainstUpwardsReferencePath(final MultiDeBruijnVertex vertex, final int pruneFactor) { + + // find the highest common descendant path between vertex and the reference source if available + final List altPath = findPathDownwardsToHighestCommonDescendantOfReference(vertex, pruneFactor); + if ( altPath == null || isRefSink(altPath.get(0)) ) + return null; + + // now get the reference path from the LCA + final List refPath = getReferencePath(altPath.get(0), TraversalDirection.upwards); + + // create the Smith-Waterman strings to use + final byte[] refBases = getBasesForPath(refPath, true); + final byte[] altBases = getBasesForPath(altPath, true); + + // run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting) + final SmithWaterman alignment = new SWPairwiseAlignment(refBases, altBases, SWParameterSet.STANDARD_NGS, SWPairwiseAlignment.OVERHANG_STRATEGY.LEADING_INDEL); + return new DanglingChainMergeHelper(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar())); + } + + /** + * Finds the path upwards in the graph from this vertex to the reference sequence, including the lowest common ancestor vertex. + * Note that nodes are excluded if their pruning weight is less than the pruning factor. 
+ * + * @param vertex the original vertex + * @param pruneFactor the prune factor to use in ignoring chain pieces + * @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or + * has an ancestor with multiple incoming edges before hitting the reference path + */ + protected List findPathUpwardsToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex, final int pruneFactor) { + final LinkedList path = new LinkedList<>(); + + MultiDeBruijnVertex v = vertex; + while ( ! isReferenceNode(v) && inDegreeOf(v) == 1 ) { + final MultiSampleEdge edge = incomingEdgeOf(v); + // if it has too low a weight, don't use it (or previous vertexes) for the path + if ( edge.getPruningMultiplicity() < pruneFactor ) + path.clear(); + // otherwise it is safe to use + else + path.addFirst(v); + v = getEdgeSource(edge); + } + path.addFirst(v); + + return isReferenceNode(v) ? path : null; + } + + /** + * Finds the path downwards in the graph from this vertex to the reference sequence, including the highest common descendant vertex. + * However note that the path is reversed so that this vertex ends up at the end of the path. + * Also note that nodes are excluded if their pruning weight is less than the pruning factor. + * + * @param vertex the original vertex + * @param pruneFactor the prune factor to use in ignoring chain pieces + * @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or + * has a descendant with multiple outgoing edges before hitting the reference path + */ + protected List findPathDownwardsToHighestCommonDescendantOfReference(final MultiDeBruijnVertex vertex, final int pruneFactor) { + final LinkedList path = new LinkedList<>(); + + MultiDeBruijnVertex v = vertex; + while ( ! 
isReferenceNode(v) && outDegreeOf(v) == 1 ) { + final MultiSampleEdge edge = outgoingEdgeOf(v); + // if it has too low a weight, don't use it (or previous vertexes) for the path + if ( edge.getPruningMultiplicity() < pruneFactor ) + path.clear(); + // otherwise it is safe to use + else + path.addFirst(v); + v = getEdgeTarget(edge); + } + path.addFirst(v); + + return isReferenceNode(v) ? path : null; + } + + private enum TraversalDirection { + downwards, + upwards + } + + /** + * Finds the path in the graph from this vertex to the reference sink, including this vertex + * + * @param start the reference vertex to start from + * @param direction describes which direction to move in the graph (i.e. down to the reference sink or up to the source) + * @return the path (non-null, non-empty) + */ + protected List getReferencePath(final MultiDeBruijnVertex start, final TraversalDirection direction) { + if ( ! isReferenceNode(start) ) throw new IllegalArgumentException("Cannot construct the reference path from a vertex that is not on that path"); + + final List path = new ArrayList<>(); + + MultiDeBruijnVertex v = start; + while ( v != null ) { + path.add(v); + v = (direction == TraversalDirection.downwards ? getNextReferenceVertex(v) : getPrevReferenceVertex(v)); + } + + return path; + } + + /** + * The base sequence for the given path. + * + * @param path the list of vertexes that make up the path + * @param reverseIfSource if true and if we encounter a source node, then reverse the character sequence for that node + * @return non-null sequence of bases corresponding to the given path + */ + @Ensures({"result != null"}) + public byte[] getBasesForPath(final List path, final boolean reverseIfSource) { + if ( path == null ) throw new IllegalArgumentException("Path cannot be null"); + + final StringBuilder sb = new StringBuilder(); + for ( final MultiDeBruijnVertex v : path ) { + if ( isSource(v) ) { + final String seq = v.getSequenceString(); + sb.append(reverseIfSource ? 
new StringBuilder(seq).reverse().toString() : seq); + } else { + sb.append((char)v.getSuffix()); + } + } + + return sb.toString().getBytes(); + } + + /** + * Finds the index of the best extent of the prefix match between the provided paths, for dangling head merging. + * Assumes that path1.length >= maxIndex and path2.length >= maxIndex. + * + * @param path1 the first path + * @param path2 the second path + * @param maxIndex the maximum index to traverse (not inclusive) + * @return the index of the ideal prefix match or -1 if it cannot find one, must be less than maxIndex + */ + protected static int bestPrefixMatch(final byte[] path1, final byte[] path2, final int maxIndex) { + int mismatches = 0; + int index = 0; + int lastGoodIndex = -1; + while ( index < maxIndex ) { + if ( path1[index] != path2[index] ) { + if ( ++mismatches > MAXIMUM_MISMATCHES_IN_DANGLING_HEAD_MERGE ) + return lastGoodIndex; + lastGoodIndex = index; + } + index++; + } + // if we got here then we hit the max index + return lastGoodIndex; + } + + protected boolean extendDanglingPathAgainstReference(final DanglingChainMergeHelper danglingHeadMergeResult, final int numNodesToExtend) { + + final int indexOfLastDanglingNode = danglingHeadMergeResult.danglingPath.size() - 1; + final int indexOfRefNodeToUse = indexOfLastDanglingNode + numNodesToExtend; + if ( indexOfRefNodeToUse >= danglingHeadMergeResult.referencePath.size() ) + return false; + + final MultiDeBruijnVertex danglingSource = danglingHeadMergeResult.danglingPath.remove(indexOfLastDanglingNode); + final StringBuilder sb = new StringBuilder(); + final byte[] refSourceSequence = danglingHeadMergeResult.referencePath.get(indexOfRefNodeToUse).getSequence(); + for ( int i = 0; i < numNodesToExtend; i++ ) + sb.append((char)refSourceSequence[i]); + sb.append(danglingSource.getSequenceString()); + final byte[] sequenceToExtend = sb.toString().getBytes(); + + // clean up the source and edge + final MultiSampleEdge sourceEdge = 
outgoingEdgeOf(danglingSource); + MultiDeBruijnVertex prevV = getEdgeTarget(sourceEdge); + removeEdge(danglingSource, prevV); + + // extend the path + for ( int i = numNodesToExtend; i > 0; i-- ) { + final MultiDeBruijnVertex newV = new MultiDeBruijnVertex(Arrays.copyOfRange(sequenceToExtend, i, i+kmerSize)); + addVertex(newV); + final MultiSampleEdge newE = addEdge(newV, prevV); + newE.setMultiplicity(sourceEdge.getMultiplicity()); + danglingHeadMergeResult.danglingPath.add(newV); + prevV = newV; + } + + return true; + } +} \ No newline at end of file diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java new file mode 100644 index 000000000..150cdc826 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java @@ -0,0 +1,1015 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeRoute; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.SequenceComplexity; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.CountSet; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.haplotype.Haplotype; + +import java.io.File; +import java.io.PrintStream; +import java.util.*; + +/** + * + * Threading graph subclass used to "re-thread" haplotypes instead of reads. + * + * Created with IntelliJ IDEA. + * User: valentin + * Date: 8/23/13 + * Time: 2:42 PM + * To change this template use File | Settings | File Templates. + */ +public class HaplotypeGraph extends ReadThreadingGraph { + + /** + * Maximum repeat unit length considered when looking for repeats that should not be considered as + * possible read anchor places along the reference path. + */ + protected static final int DEFAULT_MAX_REPEAT_UNIT_LENGTH = 4; + + /** + * Minimum repeat length to consider a region a repeat that should not be considered as possibl read anchor + * places along the reference path. + */ + protected static final int DEFAULT_MIN_REPEAT_LENGTH_IN_UNITS = 6; + + /** + * Reference haplotype + */ + private Haplotype referenceHaplotype; + + /** + * Reference haplotype bases + */ + private byte[] referenceBases; + + /** + * Sets of haplotypes in the graph. + */ + private Set haplotypes; + + /** + * Route of haplotypes in the graph. + */ + private HaplotypeRoute referenceRoute; + + /** + * Set of vertices along the reference route. + */ + private Set referenceVertices; + + /** + * Holds haplotype routes by haplotype. 
+ */ + private Map haplotypeRouteByHaplotype; + + /** + * Holds haplotypes by contained vertices. + */ + private Map> haplotypesByVertex; + + /** + * Reference to the logger for this class. + */ + private static final Logger logger = Logger.getLogger(HaplotypeGraph.class); + + /** + * What is the maximum STR unit length. + */ + private int maxRepeatUnitLength = DEFAULT_MAX_REPEAT_UNIT_LENGTH; + + /** + * What is the minimum length in units for a STR. + */ + private int minRepeatLengthInUnits = DEFAULT_MIN_REPEAT_LENGTH_IN_UNITS; + + + /** + * Indicates that the haplotype data structures need update previous to querying. + */ + private boolean needToUpdateHaplotypeStructures = true; + private Set anchorableVertices; + + /** + * Constructs a haplotype graph from a describing string. + * + *
<p>
+ * Used for testing
+ * </p>
+ * @param string the string representation of the haplotype graph. + */ + public HaplotypeGraph(final String string) { + super(string); + haplotypes = new LinkedHashSet<>(10); + referenceVertices = Collections.emptySet(); + } + + /** + * Constructs a new haplotype graph given its kmerSize. + * + * @param kmerSize 1 or greater, the targeted kmerSize + * + * @throws IllegalArgumentException if {@code kmerSize} is 0 or negative. + */ + public HaplotypeGraph(final int kmerSize) { + super(kmerSize); + haplotypes = new LinkedHashSet<>(10); + referenceVertices = Collections.emptySet(); + } + + + /** + * Set of vertices along the reference haplotype path. + * + * @return never {@code} null but perhaps empty. + */ + public Set getReferenceVertices() { + updateHaplotypeStructures(); + return referenceVertices; + } + + /** + * Returns the haplotype route given an haplotype. + * @param haplotype query haplotype + * @throws NullPointerException if {@code haplotype} is {@code null}. + * @throws IllegalArgumentException if {@code haplotype} is not a supported haplotype in the graph. + * @return never {@code null}. + */ + public HaplotypeRoute getHaplotypeRoute(final Haplotype haplotype) { + updateHaplotypeStructures(); + if (!haplotypes.contains(haplotype)) + throw new IllegalArgumentException("input haplotype must be part of the haplotype graph haplotype set"); + HaplotypeRoute result = haplotypeRouteByHaplotype.get(haplotype); + if (result == null) + haplotypeRouteByHaplotype.put(haplotype,result = buildHaplotypeRoute(haplotype)); + return result; + } + + /** + * Creates an haplotype route. + * @param haplotype the target haplotype + * @return {@code null} if there is no such a route in the graph. + */ + private HaplotypeRoute buildHaplotypeRoute(final Haplotype haplotype) { + final Route route = RouteFinder.findRoute(this,haplotype.getBases()); + if (route == null) + return null; + else + return new HaplotypeRoute(route); + } + + /** + * Bases along the reference path. 
+ * + * @return {@code null} if there is no reference. + */ + @SuppressWarnings("unused") + public byte[] getReferenceBases() { + updateHaplotypeStructures(); + return referenceBases; + } + + /** + * Returns the reference haplotype + * @return {@code null} if there is no such a reference. + */ + public Haplotype getReferenceHaplotype() { + updateHaplotypeStructures(); + return referenceHaplotype; + } + + + + /** + * Construct a haplotype graph given the haplotype list and the elected kmerSize. + * + * @param haplotypes whose path to add to the graph. + * @param kmerSize the kmerSize use to compose the graph. + */ + public HaplotypeGraph(final int kmerSize, final List haplotypes) { + super(kmerSize); + referenceHaplotype = findReferenceHaplotypeOrFail(haplotypes); + this.haplotypes = new LinkedHashSet<>(haplotypes); + addSequence("anonymous", referenceHaplotype.getBases(), true); + for (final Haplotype h : haplotypes) { + if (h.isReference()) + continue; + if (h.length() < kmerSize) { + Utils.warnUser(logger, "haplotype shorter than kmerSize " + h.length() + " < " + kmerSize + " will be dropped"); + } else + addSequence("anonymous", h.getBases(), false); + + } + buildGraphIfNecessary(); + } + + /** + * Returns the reference haplotype within the input collection. + * + * @param haplotypes the query haplotype set. + * @throws IllegalArgumentException if there is no reference haplotype. + * @throws NullPointerException if {@code haplotypes} is {@code null} or contains some {@code null} value. + * @return never {@code} null, a haplotype that is reference. + */ + private Haplotype findReferenceHaplotypeOrFail(final List haplotypes) { + for (final Haplotype h : haplotypes) + if (h.isReference()) + return h; + throw new IllegalArgumentException("no reference haplotype present"); + } + + /** + * Constructs a new haplotype graph given a template read-threading graph and set of haplotypes + * + * @param template the template read-threading graph. 
+ * @param haplotypes the haplotype set to consider + */ + public HaplotypeGraph(final ReadThreadingGraph template, final List haplotypes) { + this(template.getKmerSize()); + referenceHaplotype = findReferenceHaplotypeOrFail(haplotypes); + this.haplotypes = new HashSet<>(haplotypes); + template.buildGraphIfNecessary(); + uniqueKmers = new HashMap<>(); + nonUniqueKmers = new HashSet<>(); + // Copy vertices over. + addVertices(template.vertexSet()); + // Copy edges over. + for (final MultiSampleEdge edge : template.edgeSet()) { + final MultiSampleEdge newEdge = addEdge(template.getEdgeSource(edge), template.getEdgeTarget(edge)); + newEdge.setIsRef(newEdge.isRef()); + newEdge.setMultiplicity(edge.getMultiplicity()); + } + // Copy kmer lookup tables: + uniqueKmers.putAll(template.uniqueKmers); + nonUniqueKmers.addAll(template.nonUniqueKmers); + alreadyBuilt = true; + } + + /** + * Update the haplotype data structures based in current edges and vertices. + */ + private void updateHaplotypeStructures() { + if (!needToUpdateHaplotypeStructures) + return; + needToUpdateHaplotypeStructures = false; + haplotypeRouteByHaplotype = new LinkedHashMap<>(haplotypes.size()); + final Iterator haplotypeIterator = haplotypes.iterator(); + final Set nonFoundHaplotypes = new HashSet<>(haplotypes.size()); + while (haplotypeIterator.hasNext()) { + final Haplotype haplotype = haplotypeIterator.next(); + final HaplotypeRoute haplotypeRoute = buildHaplotypeRoute(haplotype); + if (haplotypeRoute == null) { + haplotypeIterator.remove(); + nonFoundHaplotypes.add(haplotype); + if (haplotype.isReference()) { + referenceHaplotype = null; + referenceRoute = null; + referenceVertices = Collections.emptySet(); + referenceBases = null; + } + } else { + if (haplotype.isReference()) { + referenceHaplotype = haplotype; + referenceRoute = haplotypeRoute; + referenceVertices = haplotypeRoute.vertexSet(); + referenceBases = haplotypeRoute.getBases(); + } + haplotypeRouteByHaplotype.put(haplotype, 
haplotypeRoute); + } + } + haplotypesByVertex = buildHaplotypesByVertex(); + anchorableVertices = calculateAnchorableVertexSet(); + logger.debug("some haplotypes do not have a path across the haplotype graph " + nonFoundHaplotypes.size()); + } + + /** + * Builds a map for each vertex to all the haplotype routes that pass thru it. + */ + private Map> buildHaplotypesByVertex() { + final Map> result = new HashMap<>(referenceVertices.size()); + final Set allHaplotypeRoutes = new LinkedHashSet<>(haplotypeRouteByHaplotype.values()); + for (final HaplotypeRoute haplotypeRoute : allHaplotypeRoutes) { + final Set singleton = Collections.singleton(haplotypeRoute); + for (final MultiDeBruijnVertex vertex : haplotypeRoute.vertexSet()) + if (!result.containsKey(vertex)) + result.put(vertex, singleton); + else { + final Set currentHrs = result.get(vertex); + if (currentHrs.size() == haplotypes.size() - 1) + result.put(vertex, allHaplotypeRoutes); + else if (currentHrs.size() == 1) { + final Set newHrs = new LinkedHashSet<>(allHaplotypeRoutes.size()); + newHrs.addAll(currentHrs); + newHrs.add(haplotypeRoute); + result.put(vertex, newHrs); + } else + currentHrs.add(haplotypeRoute); + } + } + return result; + } + + + /** + * Debug convenient method to print a graph into a file in the .dot format. + * @param fileName name of the output file. + * @throws NullPointerException if {@code fileName} is {@code null}. + */ + public void printGraph(final String fileName) { + super.printGraph(new File(fileName), 10000); + } + + + + + @Override + public void printGraph(final PrintStream graphWriter, final boolean writeHeader, final int pruneFactor) { + if ( writeHeader ) + graphWriter.println("digraph assemblyGraphs {"); + + + for( final MultiSampleEdge edge : edgeSet() ) { + graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? 
"style=dotted,color=grey," : "") + "label=\"" + edge.getDotLabel() + "\"];"); + if( edge.isRef() ) { + graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); + } + } + + for( final MultiDeBruijnVertex v : vertexSet() ) + graphWriter.println("\t" + v.toString() + " [label=\"" + v.getId() + ":" + new String(getAdditionalSequence(v)) + v.additionalInfo() + "\",shape=box]"); + + if ( writeHeader ) + graphWriter.println("}"); + } + + @Override + protected int findStart(final SequenceForKmers seqForKmers) { + return 0; + } + + /** + * Checks whether the graph has some sources or sink vertices that are not reference vertices. + * + * @return {@code true} iff so. + */ + public boolean hasNonReferenceEnds() { + for (final MultiDeBruijnVertex end : getSources()) + if (!isReferenceNode(end)) return true; + for (final MultiDeBruijnVertex end : getSinks()) + if (!isReferenceNode(end)) return true; + return false; + } + + /** + * Merges vertices that share exactly the same set of outgoing vertices. + *
<p/>
+ * This is done in reversed topological order and since the graph is a DAG it ensure to return a graph + * that such merge is any longer possible. I.e. there is no need to run this method more than once. + *
<p/>
+ * Notice that we will keep a record of distinct unique kmers that map to the same vertex that map now to the same
+ * merged vertex. Thus if vertices {@code X and Y} are merged then {@code findKmer(X.sequence) == findKmer(Y.sequence)}.
+ *
<p/>
+ * Examples: + *
<p/>
+ * <ul>
+ *   <li>
+ * {@code AAA -> AAC, CAA -> AAC} would become {@code NAA -> AAC}.
+ *   </li>
+ *   <li>
+ * {@code AAA -> AAC, AAA -> AAG, CAA -> AAC, CAA -> AAG} would become {@code NAA -> AAG, NAA -> AAG}
+ *   </li>
+ *   <li>
+ * {@code AAA -> AAC, AAA -> AAG, CAA -> AAC} would not change as {@code AAA} and {@code CAA}
+ * do not share {@code AAG} as outgoing vertex.
+ *   </li>
+ *   <li>
+ * {@code AAA -> AAC, AAC -> ACA, CAA -> AAC, GAC -> ACA } would become {@code NAA -> NAC, NAC -> ACA}.
+ *   </li>
+ * </ul>
+ */ + public void mergeCommonChains() { + final int vertexCount = vertexSet().size(); + final Set refVertices = new HashSet<>(vertexCount); + final Map indexByVertex = new HashMap<>(vertexCount); + final int[] pendingChildren = new int[vertexCount]; + final Deque readyVertices = new LinkedList<>(); + final Set merged = new HashSet<>(1 + vertexCount / 10 ); + + // Initialize traversal data structures. + mergeCommonChainsInitialize(refVertices, indexByVertex, pendingChildren, readyVertices); + + // Traversal in inverted topological order where children nodes are processed before their parents. + while (!readyVertices.isEmpty()) { + final MultiDeBruijnVertex currentVertex = readyVertices.remove(); + if (merged.contains(currentVertex)) continue; + + final Set mergeSet = new HashSet<>(2); + MultiDeBruijnVertex refVertex = mergeCommonChainsComposeMergeSet(refVertices, currentVertex, mergeSet); + mergeVertices(refVertex,mergeSet,indexByVertex,pendingChildren,readyVertices); + merged.addAll(mergeSet); + } + needToUpdateHaplotypeStructures = true; + } + + /** + * Given a seed vertex, determines the mergin set of nodes that will be collapsed into one. + * + * @param refVertices reference path vertices + * @param currentVertex current vertex. + * @param mergeSet where to store the final merging set. + * @return the reference node if present that needs to be preserved as such. It might be {@code null} + */ + private MultiDeBruijnVertex mergeCommonChainsComposeMergeSet(final Set refVertices, + final MultiDeBruijnVertex currentVertex, + final Set mergeSet) { + final boolean currentIsSource = isSource(currentVertex); + final Set children = outgoingVerticesOf(currentVertex); + if (children.size() == 0) + mergeSet.add(currentVertex); + else + for (final MultiDeBruijnVertex child : children) + mergeSet.addAll(incomingVerticesOf(child)); + + MultiDeBruijnVertex refVertex = refVertices.contains(currentVertex) ? 
currentVertex : null; + final Iterator candidatesIt = mergeSet.iterator(); + while (candidatesIt.hasNext()) { + final MultiDeBruijnVertex candidate = candidatesIt.next(); + if (candidate == currentVertex) continue; + if (isSource(candidate) != currentIsSource) { + candidatesIt.remove(); + continue; + } + if (currentIsSource && !candidate.getSequenceString().equals(currentVertex.getSequenceString())) { + candidatesIt.remove(); + continue; + } + if (!currentIsSource && candidate.getSuffix() != currentVertex.getSuffix()) { + candidatesIt.remove(); + continue; + } + final Set candidateChildren = outgoingVerticesOf(candidate); + if (candidateChildren.size() != children.size()) + candidatesIt.remove(); + else { + boolean removed = false; + for (final MultiDeBruijnVertex candidateChild : candidateChildren) + if (!children.contains(candidateChild)) { + candidatesIt.remove(); + removed = true; + break; + } + if (refVertex == null && !removed && refVertices.contains(candidate)) refVertex = candidate; + } + } + return refVertex; + } + + /** + * Initialize data-structures for {@link #mergeCommonChains} + * + * @param refVertices will contain reference path vertices. + * @param indexByVertex map vertex -> index in {@code pendingChildren}. + * @param pendingChildren number of children of a node that have not yet been processed. + * @param readyVertices vertices that are ready to be processed (all children have been processed). + */ + private void mergeCommonChainsInitialize(final Set refVertices, + final Map indexByVertex, + final int[] pendingChildren, + final Deque readyVertices) { + int nextIndex = 0; + for (final MultiDeBruijnVertex v : vertexSet()) { + indexByVertex.put(v,nextIndex++); + if (isReferenceNode(v)) refVertices.add(v); + } + + for (final Map.Entry entry : indexByVertex.entrySet()) + if ((pendingChildren[entry.getValue()] = outDegreeOf(entry.getKey())) == 0) + readyVertices.add(entry.getKey()); + } + + // Perform the actual merge. 
+ private void mergeVertices(final MultiDeBruijnVertex refVertex, final Collection vertices, final Map indexByVertex, final int[] pendingChildrenCounts, final Deque ready) { + if (vertices.size() == 0) + throw new IllegalArgumentException(); + final MultiDeBruijnVertex vertexToKeep = refVertex == null ? vertices.iterator().next() : refVertex; + final byte[] sequence = vertexToKeep.getSequence(); + final Set uniqueKmersToUpdate = new HashSet<>(vertices.size()); + final Set parentVertices = new HashSet<>(inDegreeOf(vertexToKeep) * 2); + parentVertices.addAll(incomingVerticesOf(vertexToKeep)); + for (final MultiDeBruijnVertex p : parentVertices) + if (--pendingChildrenCounts[indexByVertex.get(p)] == 0) + ready.add(p); + + final Kmer mergedKmer = new Kmer(sequence); + if (uniqueKmers.containsKey(mergedKmer)) { + uniqueKmersToUpdate.add(new Kmer(mergedKmer.bases().clone())); + uniqueKmers.remove(mergedKmer); + } + boolean foundMergedVertex = false; + for (final MultiDeBruijnVertex v : vertices) + if (v == vertexToKeep) + foundMergedVertex = true; + else { + final byte[] seq = v.getSequence(); + final Kmer kmer = new Kmer(seq); + if (uniqueKmers.containsKey(kmer)) { + uniqueKmersToUpdate.add(kmer); + uniqueKmers.remove(kmer); + } + if (sequence.length != seq.length) throw new IllegalArgumentException("mismatched sizes " + sequence.length + " != " + + seq.length + " " + new String(sequence) + " " + new String(seq)); + for (int i = sequence.length - 1; i >= 0; i--) { + + if (sequence[i] != seq[i]) sequence[i] = 'N'; + } + for (final MultiDeBruijnVertex p : incomingVerticesOf(v)) { + if (--pendingChildrenCounts[indexByVertex.get(p)] == 0) + ready.add(p); + if (!parentVertices.contains(p)) { + parentVertices.add(p); + final MultiSampleEdge e = getEdge(p,v); + addEdge(p,vertexToKeep,new MultiSampleEdge(e.isRef(),e.getMultiplicity(),1)); + } else { + getEdge(p,vertexToKeep).incMultiplicity(getEdge(p,v).getMultiplicity()); + } + } + removeVertex(v); + } + if 
(!foundMergedVertex) + throw new IllegalArgumentException("merged vertex must be contained in the input set"); + for (final Kmer kmer : uniqueKmersToUpdate) + uniqueKmers.put(kmer,vertexToKeep); + } + + public Map uniqueKmerMap() { + return Collections.unmodifiableMap(uniqueKmers); + } + + @Override + public boolean equals(Object other) { + return (other instanceof HaplotypeGraph) && equals((HaplotypeGraph)other); + } + + + /** + * Simple debug representation of the haplotype graph. + * @return never {@code null} + */ + @Override + public String toString() { + return getClass().getSimpleName() + "[ks=" + kmerSize + "](vs=" + vertexSet().size() + "," + edgeSet().size() + "){...}"; + } + + /** + * Returns set of valid haplotypes. + * @return never {@code null} but perhaps empty. + */ + public Set getHaplotypes() { + updateHaplotypeStructures(); + return haplotypes; + } + + /** + * Returns a map between valid haplotypes and corresponding routes in the graph. + * @return never {@code null} but perhaps empty. + */ + public Map getHaplotypeRouteMap() { + updateHaplotypeStructures(); + return haplotypeRouteByHaplotype; + } + + /** + * Returns set of haplotype routes that enclose a vertex. + * @param vertex the query vertex. + * @return never {@code null} but perhaps empty set. + */ + public Set getEnclosingHaplotypeRoutes(final MultiDeBruijnVertex vertex) { + updateHaplotypeStructures(); + if (haplotypesByVertex == null) + return Collections.emptySet(); + final Set result = haplotypesByVertex.get(vertex); + if (result == null) + return Collections.emptySet(); + else + return result; + } + + /** + * Returns the reference route + * + * @return {@code null} if there is no valid reference haplotype. + */ + public HaplotypeRoute getReferenceRoute() { + updateHaplotypeStructures(); + return referenceRoute; + } + + /*********************************************** + * deep equals implementation, used in testing. 
* + ***********************************************/ + + /** + * Compare two haplotype threading graphs and it determines whether they have the same structure. + *

+ * This method goes a long way to figure out the equality and no equality of both graphs. However there + * are "pathological" case in where it might fail to see a difference. This is due to the fact that there + * is no guarantee of the uniqueness of sequences at source vertex. + *

+ * If there are more than one source vertex with the same sequence it try to match source vertices between both + * graphs matching all possible paths emanating from every pair of sources. + * + *

Note: in practice this is only used in for testing purposes + * + * @param other the other graph to compare against. + * @return never {@code null}. + */ + public boolean equals(HaplotypeGraph other) { + updateHaplotypeStructures(); + if (other == null) return false; + if (other == this) return true; + + if (!equals$ReferencePaths(this, other)) return false; + final Map thisSourcesBySequence = equalsBuildSourceBySequenceMap(this); + final Map otherSourcesBySequence = equalsBuildSourceBySequenceMap(other); + if (thisSourcesBySequence.size() != otherSourcesBySequence.size()) return false; + final List unmatchedLeft = new LinkedList<>(); + final List unmatchedRight = new LinkedList<>(); + final List> sourcePairs = equals$matchVertexBySequenceMaps(thisSourcesBySequence,otherSourcesBySequence,unmatchedLeft,unmatchedRight); + if (unmatchedLeft.size() > 0 || unmatchedRight.size() > 0) return false; + + + final Deque> pending = new LinkedList<>(sourcePairs); + final Set visited = new HashSet<>(vertexSet().size()); + while (!pending.isEmpty()) { + final Pair pair = pending.removeFirst(); + final MultiDeBruijnVertex leftVertex = pair.getFirst(); + final MultiDeBruijnVertex rightVertex = pair.getSecond(); + final List> childrenPairs = equals$matchVertexBySequenceMaps(equalsBuildChildrenBySuffixMap(this, leftVertex), + equalsBuildChildrenBySuffixMap(other, rightVertex), unmatchedLeft, unmatchedRight); + if (unmatchedLeft.size() > 0 || unmatchedRight.size() > 0) return false; + for (final Pair childPair : childrenPairs) { + final MultiDeBruijnVertex leftChild = childPair.getFirst(); + final MultiDeBruijnVertex rightChild = childPair.getSecond(); + final boolean leftVisited = visited.add(leftChild); + final boolean rightVisited = visited.add(rightChild); + if (leftVisited != rightVisited) return false; // visited before in different matchings. 
+ if (leftVisited) continue; + pending.add(childPair); + visited.add(childPair.getFirst()); + visited.add(childPair.getSecond()); + } + } + return true; + } + + // Note: in practice only use by equals(HaplotypeGraph) for testing purposes. + private boolean equals$ReferencePaths(final HaplotypeGraph g1, final HaplotypeGraph g2) { + MultiDeBruijnVertex refVertex1 = g1.getReferenceSourceVertex(); + MultiDeBruijnVertex refVertex2 = g2.getReferenceSourceVertex(); + if (refVertex1 == null && refVertex2 == null) + return true; + if (refVertex1 == null || refVertex2 == null) + return false; + + if (!refVertex1.getSequenceString().equals(refVertex2.getSequenceString())) + return false; + + while (refVertex1 != null && refVertex2 != null) { + if (refVertex1.getSuffix() != refVertex2.getSuffix()) return false; + refVertex1 = g1.getNextReferenceVertex(refVertex1); + refVertex2 = g2.getNextReferenceVertex(refVertex2); + + } + return refVertex1 == refVertex2; + } + + // Note: in practice only use by equals(HaplotypeGraph) for testing purposes. + private static Map equalsBuildChildrenBySuffixMap(final HaplotypeGraph graph, + final MultiDeBruijnVertex vertex) { + final Map result = new HashMap<>(); + for (final MultiDeBruijnVertex child : graph.outgoingVerticesOf(vertex)) + result.put(new String(new byte[]{child.getSuffix()}), child); + return result; + } + + // Note: in practice only use by equals(HaplotypeGraph) for testing purposes. 
+ private static List> equals$matchVertexBySequenceMaps( + final Map left, final Map right, + final Collection unmatchedLeft, final Collection unmatchedRight) { + final List> result = new LinkedList<>(); + for (final Map.Entry leftEntry : left.entrySet()) + if (right.containsKey(leftEntry.getKey())) + result.add(new Pair<>(leftEntry.getValue(),right.get(leftEntry.getKey()))); + else + unmatchedLeft.add(leftEntry.getValue()); + for (final Map.Entry rightEntry : right.entrySet()) + if (!left.containsKey(rightEntry.getKey())) + unmatchedRight.add(rightEntry.getValue()); + return result; + } + + // Note: in practice only use by equals(HaplotypeGraph) for testing purposes. + private static Map equalsBuildSourceBySequenceMap(final HaplotypeGraph other) { + + final Set sources = other.getSources(); + final Map result = new HashMap<>(sources.size()); + final Map> collisions = new HashMap<>(sources.size()); + for (final MultiDeBruijnVertex v : sources) { + final String sequence = v.getSequenceString(); + if (result.containsKey(sequence)) { // we need to handle collision due to lack of uniqueness. 
+ final List collisionList; + if (collisions.containsKey(sequence)) + collisionList = collisions.get(sequence); + else + collisions.put(sequence,collisionList = new LinkedList<>()); + collisionList.add(v); + } else { + result.put(sequence,v); + } + } + if (collisions.size() == 0) + return result; + for (final String s : collisions.keySet()) { + result.remove(s); + final List vertices = collisions.remove(s); + int number = 0; + final List> extendedSequences = new LinkedList<>(); + for (final MultiDeBruijnVertex vertice : vertices) + extendedSequences.add(new Pair<>(vertice, equalsCollisionResolverExtendedSequence(other, vertice))); + Collections.sort(extendedSequences,new Comparator>(){ + public int compare(final Pair p1, final Pair p2) { + return p1.getSecond().compareTo(p2.getSecond()); + } + }); + for (final Pair p : extendedSequences) + result.put(p.getSecond() + '-' + (number++),p.getFirst()); + } + return result; + + } + + // Note: in practice only use by equals(HaplotypeGraph) for testing purposes. 
+ private static String equalsCollisionResolverExtendedSequence(final HaplotypeGraph graph, final MultiDeBruijnVertex source) { + final StringBuilder buffer = new StringBuilder(1000); + final Set visited = new HashSet<>(graph.vertexSet().size()); + final Stack pending = new Stack<>(); + final Stack position = new Stack<>(); + position.ensureCapacity(graph.vertexSet().size()); + pending.ensureCapacity(graph.vertexSet().size()); + pending.add(source); + position.add(0); + int lastPos = -1; + while (!pending.isEmpty()) { + final MultiDeBruijnVertex next = pending.pop(); + if (visited.contains(next)) continue; + visited.add(next); + final int pos = position.pop(); + final CharSequence sequence; + if (graph.isSource(next)) { + if (next == source) { + sequence = new String(next.getSequence()); + } else { + sequence = new StringBuffer(next.getSequence().length).append(new String(next.getSequence())).reverse().append('$'); + } + } else { + sequence = new String(new byte[] { next.getSuffix()}); + } + + if (pos != lastPos + 1) { + buffer.append('[').append(Math.abs(pos)).append(']'); + } + buffer.append(sequence); + lastPos = pos + sequence.length() - 1; + + final List parents = new LinkedList<>(graph.incomingVerticesOf(next)); + Collections.sort(parents,new Comparator() { + @Override + public int compare(final MultiDeBruijnVertex o1, final MultiDeBruijnVertex o2) { + return Byte.compare(o1.getSuffix(),o2.getSuffix()); + } + }); + for (final MultiDeBruijnVertex parent : parents) { + pending.push(parent); + position.push(lastPos + 1); + } + + final List children = new LinkedList<>(graph.incomingVerticesOf(next)); + Collections.sort(children,new Comparator() { + @Override + public int compare(final MultiDeBruijnVertex o1, final MultiDeBruijnVertex o2) { + return Byte.compare(o1.getSuffix(),o2.getSuffix()); + } + }); + for (final MultiDeBruijnVertex child : graph.outgoingVerticesOf(next)) { + pending.push(child); + position.push(lastPos + 1); + } + } + + return 
buffer.toString(); + } + + + /** + * Calculates the subset of reference path vertices that are amenable to be anchoring vertices. + *

+ *

+ * For a vertex to be anchorable: + *

    + *
  • Should not include bases from a repeat
  • , + *
  • There should not be in a middle of a event block
  • + *
+ *

+ * + * @return never {@code null}. + */ + private Set calculateAnchorableVertexSet() { + updateHaplotypeStructures(); + if (referenceBases == null) + return Collections.emptySet(); + + // We first check what bases in the reference path bases are part of a repeat. + final boolean[] nonAnchorableDueToRepeats = SequenceComplexity.findBasesInShortUnitRepeats( + referenceBases, maxRepeatUnitLength, minRepeatLengthInUnits); + + final Set result = new HashSet<>(100); + final Map expectedRejoins = new HashMap<>(); + + + MultiDeBruijnVertex currentVertex = getReferenceRoute().getFirstVertex(); + final int sourceSequenceLength = currentVertex.getSequence().length; + + // Determine whether the reference source vertex in anchorable discarding repeats: + boolean sourceIsAnchorable = true; + for (int i = 0; i < sourceSequenceLength; i++) + if (nonAnchorableDueToRepeats[i]) { + sourceIsAnchorable = false; + break; + } + + // Update the nonAnchorableDueToRepeats array accordingly. + int index = currentVertex.getSequence().length - 1; + nonAnchorableDueToRepeats[index] = !sourceIsAnchorable; + + + // We keep record on all alternative path lengths: + final CountSet pathLengths = new CountSet(haplotypes.size()); + pathLengths.setTo(0); + + // Now we go through the reference path and determine which vertices are not part of event block. + // We keep track of open divergent paths in expectedRejoins. 
Thus only those vertices traversed + // when exptectedRejoins size 0 can be anchorable: + while (currentVertex != null) { + int inDegree = inDegreeOf(currentVertex); + if (inDegree > 1) + expectedRejoins.remove(currentVertex); + if (expectedRejoins.size() == 0 && !nonAnchorableDueToRepeats[index]) { + currentVertex.setAdditionalInfo(currentVertex.additionalInfo() + "*"); + result.add(currentVertex); + } + final Set nextEdges = outgoingEdgesOf(currentVertex); + MultiDeBruijnVertex nextReferenceVertex = null; + for (final MultiSampleEdge e : nextEdges) { + final MultiDeBruijnVertex nextVertex = getEdgeTarget(e); + if (e.isRef() && referenceVertices.contains(nextVertex)) + nextReferenceVertex = nextVertex; + else + calculateRejoins(nextVertex, expectedRejoins, referenceVertices, pathLengths, false, false); + } + currentVertex = nextReferenceVertex; + index++; + } + return result; + } + + + + /** + * Returns those vertices that can be used as anchors along the refererence route. + * @return never {@code null} but perhaps empty if there is no such a vertex. + */ + public Set getAnchorableVertices() { + updateHaplotypeStructures(); + return anchorableVertices; + } + + /** + * Finds non-reference wondering paths that will rejoin the reference path from a particular node. + *

+ *

+ * It only considers those paths that rejoin within the anchor points of a read. + *

+ *

+ *

+ * Rather than reporting explicitly the path vertice sequence, this method report the length of the paths + * found. These are dumped into {@code expectedRejoins} where the keys are refernce path vertex where paths rejoin + * and the value is the set of path lengths. + *

+ *

+ *

The path lengths are calculated as the length from the startVertex plus the prefix sizes {@code prefixSizes}

+ *

+ *

You can also ask the method to exhaustively find all paths ({@code exhaustive == true}) or just consider + * intermediate nodes once ({@code exhustive == false}). If the latter only the shortest paths are considered.

+ *

+ *

Finally you also can check on paths backwards ({@code backwards == true}) or forwards ({@code backwards == false})

+ * + * @param startVertex the origin node for those paths. + * @param expectedRejoins map where to place the found paths in a form of the rejoining non-reference vertex (key) and + * set of path lengths (value). + * @param referenceWithinBoundaries reference vertices found between read anchors. The key are the vertices, the values are + * the kmer's offset in the read. + * @param prefixSizes prefix path sizes to be added to the rejoin path sizes. + * @param exhaustive whether all paths should be considered or we only care about find out the rejoining vertices. + * @param backwards whether we want to find backward paths (inverse edge traversal). + * + * Note: it is marked as deprecated as this method signature may change in the future. It is public just because + * is currently shared by several other classes, however it would not be surprising if + * it gets refactored out at some point. So use with care. + */ + @Deprecated + public void calculateRejoins(final MultiDeBruijnVertex startVertex, final Map expectedRejoins, + final Set referenceWithinBoundaries, final CountSet prefixSizes, + final boolean exhaustive, final boolean backwards) { + Queue queue = new LinkedList<>(); + Queue depths = new LinkedList<>(); + queue.add(startVertex); + depths.add(prefixSizes); + + final Set visited = new HashSet<>(); + if (!exhaustive) visited.add(startVertex); + while (!queue.isEmpty()) { + final CountSet depth = depths.remove(); + final MultiDeBruijnVertex v = queue.remove(); + if (referenceVertices.contains(v)) { + if (referenceWithinBoundaries.contains(v)) { + final CountSet previous = expectedRejoins.get(v); + if (previous == null) + expectedRejoins.put(v, depth.clone()); + else + previous.addAll(depth); + } + } else { + final CountSet depthPlusOne = depth.clone(); + depthPlusOne.incAll(1); + final Set nextEdges = backwards ? incomingEdgesOf(v) : outgoingEdgesOf(v); + for (final MultiSampleEdge e : nextEdges) { + final MultiDeBruijnVertex w = backwards ? 
getEdgeSource(e) : getEdgeTarget(e); + if (visited.contains(w)) // avoid repetitive work. + continue; + if (!exhaustive) visited.add(w); + queue.add(w); + depths.add(depthPlusOne); + } + } + } + } + +} + + diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/MultiDeBruijnVertex.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/MultiDeBruijnVertex.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/MultiDeBruijnVertex.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/MultiDeBruijnVertex.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java new file mode 100644 index 000000000..30b677fe9 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java @@ -0,0 +1,221 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.AssemblyResult; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.io.File; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +public class ReadThreadingAssembler extends LocalAssemblyEngine { + private final static Logger logger = Logger.getLogger(ReadThreadingAssembler.class); + + private final static int DEFAULT_NUM_PATHS_PER_GRAPH = 128; + private final static int GGA_MODE_ARTIFICIAL_COUNTS = 1000; + private final static int KMER_SIZE_ITERATION_INCREASE = 10; + private final static int MAX_KMER_ITERATIONS_TO_ATTEMPT = 6; + + /** The min and max kmer sizes to try when building the graph. 
*/ + private final List kmerSizes; + private final int maxAllowedPathsForReadThreadingAssembler; + + private final boolean dontIncreaseKmerSizesForCycles; + private final int numPruningSamples; + protected boolean removePathsNotConnectedToRef = true; + private boolean justReturnRawGraph = false; + + /** for testing only */ + public ReadThreadingAssembler() { + this(DEFAULT_NUM_PATHS_PER_GRAPH, Arrays.asList(25)); + } + + public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List kmerSizes, final boolean dontIncreaseKmerSizesForCycles, final int numPruningSamples) { + super(maxAllowedPathsForReadThreadingAssembler); + this.kmerSizes = kmerSizes; + this.maxAllowedPathsForReadThreadingAssembler = maxAllowedPathsForReadThreadingAssembler; + this.dontIncreaseKmerSizesForCycles = dontIncreaseKmerSizesForCycles; + this.numPruningSamples = numPruningSamples; + } + + public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List kmerSizes) { + this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, true, 1); + } + + /** for testing purposes */ + protected void setJustReturnRawGraph(boolean justReturnRawGraph) { + this.justReturnRawGraph = justReturnRawGraph; + } + + private void addResult(final List results, final AssemblyResult maybeNullResult) { + if ( maybeNullResult != null ) + results.add(maybeNullResult); + } + + @Override + public List assemble(final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes) { + final List results = new LinkedList<>(); + + // first, try using the requested kmer sizes + for ( final int kmerSize : kmerSizes ) { + addResult(results, createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, dontIncreaseKmerSizesForCycles)); + } + + // if none of those worked, iterate over larger sizes if allowed to do so + if ( results.isEmpty() && !dontIncreaseKmerSizesForCycles ) { + int kmerSize = MathUtils.arrayMaxInt(kmerSizes) + 
KMER_SIZE_ITERATION_INCREASE; + int numIterations = 1; + while ( results.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) { + // on the last attempt we will allow low complexity graphs + addResult(results, createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT)); + kmerSize += KMER_SIZE_ITERATION_INCREASE; + numIterations++; + } + } + + return results; + } + + /** + * Creates the sequence graph for the given kmerSize + * + * @param reads reads to use + * @param refHaplotype reference haplotype + * @param kmerSize kmer size + * @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph + * @param allowLowComplexityGraphs if true, do not check for low-complexity graphs + * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity) + */ + protected AssemblyResult createGraph(final List reads, + final Haplotype refHaplotype, + final int kmerSize, + final List activeAlleleHaplotypes, + final boolean allowLowComplexityGraphs) { + if ( refHaplotype.length() < kmerSize ) { + // happens in cases where the assembled region is just too small + return new AssemblyResult(AssemblyResult.Status.FAILED, null); + } + + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples); + + // add the reference sequence to the graph + rtgraph.addSequence("ref", refHaplotype.getBases(), true); + + // add the artificial GGA haplotypes to the graph + int hapCount = 0; + for ( final Haplotype h : activeAlleleHaplotypes ) { + rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), GGA_MODE_ARTIFICIAL_COUNTS, false); + } + + // Next pull kmers out of every read and throw them on the graph + for( final GATKSAMRecord read : reads ) { + rtgraph.addRead(read); + } + + // actually build the read threading graph + rtgraph.buildGraphIfNecessary(); + + 
// sanity check: make sure there are no cycles in the graph + if ( rtgraph.hasCycles() ) { + if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it contains a cycle"); + return null; + } + + // sanity check: make sure the graph had enough complexity with the given kmer + if ( ! allowLowComplexityGraphs && rtgraph.isLowComplexity() ) { + if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it does not produce a graph with enough complexity"); + return null; + } + + printDebugGraphTransform(rtgraph, new File("" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.0.raw_readthreading_graph.dot")); + + // go through and prune all of the chains where all edges have <= pruneFactor. This must occur + // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering + // tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1 + rtgraph.pruneLowWeightChains(pruneFactor); + + // look at all chains in the graph that terminate in a non-ref node (dangling sources and sinks) and see if + // we can recover them by merging some N bases from the chain back into the reference + if ( recoverDanglingTails ) rtgraph.recoverDanglingTails(pruneFactor); + if ( recoverDanglingHeads ) rtgraph.recoverDanglingHeads(pruneFactor); + + // remove all heading and trailing paths + if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef(); + + printDebugGraphTransform(rtgraph, new File("" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.1.cleaned_readthreading_graph.dot")); + + final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph(); + if (debugGraphTransformations) initialSeqGraph.printGraph(new File("" + refHaplotype.getGenomeLocation() + "-sequenceGraph." 
+ kmerSize + ".0.1.initial_seqgraph.dot"),10000); + + // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph + if ( justReturnRawGraph ) return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, initialSeqGraph); + + if (debug) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler"); + printDebugGraphTransform(initialSeqGraph, new File( "" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.2.initial_seqgraph.dot")); + initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't think this is possible by construction + + final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph); + final AssemblyResult.Status status = cleaned.getStatus(); + final AssemblyResult result = new AssemblyResult(status, cleaned.getGraph()); + result.setThreadingGraph(rtgraph); + return result; + } + + @Override + public String toString() { + return "ReadThreadingAssembler{" + + "kmerSizes=" + kmerSizes + + '}'; + } +} \ No newline at end of file diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java new file mode 100644 index 000000000..a7989ac2c --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java @@ -0,0 +1,792 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerCounter; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.jgrapht.alg.CycleDetector; + +import java.io.File; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class ReadThreadingGraph extends DanglingChainMergingGraph implements KmerSearchableGraph { + + private final static Logger logger = Logger.getLogger(ReadThreadingGraph.class); + + private final static String ANONYMOUS_SAMPLE = "XXX_UNNAMED_XXX"; + private final static boolean WRITE_GRAPH = false; + private final static boolean DEBUG_NON_UNIQUE_CALC = false; + + /** for debugging info printing */ + private static int counter = 0; + + /** + * Sequences added for read threading before we've actually built the graph + */ + private final Map> pending = new LinkedHashMap<>(); + + /** + * A set of non-unique kmers that cannot be used as merge points in the graph + */ + protected Set nonUniqueKmers; 
+ + /** + * A map from kmers -> their corresponding vertex in the graph + */ + protected Map uniqueKmers = new LinkedHashMap<>(); + + /** + * + */ + + final boolean debugGraphTransformations; + final byte minBaseQualityToUseInAssembly; + + protected boolean increaseCountsBackwards = true; + protected boolean increaseCountsThroughBranches = false; // this may increase the branches without bounds + + // -------------------------------------------------------------------------------- + // state variables, initialized in resetToInitialState() + // -------------------------------------------------------------------------------- + private Kmer refSource; + + /** + * Constructs an empty read-threading-graph provided the kmerSize. + * @param kmerSize 1 or greater. + * + * @throws IllegalArgumentException if {@code kmerSize} < 1. + */ + public ReadThreadingGraph(final int kmerSize) { + this(kmerSize, false, (byte)6, 1); + } + + + /** + * Return the collection of outgoing vertices that expand this vertex with a particular base. + * + * @param v original vertex. + * @param b expanding base. + * @return never null, but perhaps an empty set. You cannot assume that you can modify the result.
+ */ + protected Set getNextVertices(final MultiDeBruijnVertex v, final byte b) { + if (v == null) throw new IllegalArgumentException("the input vertex cannot be null"); + if (!vertexSet().contains(v)) throw new IllegalArgumentException("the vertex must be present in the graph"); + final List result = new LinkedList<>(); + for (final MultiDeBruijnVertex w : outgoingVerticesOf(v)) { + if (w.getSuffix() == b) + result.add(w); + } + switch (result.size()) { + case 0: return Collections.emptySet(); + case 1: return Collections.singleton(result.get(0)); + default: + return new HashSet<>(result); + } + } + + /** + * Create a new ReadThreadingGraph using kmerSize for matching + * @param kmerSize must be >= 1 + */ + protected ReadThreadingGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples) { + super(kmerSize, new MyEdgeFactory(numPruningSamples)); + + if ( kmerSize < 1 ) throw new IllegalArgumentException("bad minkKmerSize " + kmerSize); + this.debugGraphTransformations = debugGraphTransformations; + this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly; + + resetToInitialState(); + } + + /** + * Reset this assembler to its initial state, so we can create another assembly with a different set of reads + */ + private void resetToInitialState() { + pending.clear(); + nonUniqueKmers = null; + uniqueKmers.clear(); + refSource = null; + alreadyBuilt = false; + } + + /** + * Add all the bases in sequence to the graph + * @param sequence a non-null sequence + * @param isRef is this the reference sequence?
+ */ + protected void addSequence(final byte[] sequence, final boolean isRef) { + addSequence("anonymous", sequence, isRef); + } + + /** + * Add all bases in sequence to this graph + * + * @see #addSequence(String, String, byte[], int, int, int, boolean) for full information + */ + public void addSequence(final String seqName, final byte[] sequence, final boolean isRef) { + addSequence(seqName, sequence, 1, isRef); + } + + /** + * Add all bases in sequence to this graph + * + * @see #addSequence(String, String, byte[], int, int, int, boolean) for full information + */ + public void addSequence(final String seqName, final byte[] sequence, final int count, final boolean isRef) { + addSequence(seqName, ANONYMOUS_SAMPLE, sequence, 0, sequence.length, count, isRef); + } + + /** + * Add bases in sequence to this graph + * + * @param seqName a useful seqName for this read, for debugging purposes + * @param sequence non-null sequence of bases + * @param start the first base offset in sequence that we should use for constructing the graph using this sequence, inclusive + * @param stop the last base offset in sequence that we should use for constructing the graph using this sequence, exclusive + * @param count the representative count of this sequence (to use as the weight) + * @param isRef is this the reference sequence. 
+ */ + public void addSequence(final String seqName, final String sampleName, final byte[] sequence, final int start, final int stop, final int count, final boolean isRef) { + // note that argument testing is taken care of in SequenceForKmers + if ( alreadyBuilt ) throw new IllegalStateException("Graph already built"); + + // get the list of sequences for this sample + List sampleSequences = pending.get(sampleName); + if ( sampleSequences == null ) { // need to create + sampleSequences = new LinkedList<>(); + pending.put(sampleName, sampleSequences); + } + + // add the new sequence to the list of sequences for sample + sampleSequences.add(new SequenceForKmers(seqName, sequence, start, stop, count, isRef)); + } + + /** + * Thread sequence seqForKmers through the current graph, updating the graph as appropriate + * @param seqForKmers a non-null sequence + */ + private void threadSequence(final SequenceForKmers seqForKmers) { + final int uniqueStartPos = findStart(seqForKmers); + if ( uniqueStartPos == -1 ) + return; + + final MultiDeBruijnVertex startingVertex = getOrCreateKmerVertex(seqForKmers.sequence, uniqueStartPos); + + // increase the counts of all edges incoming into the starting vertex supported by going back in sequence + if ( increaseCountsBackwards ) + increaseCountsInMatchedKmers(seqForKmers, startingVertex, startingVertex.getSequence(), kmerSize - 2); + + if ( debugGraphTransformations ) startingVertex.addRead(seqForKmers.name); + + // keep track of information about the reference source + if ( seqForKmers.isRef ) { + if ( refSource != null ) throw new IllegalStateException("Found two refSources! 
prev: " + refSource + ", new: " + startingVertex); + refSource = new Kmer(seqForKmers.sequence, seqForKmers.start, kmerSize); + } + + // loop over all of the bases in sequence, extending the graph by one base at each point, as appropriate + MultiDeBruijnVertex vertex = startingVertex; + for ( int i = uniqueStartPos + 1; i <= seqForKmers.stop - kmerSize; i++ ) { + vertex = extendChainByOne(vertex, seqForKmers.sequence, i, seqForKmers.count, seqForKmers.isRef); + if ( debugGraphTransformations ) vertex.addRead(seqForKmers.name); + } + } + + /** + * Find vertex and its position in seqForKmers where we should start assembling seqForKmers + * + * @param seqForKmers the sequence we want to thread into the graph + * @return the position of the starting vertex in seqForKmer, or -1 if it cannot find one + */ + protected int findStart(final SequenceForKmers seqForKmers) { + if ( seqForKmers.isRef ) + return 0; + + for ( int i = seqForKmers.start; i < seqForKmers.stop - kmerSize; i++ ) { + final Kmer kmer1 = new Kmer(seqForKmers.sequence, i, kmerSize); + if ( !nonUniqueKmers.contains(kmer1) ) + return i; + } + + return -1; + } + + /** + * Build the read threaded assembly graph if it hasn't already been constructed from the sequences that have + * been added to the graph. 
+ */ + public void buildGraphIfNecessary() { + if ( alreadyBuilt ) return; + + // determine the kmer size we'll use, and capture the set of nonUniques for that kmer size + final NonUniqueResult result = determineKmerSizeAndNonUniques(kmerSize, kmerSize); + nonUniqueKmers = result.nonUniques; + + if ( DEBUG_NON_UNIQUE_CALC ) { + logger.info("using " + kmerSize + " kmer size for this assembly with the following non-uniques"); + } + + // go through the pending sequences, and add them to the graph + for ( final List sequencesForSample : pending.values() ) { + for ( final SequenceForKmers sequenceForKmers : sequencesForSample ) { + threadSequence(sequenceForKmers); + if ( WRITE_GRAPH ) printGraph(new File("threading." + counter++ + "." + sequenceForKmers.name.replace(" ", "_") + ".dot"), 0); + } + + // flush the single sample edge values from the graph + for ( final MultiSampleEdge e : edgeSet() ) e.flushSingleSampleMultiplicity(); + } + + // clear + pending.clear(); + alreadyBuilt = true; + for (final MultiDeBruijnVertex v : uniqueKmers.values()) + v.setAdditionalInfo(v.additionalInfo() + "+"); + } + + + @Override + public boolean removeVertex(MultiDeBruijnVertex V) { + final boolean result = super.removeVertex(V); + if (result) { + final byte[] sequence = V.getSequence(); + final Kmer kmer = new Kmer(sequence); + uniqueKmers.remove(kmer); + } + return result; + } + + + public void removeSingletonOrphanVertices() { + // Run through the graph and clean up singular orphaned nodes + final List verticesToRemove = new LinkedList<>(); + for( final MultiDeBruijnVertex v : vertexSet() ) { + if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) { + verticesToRemove.add(v); + } + } + this.removeVertex(null); + removeAllVertices(verticesToRemove); + } + + /** + * @return true if the graph has cycles, false otherwise + */ + public boolean hasCycles() { + return new CycleDetector<>(this).detectCycles(); + } + + /** + * Does the graph not have enough complexity? 
We define low complexity as a situation where the number + * of non-unique kmers is more than 20% of the total number of kmers. + * + * @return true if the graph has low complexity, false otherwise + */ + public boolean isLowComplexity() { + return nonUniqueKmers.size() * 4 > uniqueKmers.size(); + } + + /** structure that keeps track of the non-unique kmers for a given kmer size */ + private static class NonUniqueResult { + final Set nonUniques; + final int kmerSize; + + private NonUniqueResult(Set nonUniques, int kmerSize) { + this.nonUniques = nonUniques; + this.kmerSize = kmerSize; + } + } + + /** + * Compute the smallest kmer size >= minKmerSize and <= maxKmerSize that has no non-unique kmers + * among all sequences added to the current graph. Will always return a result for maxKmerSize if + * all smaller kmers had non-unique kmers. + * + * @param minKmerSize the minimum kmer size to consider when constructing the graph + * @param maxKmerSize the maximum kmer size to consider + * @return a non-null NonUniqueResult + */ + protected NonUniqueResult determineKmerSizeAndNonUniques(final int minKmerSize, final int maxKmerSize) { + final Collection withNonUniques = getAllPendingSequences(); + final Set nonUniqueKmers = new HashSet<>(); + + // go through the sequences and determine which kmers aren't unique within each read + int kmerSize = minKmerSize; + for ( ; kmerSize <= maxKmerSize; kmerSize++) { + // clear out set of non-unique kmers + nonUniqueKmers.clear(); + + // loop over all sequences that have non-unique kmers in them from the previous iterator + final Iterator it = withNonUniques.iterator(); + while ( it.hasNext() ) { + final SequenceForKmers sequenceForKmers = it.next(); + + // determine the non-unique kmers for this sequence + final Collection nonUniquesFromSeq = determineNonUniqueKmers(sequenceForKmers, kmerSize); + if ( nonUniquesFromSeq.isEmpty() ) { + // remove this sequence from future consideration + it.remove(); + } else { + // keep track of the 
non-uniques for this kmerSize, and keep it in the list of sequences that have non-uniques + nonUniqueKmers.addAll(nonUniquesFromSeq); + } + } + + if ( nonUniqueKmers.isEmpty() ) + // this kmerSize produces no non-unique sequences, so go ahead and use it for our assembly + break; + } + + // necessary because the loop breaks with kmerSize = max + 1 + return new NonUniqueResult(nonUniqueKmers, Math.min(kmerSize, maxKmerSize)); + } + + /** + * Get the collection of all sequences for kmers across all samples in no particular order + * @return non-null Collection + */ + private Collection getAllPendingSequences() { + final LinkedList result = new LinkedList<>(); + for ( final List oneSampleWorth : pending.values() ) result.addAll(oneSampleWorth); + return result; + } + + /** + * Get the collection of non-unique kmers from sequence for kmer size kmerSize + * @param seqForKmers a sequence to get kmers from + * @param kmerSize the size of the kmers + * @return a non-null collection of non-unique kmers in sequence + */ + private Collection determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize) { + // count up occurrences of kmers within each read + final KMerCounter counter = new KMerCounter(kmerSize); + final int stopPosition = seqForKmers.stop - kmerSize; + for ( int i = 0; i <= stopPosition; i++ ) { + final Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize); + counter.addKmer(kmer, 1); + } + + return counter.getKmersWithCountsAtLeast(2); + } + + /** + * Convert this kmer graph to a simple sequence graph. + * + * Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer + * graph. 
Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence + * + * @return a newly allocated SequenceGraph + */ + // TODO -- should override base class method + public SeqGraph convertToSequenceGraph() { + buildGraphIfNecessary(); + + final SeqGraph seqGraph = new SeqGraph(kmerSize); + final Map vertexMap = new HashMap<>(); + + + // create all of the equivalent seq graph vertices + for ( final MultiDeBruijnVertex dv : vertexSet() ) { + final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv))); + sv.setAdditionalInfo(dv.additionalInfo()); + vertexMap.put(dv, sv); + seqGraph.addVertex(sv); + } + + // walk through the nodes and connect them to their equivalent seq vertices + for( final MultiSampleEdge e : edgeSet() ) { + final SeqVertex seqInV = vertexMap.get(getEdgeSource(e)); + final SeqVertex seqOutV = vertexMap.get(getEdgeTarget(e)); + //logger.info("Adding edge " + seqInV + " -> " + seqOutV); + seqGraph.addEdge(seqInV, seqOutV, new BaseEdge(e.isRef(), e.getMultiplicity())); + } + + return seqGraph; + } + + private void increaseCountsInMatchedKmers(final SequenceForKmers seqForKmers, + final MultiDeBruijnVertex vertex, + final byte[] originalKmer, + final int offset) { + if ( offset == -1 ) return; + + for ( final MultiSampleEdge edge : incomingEdgesOf(vertex) ) { + final MultiDeBruijnVertex prev = getEdgeSource(edge); + final byte suffix = prev.getSuffix(); + final byte seqBase = originalKmer[offset]; +// logger.warn(String.format("Increasing counts for %s -> %s via %s at %d with suffix %s vs. 
%s", +// prev, vertex, edge, offset, (char)suffix, (char)seqBase)); + if ( suffix == seqBase && (increaseCountsThroughBranches || inDegreeOf(vertex) == 1) ) { + edge.incMultiplicity(seqForKmers.count); + increaseCountsInMatchedKmers(seqForKmers, prev, originalKmer, offset-1); + } + } + } + + /** + * Get the vertex for the kmer in sequence starting at start + * @param sequence the sequence + * @param start the position of the kmer start + * @return a non-null vertex + */ + private MultiDeBruijnVertex getOrCreateKmerVertex(final byte[] sequence, final int start) { + final Kmer kmer = new Kmer(sequence, start, kmerSize); + final MultiDeBruijnVertex vertex = getUniqueKmerVertex(kmer, true); + return ( vertex != null ) ? vertex : createVertex(kmer); + } + + /** + * Get the unique vertex for kmer, or null if not possible. + * + * @param allowRefSource if true, we will allow kmer to match the reference source vertex + * @return a vertex for kmer, or null if it's not unique + */ + private MultiDeBruijnVertex getUniqueKmerVertex(final Kmer kmer, final boolean allowRefSource) { + if ( ! allowRefSource && kmer.equals(refSource) ) return null; + + return uniqueKmers.get(kmer); + } + + + /** + * Create a new vertex for kmer. Add it to the uniqueKmers map if appropriate. + * + * kmer must not have a entry in unique kmers, or an error will be thrown + * + * @param kmer the kmer we want to create a vertex for + * @return the non-null created vertex + */ + private MultiDeBruijnVertex createVertex(final Kmer kmer) { + final MultiDeBruijnVertex newVertex = new MultiDeBruijnVertex(kmer.bases()); + final int prevSize = vertexSet().size(); + addVertex(newVertex); + + // make sure we aren't adding duplicates (would be a bug) + if ( vertexSet().size() != prevSize + 1) throw new IllegalStateException("Adding vertex " + newVertex + " to graph didn't increase the graph size"); + + // add the vertex to the unique kmer map, if it is in fact unique + if ( ! nonUniqueKmers.contains(kmer) && ! 
uniqueKmers.containsKey(kmer) ) // TODO -- not sure this last test is necessary + uniqueKmers.put(kmer, newVertex); + + return newVertex; + } + + /** + * Workhorse routine of the assembler. Given a sequence whose last vertex is anchored in the graph, extend + * the graph one bp according to the bases in sequence. + * + * @param prevVertex a non-null vertex where sequence was last anchored in the graph + * @param sequence the sequence we're threading through the graph + * @param kmerStart the start of the current kmer in graph we'd like to add + * @param count the number of observations of this kmer in graph (can be > 1 for GGA) + * @param isRef is this the reference sequence? + * @return a non-null vertex connecting prevVertex to in the graph based on sequence + */ + private MultiDeBruijnVertex extendChainByOne(final MultiDeBruijnVertex prevVertex, final byte[] sequence, final int kmerStart, final int count, final boolean isRef) { + final Set outgoingEdges = outgoingEdgesOf(prevVertex); + + final int nextPos = kmerStart + kmerSize - 1; + for ( final MultiSampleEdge outgoingEdge : outgoingEdges ) { + final MultiDeBruijnVertex target = getEdgeTarget(outgoingEdge); + if ( target.getSuffix() == sequence[nextPos] ) { + // we've got a match in the chain, so simply increase the count of the edge by 1 and continue + outgoingEdge.incMultiplicity(count); + return target; + } + } + + // none of our outgoing edges had our unique suffix base, so we check for an opportunity to merge back in + final Kmer kmer = new Kmer(sequence, kmerStart, kmerSize); + final MultiDeBruijnVertex uniqueMergeVertex = getUniqueKmerVertex(kmer, false); + + if ( isRef && uniqueMergeVertex != null ) + throw new IllegalStateException("Found a unique vertex to merge into the reference graph " + prevVertex + " -> " + uniqueMergeVertex); + + // either use our unique merge vertex, or create a new one in the chain + final MultiDeBruijnVertex nextVertex = uniqueMergeVertex == null ? 
createVertex(kmer) : uniqueMergeVertex; + addEdge(prevVertex, nextVertex, ((MyEdgeFactory)getEdgeFactory()).createEdge(isRef, count)); + return nextVertex; + } + + /** + * Add the given read to the sequence graph. Ultimately the read will get sent through addSequence(), but first + * this method ensures we only use high quality bases and accounts for reduced reads, etc. + * + * @param read a non-null read + */ + protected void addRead(final GATKSAMRecord read) { + final byte[] sequence = read.getReadBases(); + final byte[] qualities = read.getBaseQualities(); + + int lastGood = -1; // the index of the last good base we've seen + for( int end = 0; end <= sequence.length; end++ ) { + if ( end == sequence.length || ! baseIsUsableForAssembly(sequence[end], qualities[end]) ) { + // the first good base is at lastGood, can be -1 if last base was bad + final int start = lastGood; + // the stop base is end - 1 (if we're not at the end of the sequence) + final int len = end - start; + + if ( start != -1 && len >= kmerSize ) { + // if the sequence is long enough to get some value out of, add it to the graph + final String name = read.getReadName() + "_" + start + "_" + end; + addSequence(name, read.getReadGroup().getSample(), read.getReadBases(), start, end, 1, false); + } + + lastGood = -1; // reset the last good base + } else if ( lastGood == -1 ) { + lastGood = end; // we're at a good base, the last good one is us + } + } + } + + /** + * Determines whether a base can safely be used for assembly. + * Currently disallows Ns and/or those with low quality + * + * @param base the base under consideration + * @param qual the quality of that base + * @return true if the base can be used for assembly, false otherwise + */ + protected boolean baseIsUsableForAssembly(final byte base, final byte qual) { + return base != BaseUtils.Base.N.base && qual >= minBaseQualityToUseInAssembly; + } + + /** + * Get the set of non-unique kmers in this graph. 
For debugging purposes + * @return a non-null set of kmers + */ + protected Set getNonUniqueKmers() { + return nonUniqueKmers; + } + + @Override + public String toString() { + return "ReadThreadingAssembler{" + + "kmerSize=" + kmerSize + + '}'; + } + + + @Override + public MultiDeBruijnVertex findKmer(final Kmer k) { + return uniqueKmers.get(k); + } + + /************************************************************* + * Simple string representation support for testing purposes * + *************************************************************/ + + private static final Pattern PROPERTIES_PATTERN = Pattern.compile("^\\s*\\[[^\\]]*\\]"); + private static final Pattern PATH_PATTERN = Pattern.compile("\\{((\\S+):)?([^\\}]*)\\}"); + private static final Pattern KMERSIZE_EXTRACTOR_PATTERN = Pattern.compile("^\\s*\\[[^\\]]*(ks|kmerSize)\\s*=\\s*(\\d+)\\s*[,\\]]"); + + + /** + * Constructs a read-threading-graph for a string representation. + * + *

+ * Note: only used for testing. + * See {@link HaplotypeGraphUnitTest} for examples. + *

+ * @param s the string representation of the graph; never {@code null}. + */ + public ReadThreadingGraph(final String s) { + super(kmerSizeFromString(s),new MyEdgeFactory(1)); + debugGraphTransformations = false; + minBaseQualityToUseInAssembly = 0; + applyString(s); + alreadyBuilt = true; + } + + /** + * Obtain the kmer size for the string representation. + * @param str the source string representation. + * @return 1 or greater. + * @throws IllegalArgumentException if {@code str} does not contain a valid representation. + */ + private static int kmerSizeFromString(final String str) { + final Matcher matcher = KMERSIZE_EXTRACTOR_PATTERN.matcher(str); + if (matcher.find()) { + return Integer.parseInt(matcher.group(2)); + } else + throw new IllegalArgumentException("the input graph spec does not indicate the kmerSize"); + } + + /** + * Apply description string into the graph. + * + *

+ * Note: this is done just for testing purposes. + * See {@link HaplotypeGraphUnitTest} for examples. + *

+ * @param str the string representation. + */ + private void applyString(final String str) { + final Matcher propertiesSectionMatcher = PROPERTIES_PATTERN.matcher(str); + final int pathStart = propertiesSectionMatcher.find() ? propertiesSectionMatcher.end() : 0; + + final String pathString = str.substring(pathStart); + final Matcher pathMatcher = PATH_PATTERN.matcher(pathString); + + boolean referenceFound = false; + final Map vertexById = new HashMap<>(); + + // Loop between path strings and add them one by one. + while (pathMatcher.find()) { + final String label = pathMatcher.group(2); + final boolean isReference = (label != null && label.equals("REF")); + if (referenceFound) { + if (isReference) + throw new IllegalArgumentException("there are two reference paths"); + } else if ( isReference ) { + referenceFound = true; + } + + // Divide each path into its elements getting a list of sequences and labels if applies: + final String elementsString = pathMatcher.group(3); + final String[] elements = elementsString.split("\\s*->\\s*"); + if (elements.length == 0) + throw new IllegalArgumentException("empty path not allowed"); + final String[] seqs = new String[elements.length]; + final String[] ids = new String[elements.length]; + for (int i = 0; i < elements.length; i++) { + ids[i] = pathElementId(elements[i]); + seqs[i] = pathElementSeq(elements[i]); + if (seqs[i].isEmpty() && ids[i] == null) + throw new IllegalArgumentException("path with empty element without an id"); + } + final boolean isSource = ids[0] == null || !vertexById.containsKey(ids[0]); + if (isSource && seqs[0].length() != kmerSize) + throw new IllegalArgumentException("source sequence length must be the same as the kmerSize " + + ids[0] + " " + seqs[0] + " " + pathMatcher.group()); + final MultiDeBruijnVertex firstVertex; + if (ids[0] != null && vertexById.containsKey(ids[0])) + firstVertex = vertexById.get(ids[0]); + else { + firstVertex = new MultiDeBruijnVertex(seqs[0].getBytes()); + 
addVertex(firstVertex); + if (ids[0] != null) + vertexById.put(ids[0],firstVertex); + } + if (!seqs[0].isEmpty() && + ((isSource && !firstVertex.getSequenceString().equals(seqs[0])) + || (!isSource && firstVertex.getSuffix() != seqs[0].getBytes()[0]))) + throw new IllegalArgumentException("mismatched first element sequence"); + + MultiDeBruijnVertex lastVertex = firstVertex; + for (int i = 1; i < elements.length; i++) { + if (seqs[i].length() > 1) + throw new IllegalArgumentException("non-source vertex sequence must have length 1"); + final MultiDeBruijnVertex nextVertex; + if (ids[i] == null || !vertexById.containsKey(ids[i])) { + final Set nextVertices = getNextVertices(lastVertex,seqs[i].getBytes()[0]); + if (nextVertices.size() == 0) { + nextVertex = new MultiDeBruijnVertex(extendSequence(lastVertex.getSequence(),seqs[i].getBytes()[0])); + addVertex(nextVertex); + } else { + nextVertex = nextVertices.iterator().next(); + } + if (ids[i] != null) + vertexById.put(ids[i],nextVertex); + } else { + nextVertex = vertexById.get(ids[i]); + } + final MultiSampleEdge edge = addEdge(lastVertex,nextVertex); + if (isReference) edge.setIsRef(true); + lastVertex = nextVertex; + } + } + } + + private static String pathElementId(final String element) { + final int parentesysPos = element.indexOf('('); + + if (parentesysPos == -1) + return null; + + final int closeParentesysPos = element.lastIndexOf(')'); + if (closeParentesysPos == -1) + throw new IllegalArgumentException("non-closed id parantesys found in element: " + element); + final String result = element.substring(parentesysPos + 1,closeParentesysPos).trim(); + if (result.isEmpty()) + throw new IllegalArgumentException("empty id found in element: " + element); + return result; + } + + /** + * Returns the sequence part of a path element in the string representation. + * @param element the query element. + * @return never {@code null}; possibly empty.
+ */ + private static String pathElementSeq(final String element) { + final int parentesysPos = element.indexOf('('); + + if (parentesysPos == -1) + return element.trim(); + + return element.substring(0,parentesysPos).trim(); + } + + /** + * Add a base to the end of a byte sequence. + * @param sequence sequence where to add the base to. + * @param b base to add. + * @return never {@code null}, a new array each time. + */ + private static byte[] extendSequence(final byte[] sequence, final byte b) { + final byte[] result = new byte[sequence.length]; + System.arraycopy(sequence,1,result,0,sequence.length - 1); + result[result.length - 1] = b; + return result; + } +} \ No newline at end of file diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java new file mode 100644 index 000000000..e55772657 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java @@ -0,0 +1,80 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +/** + * Keeps track of the information needed to add a sequence to the read threading assembly graph + * + * User: depristo + * Date: 4/18/13 + * Time: 8:59 AM + * To change this template use File | Settings | File Templates. 
+ */ +final class SequenceForKmers { + final String name; + final byte[] sequence; + final int start, stop; + final int count; + final boolean isRef; + + /** + * Create a new sequence for creating kmers + */ + SequenceForKmers(final String name, byte[] sequence, int start, int stop, int count, boolean ref) { + if ( start < 0 ) throw new IllegalArgumentException("Invalid start " + start); + if ( stop < start ) throw new IllegalArgumentException("Invalid stop " + stop); + if ( sequence == null ) throw new IllegalArgumentException("Sequence is null "); + if ( count < 1 ) throw new IllegalArgumentException("Invalid count " + count); + + this.name = name; + this.sequence = sequence; + this.start = start; + this.stop = stop; + this.count = count; + this.isRef = ref; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java new file mode 100644 index 
000000000..a9b14e40b --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -0,0 +1,1645 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.indels; + +import net.sf.samtools.*; +import net.sf.samtools.util.RuntimeIOException; +import net.sf.samtools.util.SequenceUtil; +import net.sf.samtools.util.StringUtil; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.BAQMode; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.smithwaterman.Parameters; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.Utils; 
+import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.NWaySAMFileWriter; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.text.TextFormattingUtils; +import org.broadinstitute.sting.utils.text.XReadLines; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileWriter; +import java.io.IOException; +import java.util.*; + +/** + * Performs local realignment of reads to correct misalignments due to the presence of indels. + * + *

+ * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases + * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion + * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching + * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, + * it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are + * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, + * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus + * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an + * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and + * specifically identify indels. + *

+ *
    There are 2 steps to the realignment process: + *
  1. Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool)
  2. + *
  3. Running the realigner over those intervals (IndelRealigner)
  4. + *
+ *

+ * For more details, see http://www.broadinstitute.org/gatk/guide/article?id=38 + *

+ * + *

Input

+ *

+ * One or more aligned BAM files and optionally one or more lists of known indels. + *

+ * + *

Output

+ *

+ * A realigned version of your input BAM file(s). + *

+ * + *

Example

+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -T IndelRealigner \
+ *   -R ref.fasta \
+ *   -I input.bam \
+ *   -targetIntervals intervalListFromRTC.intervals \
+ *   -o realignedBam.bam \
+ *   [-known /path/to/indels.vcf] \
+ *   [-compress 0]    (this argument recommended to speed up the process *if* this is only a temporary file; otherwise, use the default value)
+ * 
+ * + *

Caveats

+ * + *
  • + * An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step. + *
  • + * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them + * (or with reads from similar technologies). + *
+ * + * @author ebanks + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) +@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.ON_OUTPUT) +public class IndelRealigner extends ReadWalker { + + public static final String ORIGINAL_CIGAR_TAG = "OC"; + public static final String ORIGINAL_POSITION_TAG = "OP"; + public static final String PROGRAM_RECORD_NAME = "GATK IndelRealigner"; + + public enum ConsensusDeterminationModel { + /** + * Uses only indels from a provided ROD of known indels. + */ + KNOWNS_ONLY, + /** + * Additionally uses indels already present in the original alignments of the reads. + */ + USE_READS, + /** + * Additionally uses 'Smith-Waterman' to generate alternate consenses. + */ + USE_SW + } + + /** + * Any number of VCF files representing known indels to be used for constructing alternate consenses. + * Could be e.g. dbSNP and/or official 1000 Genomes indel calls. Non-indel variants in these files will be ignored. + */ + @Input(fullName="knownAlleles", shortName = "known", doc="Input VCF file(s) with known indels", required=false) + public List> known = Collections.emptyList(); + + /** + * The interval list output from the RealignerTargetCreator tool using the same bam(s), reference, and known indel file(s). + */ + @Input(fullName="targetIntervals", shortName="targetIntervals", doc="Intervals file output from RealignerTargetCreator", required=true) + protected IntervalBinding intervalsFile = null; + + /** + * This term is equivalent to "significance" - i.e. is the improvement significant enough to merit realignment? Note that this number + * should be adjusted based on your particular data set. For low coverage and/or when looking for indels with low allele frequency, + * this number should be smaller. 
+ */ + @Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false) + protected double LOD_THRESHOLD = 5.0; + + /** + * The realigned bam file. + */ + @Output(required=false, doc="Output bam", defaultToStdout=false) + protected StingSAMFileWriter writer = null; + protected ConstrainedMateFixingManager manager = null; + protected SAMFileWriter writerToUse = null; + + /** + * We recommend that users run with USE_READS when trying to realign high quality longer read data mapped with a gapped aligner; + * Smith-Waterman is really only necessary when using an ungapped aligner (e.g. MAQ in the case of single-end read data). + */ + @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "Determines how to compute the possible alternate consenses", required = false) + public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_READS; + + + // ADVANCED OPTIONS FOLLOW + + /** + * For expert users only! This is similar to the argument in the RealignerTargetCreator walker. The point here is that the realigner + * will only proceed with the realignment (even above the given threshold) if it minimizes entropy among the reads (and doesn't simply + * push the mismatch column to another position). This parameter is just a heuristic and should be adjusted based on your particular data set. + */ + @Advanced + @Argument(fullName="entropyThreshold", shortName="entropy", doc="Percentage of mismatches at a locus to be considered having high entropy (0.0 < entropy <= 1.0)", required=false) + protected double MISMATCH_THRESHOLD = 0.15; + + /** + * For expert users only! To minimize memory consumption you can lower this number (but then the tool may skip realignment on regions with too much coverage; + * and if the number is too low, it may generate errors during realignment). Just make sure to give Java enough memory! 4Gb should be enough with the default value. 
+ */ + @Advanced + @Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter", required=false) + protected int MAX_RECORDS_IN_MEMORY = 150000; + + /** + * For expert users only! + */ + @Advanced + @Argument(fullName="maxIsizeForMovement", shortName="maxIsize", doc="maximum insert size of read pairs that we attempt to realign", required=false) + protected int MAX_ISIZE_FOR_MOVEMENT = 3000; + + /** + * For expert users only! + */ + @Advanced + @Argument(fullName="maxPositionalMoveAllowed", shortName="maxPosMove", doc="Maximum positional move in basepairs that a read can be adjusted during realignment", required=false) + protected int MAX_POS_MOVE_ALLOWED = 200; + + /** + * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. + */ + @Advanced + @Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="Max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false) + protected int MAX_CONSENSUSES = 30; + + /** + * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. + */ + @Advanced + @Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="Max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false) + protected int MAX_READS_FOR_CONSENSUSES = 120; + + /** + * For expert users only! If this value is exceeded at a given interval, realignment is not attempted and the reads are passed to the output file(s) as-is. + * If you need to allow more reads (e.g. with very deep coverage) regardless of memory, use a higher number. 
+ */ + @Advanced + @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="Max reads allowed at an interval for realignment", required=false) + protected int MAX_READS = 20000; + + @Advanced + @Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam") + protected boolean NO_ORIGINAL_ALIGNMENT_TAGS = false; + + /** + * Reads from all input files will be realigned together, but then each read will be saved in the output file corresponding to the input file that + * the read came from. There are two ways to generate output bam file names: 1) if the value of this argument is a general string (e.g. '.cleaned.bam'), + * then extensions (".bam" or ".sam") will be stripped from the input file names and the provided string value will be pasted on instead; 2) if the + * value ends with a '.map' (e.g. input_output.map), then the two-column tab-separated file with the specified name must exist and list unique output + * file name (2nd column) for each input file name (1st column). + * + * Note that some GATK arguments do NOT work in conjunction with nWayOut (e.g. --disable_bam_indexing). + */ + @Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file (not compatible with -output)") + protected String N_WAY_OUT = null; + + @Hidden + @Argument(fullName="generate_nWayOut_md5s",doc="Generate md5sums for BAMs") + protected boolean generateMD5s = false; + + // DEBUGGING OPTIONS FOLLOW + + @Hidden + @Argument(fullName="check_early",shortName="check_early",required=false,doc="Do early check of reads against existing consensuses") + protected boolean CHECKEARLY = false; + + @Hidden + @Argument(fullName="noPGTag", shortName="noPG", required=false, + doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. 
This option is required in order to pass integration tests.") + protected boolean NO_PG_TAG = false; + + @Hidden + @Argument(fullName="keepPGTags", shortName="keepPG", required=false, + doc="Keep older PG tags left in the bam header by previous runs of this tool (by default, all these "+ + "historical tags will be replaced by the latest tag generated in the current run).") + protected boolean KEEP_ALL_PG_RECORDS = false; + + @Hidden + @Output(fullName="indelsFileForDebugging", shortName="indels", required=false, defaultToStdout=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY") + protected String OUT_INDELS = null; + + @Hidden + @Output(fullName="statisticsFileForDebugging", shortName="stats", doc="print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required=false, defaultToStdout=false) + protected String OUT_STATS = null; + + @Hidden + @Output(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false, defaultToStdout=false) + protected String OUT_SNPS = null; + + // fasta reference reader to supplement the edges of the reference sequence + private CachingIndexedFastaSequenceFile referenceReader; + + // the intervals input by the user + private Iterator intervals = null; + + // the current interval in the list + private GenomeLoc currentInterval = null; + private boolean sawReadInCurrentInterval = false; + + // the reads and known indels that fall into the current interval + private ReadBin readsToClean; + private final ArrayList readsNotToClean = new ArrayList(); + private final ArrayList knownIndelsToTry = new ArrayList(); + private final HashSet indelRodsSeen = new HashSet(); + private final HashSet readsActuallyCleaned = new HashSet(); + + private static final int MAX_QUAL = 99; + + // fraction of mismatches that need to no longer mismatch for a column to be considered cleaned + private 
static final double MISMATCH_COLUMN_CLEANED_FRACTION = 0.75; + + private final static Parameters swParameters = new Parameters(30.0, -10.0, -10.0, -2.0); + + // reference base padding size + // TODO -- make this a command-line argument if the need arises + private static final int REFERENCE_PADDING = 30; + + // other output files + private FileWriter indelOutput = null; + private FileWriter statsOutput = null; + private FileWriter snpsOutput = null; + + //###protected Map nwayWriters = null; + + + // debug info for lazy SW evaluation: + private long exactMatchesFound = 0; // how many reads exactly matched a consensus we already had + private long SWalignmentRuns = 0; // how many times (=for how many reads) we ran SW alignment + private long SWalignmentSuccess = 0; // how many SW alignments were "successful" (i.e. found a workable indel and resulted in non-null consensus) + + private Map loadFileNameMap(String mapFile) { + Map fname_map = new HashMap(); + + try { + + XReadLines reader = new XReadLines(new File(mapFile),true); + for ( String line : reader ) { + if ( line.length() == 0 ) continue; + + String fields[] = line.split("\t"); + + if ( fields.length != 2 ) + throw new UserException.BadInput("Input-output map file must have exactly two columns. Offending line:\n"+line); + if ( fields[0].length() == 0 || fields[1].length() == 0 ) + throw new UserException.BadInput("Input-output map file can not have empty strings in either column. 
Offending line:\n"+line); + + if ( fname_map.containsKey(fields[0]) ) + throw new UserException.BadInput("Input-output map file contains duplicate entries for input name "+fields[0]); + if ( fname_map.containsValue(fields[1]) ) + throw new UserException.BadInput("Input-output map file maps multiple entries onto single output name "+fields[1]); + + fname_map.put(fields[0],fields[1]); + } + } catch (IOException e) { + throw new StingException("I/O Error while reading input-output map file "+N_WAY_OUT+": "+e.getMessage()); + } + return fname_map; + } + + public void initialize() { + readsToClean = new ReadBin(getToolkit().getGenomeLocParser(), REFERENCE_PADDING); + + if ( N_WAY_OUT == null && writer == null ) { + throw new UserException.CommandLineException("Either -o or -nWayOut must be specified"); + } + if ( N_WAY_OUT != null && writer != null ) { + throw new UserException.CommandLineException("-o and -nWayOut can not be used simultaneously"); + } + if ( LOD_THRESHOLD < 0.0 ) + throw new RuntimeException("LOD threshold cannot be a negative number"); + if ( MISMATCH_THRESHOLD <= 0.0 || MISMATCH_THRESHOLD > 1.0 ) + throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1"); + + try { + referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); + } + catch(FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile,ex); + } + + intervals = intervalsFile.getIntervals(getToolkit()).iterator(); + + currentInterval = intervals.hasNext() ? 
intervals.next() : null; + + if ( N_WAY_OUT != null ) { + boolean createIndex = true; + + if ( N_WAY_OUT.toUpperCase().endsWith(".MAP") ) { + writerToUse = new NWaySAMFileWriter(getToolkit(),loadFileNameMap(N_WAY_OUT), + SAMFileHeader.SortOrder.coordinate,true, createIndex, generateMD5s,createProgramRecord(),KEEP_ALL_PG_RECORDS); + } else { + writerToUse = new NWaySAMFileWriter(getToolkit(),N_WAY_OUT,SAMFileHeader.SortOrder.coordinate,true, + createIndex, generateMD5s,createProgramRecord(),KEEP_ALL_PG_RECORDS); + } + } else { + // set up the output writer + setupWriter(getToolkit().getSAMFileHeader()); + writerToUse = writer; + } + manager = new ConstrainedMateFixingManager(writerToUse, getToolkit().getGenomeLocParser(), MAX_ISIZE_FOR_MOVEMENT, MAX_POS_MOVE_ALLOWED, MAX_RECORDS_IN_MEMORY); + + if ( OUT_INDELS != null ) { + try { + indelOutput = new FileWriter(new File(OUT_INDELS)); + } catch (Exception e) { + logger.error("Failed to create output file "+ OUT_INDELS+". Indel output will be suppressed"); + logger.error(e.getMessage()); + indelOutput = null; + } + } + if ( OUT_STATS != null ) { + try { + statsOutput = new FileWriter(new File(OUT_STATS)); + } catch (Exception e) { + logger.error("Failed to create output file "+ OUT_STATS+". Cleaning stats output will be suppressed"); + logger.error(e.getMessage()); + statsOutput = null; + } + } + if ( OUT_SNPS != null ) { + try { + snpsOutput = new FileWriter(new File(OUT_SNPS)); + } catch (Exception e) { + logger.error("Failed to create output file "+ OUT_SNPS+". 
Cleaning snps output will be suppressed"); + logger.error(e.getMessage()); + snpsOutput = null; + } + } + } + + private void setupWriter(SAMFileHeader header) { + + if ( !NO_PG_TAG ) { + final SAMProgramRecord programRecord = createProgramRecord(); + + List oldRecords = header.getProgramRecords(); + List newRecords = new ArrayList(oldRecords.size()+1); + for ( SAMProgramRecord record : oldRecords ) { + if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) || KEEP_ALL_PG_RECORDS ) + newRecords.add(record); + } + newRecords.add(programRecord); + header.setProgramRecords(newRecords); + } + + writer.writeHeader(header); + writer.setPresorted(true); + } + + + private SAMProgramRecord createProgramRecord() { + if ( NO_PG_TAG ) return null; + + final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); + final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); + try { + final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version"); + programRecord.setProgramVersion(version); + } catch (MissingResourceException e) { + // this is left empty on purpose (perhaps Andrey knows why?) 
+ } + programRecord.setCommandLine(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this)); + return programRecord; + } + + private void emit(final GATKSAMRecord read) { + + // check to see whether the read was modified by looking at the temporary tag + boolean wasModified = readsActuallyCleaned.contains(read); + + try { + manager.addRead(read, wasModified); + } catch (RuntimeIOException e) { + throw new UserException.ErrorWritingBamFile(e.getMessage()); + } + } + + private void emitReadLists() { + // pre-merge lists to sort them in preparation for constrained SAMFileWriter + readsNotToClean.addAll(readsToClean.getReads()); + ReadUtils.sortReadsByCoordinate(readsNotToClean); + manager.addReads(readsNotToClean, readsActuallyCleaned); + readsToClean.clear(); + readsNotToClean.clear(); + readsActuallyCleaned.clear(); + } + + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { + if ( currentInterval == null ) { + emit(read); + return 0; + } + + // edge case: when the last target interval abuts the end of the genome, we'll get one of the + // unmapped reads while the currentInterval still isn't null. We need to trigger the cleaning + // at this point without trying to create a GenomeLoc. 
+ if ( read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ) { + cleanAndCallMap(ref, read, metaDataTracker, null); + return 0; + } + + GenomeLoc readLoc = getToolkit().getGenomeLocParser().createGenomeLoc(read); + // hack to get around unmapped reads having screwy locations + if ( readLoc.getStop() == 0 ) + readLoc = getToolkit().getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart()); + + if ( readLoc.isBefore(currentInterval) ) { + if ( !sawReadInCurrentInterval ) + emit(read); + else + readsNotToClean.add(read); + } + else if ( readLoc.overlapsP(currentInterval) ) { + sawReadInCurrentInterval = true; + + if ( doNotTryToClean(read) ) { + readsNotToClean.add(read); + } else { + readsToClean.add(read); + + // add the rods to the list of known variants + populateKnownIndels(metaDataTracker); + } + + if ( readsToClean.size() + readsNotToClean.size() >= MAX_READS ) { + logger.info("Not attempting realignment in interval " + currentInterval + " because there are too many reads."); + abortCleanForCurrentInterval(); + } + } + else { // the read is past the current interval + logger.debug(currentInterval.toString() + "\t" + read.getAlignmentStart() ); + cleanAndCallMap(ref, read, metaDataTracker, readLoc); + } + + return 0; + } + + private void abortCleanForCurrentInterval() { + emitReadLists(); + currentInterval = intervals.hasNext() ? 
intervals.next() : null; + sawReadInCurrentInterval = false; + } + + private boolean doNotTryToClean(GATKSAMRecord read) { + return read.getReadUnmappedFlag() || + read.getNotPrimaryAlignmentFlag() || + read.getReadFailsVendorQualityCheckFlag() || + read.getMappingQuality() == 0 || + read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || + ConstrainedMateFixingManager.iSizeTooBigToMove(read, MAX_ISIZE_FOR_MOVEMENT) || + ReadUtils.is454Read(read) || + ReadUtils.isIonRead(read); + // TODO -- it would be nice if we could use indels from 454/Ion reads as alternate consenses + } + + private void cleanAndCallMap(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker, GenomeLoc readLoc) { + if ( readsToClean.size() > 0 ) { + GenomeLoc earliestPossibleMove = getToolkit().getGenomeLocParser().createGenomeLoc(readsToClean.getReads().get(0)); + if ( manager.canMoveReads(earliestPossibleMove) ) + clean(readsToClean); + } + knownIndelsToTry.clear(); + indelRodsSeen.clear(); + + emitReadLists(); + try { + do { + currentInterval = intervals.hasNext() ? intervals.next() : null; + + } while ( currentInterval != null && (readLoc == null || currentInterval.isBefore(readLoc)) ); + } catch (ReviewedStingException e) { + throw new UserException.MissortedFile(new File(intervalsFile.getSource()), " *** Are you sure that your interval file is sorted? If not, you must use the --targetIntervalsAreNotSorted argument. 
***", e); + } + sawReadInCurrentInterval = false; + + // call back into map now that the state has been updated + map(ref, read, metaDataTracker); + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + if ( readsToClean.size() > 0 ) { + GenomeLoc earliestPossibleMove = getToolkit().getGenomeLocParser().createGenomeLoc(readsToClean.getReads().get(0)); + if ( manager.canMoveReads(earliestPossibleMove) ) + clean(readsToClean); + emitReadLists(); + } else if ( readsNotToClean.size() > 0 ) { + emitReadLists(); + } + + knownIndelsToTry.clear(); + indelRodsSeen.clear(); + + if ( OUT_INDELS != null ) { + try { + indelOutput.close(); + } catch (Exception e) { + logger.error("Failed to close "+OUT_INDELS+" gracefully. Data may be corrupt."); + } + } + if ( OUT_STATS != null ) { + try { + statsOutput.close(); + } catch (Exception e) { + logger.error("Failed to close "+OUT_STATS+" gracefully. Data may be corrupt."); + } + } + if ( OUT_SNPS != null ) { + try { + snpsOutput.close(); + } catch (Exception e) { + logger.error("Failed to close "+OUT_SNPS+" gracefully. 
Data may be corrupt."); + } + } + + manager.close(); + if ( N_WAY_OUT != null ) writerToUse.close(); + + if ( CHECKEARLY ) { + logger.info("SW alignments runs: "+SWalignmentRuns); + logger.info("SW alignments successfull: "+SWalignmentSuccess + " ("+SWalignmentSuccess/SWalignmentRuns+"% of SW runs)"); + logger.info("SW alignments skipped (perfect match): "+exactMatchesFound); + logger.info("Total reads SW worked for: "+(SWalignmentSuccess + exactMatchesFound)+ + " ("+(SWalignmentSuccess+exactMatchesFound)/(SWalignmentRuns+exactMatchesFound)+"% of all reads requiring SW)"); + } + } + + private void populateKnownIndels(RefMetaDataTracker metaDataTracker) { + for ( final VariantContext vc : metaDataTracker.getValues(known) ) { + if ( indelRodsSeen.contains(vc) ) + continue; + indelRodsSeen.add(vc); + knownIndelsToTry.add(vc); + } + } + + private static int mismatchQualitySumIgnoreCigar(final AlignedRead aRead, final byte[] refSeq, int refIndex, int quitAboveThisValue) { + final byte[] readSeq = aRead.getReadBases(); + final byte[] quals = aRead.getBaseQualities(); + int sum = 0; + for (int readIndex = 0 ; readIndex < readSeq.length ; refIndex++, readIndex++ ) { + if ( refIndex >= refSeq.length ) { + sum += MAX_QUAL; + // optimization: once we pass the threshold, stop calculating + if ( sum > quitAboveThisValue ) + return sum; + } else { + byte refChr = refSeq[refIndex]; + byte readChr = readSeq[readIndex]; + if ( !BaseUtils.isRegularBase(readChr) || !BaseUtils.isRegularBase(refChr) ) + continue; // do not count Ns/Xs/etc ? 
+ if ( readChr != refChr ) { + sum += (int)quals[readIndex]; + // optimization: once we pass the threshold, stop calculating + if ( sum > quitAboveThisValue ) + return sum; + } + } + } + return sum; + } + + private void clean(ReadBin readsToClean) { + + final List reads = readsToClean.getReads(); + if ( reads.size() == 0 ) + return; + + byte[] reference = readsToClean.getReference(referenceReader); + int leftmostIndex = readsToClean.getLocation().getStart(); + + final ArrayList refReads = new ArrayList(); // reads that perfectly match ref + final ArrayList altReads = new ArrayList(); // reads that don't perfectly match + final LinkedList altAlignmentsToTest = new LinkedList(); // should we try to make an alt consensus from the read? + final Set altConsenses = new LinkedHashSet(); // list of alt consenses + + // if there are any known indels for this region, get them and create alternate consenses + generateAlternateConsensesFromKnownIndels(altConsenses, leftmostIndex, reference); + + // decide which reads potentially need to be cleaned; + // if there are reads with a single indel in them, add that indel to the list of alternate consenses + long totalRawMismatchSum = determineReadsThatNeedCleaning(reads, refReads, altReads, altAlignmentsToTest, altConsenses, leftmostIndex, reference); + + // use 'Smith-Waterman' to create alternate consenses from reads that mismatch the reference, using totalRawMismatchSum as the random seed + if ( consensusModel == ConsensusDeterminationModel.USE_SW ) + generateAlternateConsensesFromReads(altAlignmentsToTest, altConsenses, reference, leftmostIndex); + + // if ( debugOn ) System.out.println("------\nChecking consenses...\n--------\n"); + + Consensus bestConsensus = null; + + for (Consensus consensus : altConsenses) { + //logger.debug("Trying new consensus: " + consensus.cigar + " " + new String(consensus.str)); + +// if ( DEBUG ) { +// System.out.println("Checking consensus with alignment at "+consensus.positionOnReference+" cigar 
"+consensus.cigar); +// System.out.println(new String(consensus.str)); +// int z = 0; +// for ( ; z < consensus.positionOnReference; z++ ) System.out.print('.'); +// for ( z=0 ; z < consensus.cigar.getCigarElement(0).getLength() ; z++ ) System.out.print('.'); +// if ( consensus.cigar.getCigarElement(1).getOperator() == CigarOperator.I ) for ( z= 0; z < consensus.cigar.getCigarElement(1).getLength(); z++ ) System.out.print('I'); +// System.out.println(); +// } + + // if ( debugOn ) System.out.println("Consensus: "+consensus.str); + + for (int j = 0; j < altReads.size(); j++) { + AlignedRead toTest = altReads.get(j); + Pair altAlignment = findBestOffset(consensus.str, toTest, leftmostIndex); + + // the mismatch score is the min of its alignment vs. the reference and vs. the alternate + int myScore = altAlignment.second; + + if (myScore > toTest.getAlignerMismatchScore() || myScore >= toTest.getMismatchScoreToReference()) + myScore = toTest.getMismatchScoreToReference(); + // keep track of reads that align better to the alternate consensus. + // By pushing alignments with equal scores to the alternate, it means we'll over-call (het -> hom non ref) but are less likely to under-call (het -> ref, het non ref -> het) + else + consensus.readIndexes.add(new Pair(j, altAlignment.first)); + + //logger.debug(consensus.cigar + " vs. " + toTest.getRead().getReadName() + "-" + toTest.getRead().getReadString() + " => " + myScore + " vs. " + toTest.getMismatchScoreToReference()); + if (!toTest.getRead().getDuplicateReadFlag()) + consensus.mismatchSum += myScore; + + // optimization: once the mismatch sum is higher than the best consensus, quit since this one can't win + // THIS MUST BE DISABLED IF WE DECIDE TO ALLOW MORE THAN ONE ALTERNATE CONSENSUS! 
+ if (bestConsensus != null && consensus.mismatchSum > bestConsensus.mismatchSum) + break; + } + + //logger.debug("Mismatch sum of new consensus: " + consensus.mismatchSum); + if (bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) { + // we do not need this alt consensus, release memory right away!! + if (bestConsensus != null) + bestConsensus.readIndexes.clear(); + bestConsensus = consensus; + //logger.debug("New consensus " + bestConsensus.cigar + " is now best consensus"); + } else { + // we do not need this alt consensus, release memory right away!! + consensus.readIndexes.clear(); + } + } + + // if: + // 1) the best alternate consensus has a smaller sum of quality score mismatches than the aligned version of the reads, + // 2) beats the LOD threshold for the sum of quality score mismatches of the raw version of the reads, + // 3) didn't just move around the mismatching columns (i.e. it actually reduces entropy), + // then clean! + final double improvement = (bestConsensus == null ? 
-1 : ((double)(totalRawMismatchSum - bestConsensus.mismatchSum))/10.0); + if ( improvement >= LOD_THRESHOLD ) { + + bestConsensus.cigar = AlignmentUtils.leftAlignIndel(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference, true); + + // start cleaning the appropriate reads + for ( Pair indexPair : bestConsensus.readIndexes ) { + AlignedRead aRead = altReads.get(indexPair.first); + if ( !updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.second, aRead, leftmostIndex) ) + return; + } + if ( consensusModel != ConsensusDeterminationModel.KNOWNS_ONLY && !alternateReducesEntropy(altReads, reference, leftmostIndex) ) { + if ( statsOutput != null ) { + try { + statsOutput.write(currentInterval.toString()); + statsOutput.write("\tFAIL (bad indel)\t"); // if improvement > LOD_THRESHOLD *BUT* entropy is not reduced (SNPs still exist) + statsOutput.write(Double.toString(improvement)); + statsOutput.write("\n"); + statsOutput.flush(); + } catch (Exception e) { + throw new UserException.CouldNotCreateOutputFile("statsOutput", "Failed to write stats output file", e); + } + } + } else { + //logger.debug("CLEAN: " + bestConsensus.cigar + " " + bestConsensus.str.toString() + " " + bestConsensus.cigar.numCigarElements() ); + if ( indelOutput != null && bestConsensus.cigar.numCigarElements() > 1 ) { + // NOTE: indels are printed out in the format specified for the low-coverage pilot1 + // indel calls (tab-delimited): chr position size type sequence + StringBuilder str = new StringBuilder(); + str.append(reads.get(0).getReferenceName()); + int position = bestConsensus.positionOnReference + bestConsensus.cigar.getCigarElement(0).getLength(); + str.append("\t").append(leftmostIndex + position - 1); + CigarElement ce = bestConsensus.cigar.getCigarElement(1); + str.append("\t").append(ce.getLength()).append("\t").append(ce.getOperator()).append("\t"); + int length = ce.getLength(); + if ( 
ce.getOperator() == CigarOperator.D ) { + for ( int i = 0; i < length; i++) + str.append((char)reference[position+i]); + } else { + for ( int i = 0; i < length; i++) + str.append((char)bestConsensus.str[position+i]); + } + str.append("\t").append((((double) (totalRawMismatchSum - bestConsensus.mismatchSum)) / 10.0)).append("\n"); + try { + indelOutput.write(str.toString()); + indelOutput.flush(); + } catch (Exception e) { + throw new UserException.CouldNotCreateOutputFile("indelOutput", "Failed to write indel output file", e); + } + } + if ( statsOutput != null ) { + try { + statsOutput.write(currentInterval.toString()); + statsOutput.write("\tCLEAN"); // if improvement > LOD_THRESHOLD *AND* entropy is reduced + if ( bestConsensus.cigar.numCigarElements() > 1 ) + statsOutput.write(" (found indel)"); + statsOutput.write("\t"); + statsOutput.write(Double.toString(improvement)); + statsOutput.write("\n"); + statsOutput.flush(); + } catch (Exception e) { + throw new UserException.CouldNotCreateOutputFile("statsOutput", "Failed to write stats output file", e); + } + } + + // finish cleaning the appropriate reads + for ( Pair indexPair : bestConsensus.readIndexes ) { + final AlignedRead aRead = altReads.get(indexPair.first); + if ( aRead.finalizeUpdate() ) { + // We need to update the mapping quality score of the cleaned reads; + // however we don't have enough info to use the proper MAQ scoring system. + // For now, we will just arbitrarily add 10 to the mapping quality. [EB, 6/7/2010]. 
+ // TODO -- we need a better solution here + GATKSAMRecord read = aRead.getRead(); + if ( read.getMappingQuality() != 255 ) // 255 == Unknown, so don't modify it + read.setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + 10, 254)); + + // before we fix the attribute tags we first need to make sure we have enough of the reference sequence + int neededBasesToLeft = leftmostIndex - read.getAlignmentStart(); + int neededBasesToRight = read.getAlignmentEnd() - leftmostIndex - reference.length + 1; + int neededBases = Math.max(neededBasesToLeft, neededBasesToRight); + if ( neededBases > 0 ) { + int padLeft = Math.max(leftmostIndex-neededBases, 1); + int padRight = Math.min(leftmostIndex+reference.length+neededBases, referenceReader.getSequenceDictionary().getSequence(currentInterval.getContig()).getSequenceLength()); + reference = referenceReader.getSubsequenceAt(currentInterval.getContig(), padLeft, padRight).getBases(); + leftmostIndex = padLeft; + } + + // now, fix the attribute tags + // TODO -- get rid of this try block when Picard does the right thing for reads aligned off the end of the reference + try { + if ( read.getAttribute(SAMTag.NM.name()) != null ) + read.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(read, reference, leftmostIndex - 1)); + if ( read.getAttribute(SAMTag.UQ.name()) != null ) + read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, reference, leftmostIndex-1)); + } catch (Exception e) { + // ignore it + } + // TODO -- this is only temporary until Tim adds code to recalculate this value + if ( read.getAttribute(SAMTag.MD.name()) != null ) + read.setAttribute(SAMTag.MD.name(), null); + + // mark that it was actually cleaned + readsActuallyCleaned.add(read); + } + } + } + + // END IF ( improvement >= LOD_THRESHOLD ) + + } else if ( statsOutput != null ) { + try { + statsOutput.write(String.format("%s\tFAIL\t%.1f%n", + currentInterval.toString(), improvement)); + statsOutput.flush(); + } 
catch (Exception e) { + throw new UserException.CouldNotCreateOutputFile("statsOutput", "Failed to write stats output file", e); + } + } + } + + private void generateAlternateConsensesFromKnownIndels(final Set altConsensesToPopulate, final int leftmostIndex, final byte[] reference) { + for ( VariantContext knownIndel : knownIndelsToTry ) { + if ( knownIndel == null || !knownIndel.isIndel() || knownIndel.isComplexIndel() ) + continue; + final byte[] indelStr; + if ( knownIndel.isSimpleInsertion() ) { + final byte[] fullAllele = knownIndel.getAlternateAllele(0).getBases(); + indelStr = Arrays.copyOfRange(fullAllele, 1, fullAllele.length); // remove ref padding + } else { + indelStr = Utils.dupBytes((byte)'-', knownIndel.getReference().length() - 1); + } + int start = knownIndel.getStart() - leftmostIndex + 1; + Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel); + if ( c != null ) + altConsensesToPopulate.add(c); + } + } + + private long determineReadsThatNeedCleaning(final List reads, + final ArrayList refReadsToPopulate, + final ArrayList altReadsToPopulate, + final LinkedList altAlignmentsToTest, + final Set altConsenses, + final int leftmostIndex, + final byte[] reference) { + + long totalRawMismatchSum = 0L; + for ( final GATKSAMRecord read : reads ) { + + // we can not deal with screwy records + if ( read.getCigar().numCigarElements() == 0 ) { + refReadsToPopulate.add(read); + continue; + } + + final AlignedRead aRead = new AlignedRead(read); + + // first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence + int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read); + if ( numBlocks == 2 ) { + Cigar newCigar = AlignmentUtils.leftAlignIndel(unclipCigar(read.getCigar()), reference, read.getReadBases(), read.getAlignmentStart()-leftmostIndex, 0, true); + aRead.setCigar(newCigar, false); + } + + final int startOnRef = read.getAlignmentStart()-leftmostIndex; + final int rawMismatchScore = 
mismatchQualitySumIgnoreCigar(aRead, reference, startOnRef, Integer.MAX_VALUE); + + // if this doesn't match perfectly to the reference, let's try to clean it + if ( rawMismatchScore > 0 ) { + altReadsToPopulate.add(aRead); + //logger.debug("Adding " + read.getReadName() + " with raw mismatch score " + rawMismatchScore + " to non-ref reads"); + + if ( !read.getDuplicateReadFlag() ) + totalRawMismatchSum += rawMismatchScore; + aRead.setMismatchScoreToReference(rawMismatchScore); + aRead.setAlignerMismatchScore(AlignmentUtils.mismatchingQualities(aRead.getRead(), reference, startOnRef)); + + // if it has an indel, let's see if that's the best consensus + if ( consensusModel != ConsensusDeterminationModel.KNOWNS_ONLY && numBlocks == 2 ) { + Consensus c = createAlternateConsensus(startOnRef, aRead.getCigar(), reference, aRead.getReadBases()); + if ( c != null ) + altConsenses.add(c); + } else { + altAlignmentsToTest.add(aRead); + } + } + // otherwise, we can emit it as is + else { + //logger.debug("Adding " + read.getReadName() + " with raw mismatch score " + rawMismatchScore + " to ref reads"); + refReadsToPopulate.add(read); + } + } + + return totalRawMismatchSum; + } + + private void generateAlternateConsensesFromReads(final LinkedList altAlignmentsToTest, + final Set altConsensesToPopulate, + final byte[] reference, + final int leftmostIndex) { + + // if we are under the limit, use all reads to generate alternate consenses + if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) { + for ( AlignedRead aRead : altAlignmentsToTest ) { + if ( CHECKEARLY ) createAndAddAlternateConsensus1(aRead, altConsensesToPopulate, reference,leftmostIndex); + else createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference); + } + } + // otherwise, choose reads for alternate consenses randomly + else { + int readsSeen = 0; + while ( readsSeen++ < MAX_READS_FOR_CONSENSUSES && altConsensesToPopulate.size() <= MAX_CONSENSUSES) { + int index = 
GenomeAnalysisEngine.getRandomGenerator().nextInt(altAlignmentsToTest.size()); + AlignedRead aRead = altAlignmentsToTest.remove(index); + if ( CHECKEARLY ) createAndAddAlternateConsensus1(aRead, altConsensesToPopulate, reference,leftmostIndex); + else createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference); + } + } + } + + private void createAndAddAlternateConsensus(final byte[] read, final Set altConsensesToPopulate, final byte[] reference) { + + // do a pairwise alignment against the reference + SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read, swParameters); + Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read); + if ( c != null ) + altConsensesToPopulate.add(c); + } + + private void createAndAddAlternateConsensus1(AlignedRead read, final Set altConsensesToPopulate, + final byte[] reference, final int leftmostIndex) { + + for ( Consensus known : altConsensesToPopulate ) { + Pair altAlignment = findBestOffset(known.str, read, leftmostIndex); + // the mismatch score is the min of its alignment vs. the reference and vs. 
the alternate + int myScore = altAlignment.second; + if ( myScore == 0 ) {exactMatchesFound++; return; }// read matches perfectly to a known alt consensus - no need to run SW, we already know the answer + } + // do a pairwise alignment against the reference + SWalignmentRuns++; + SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read.getReadBases(), swParameters); + Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read.getReadBases()); + if ( c != null ) { + altConsensesToPopulate.add(c); + SWalignmentSuccess++; + } + } + + // create a Consensus from cigar/read strings which originate somewhere on the reference + private Consensus createAlternateConsensus(final int indexOnRef, final Cigar c, final byte[] reference, final byte[] readStr) { + if ( indexOnRef < 0 ) + return null; + + // if there are no indels, we do not need this consensus, can abort early: + if ( c.numCigarElements() == 1 && c.getCigarElement(0).getOperator() == CigarOperator.M ) return null; + + // create the new consensus + ArrayList elements = new ArrayList(c.numCigarElements()-1); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < indexOnRef; i++) + sb.append((char)reference[i]); + + int indelCount = 0; + int altIdx = 0; + int refIdx = indexOnRef; + boolean ok_flag = true; + for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { + CigarElement ce = c.getCigarElement(i); + int elementLength = ce.getLength(); + switch( ce.getOperator() ) { + case D: + refIdx += elementLength; + indelCount++; + elements.add(ce); + break; + case M: + case EQ: + case X: + altIdx += elementLength; + case N: + if ( reference.length < refIdx + elementLength ) + ok_flag = false; + else { + for (int j = 0; j < elementLength; j++) + sb.append((char)reference[refIdx+j]); + } + refIdx += elementLength; + elements.add(new CigarElement(elementLength, CigarOperator.M)); + break; + case I: + for (int j = 0; j < elementLength; j++) { + if 
( ! BaseUtils.isRegularBase(readStr[altIdx+j]) ) { + // Insertions with N's in them cause real problems sometimes; it's better to drop them altogether + ok_flag=false; + break; + } + sb.append((char)readStr[altIdx + j]); + } + altIdx += elementLength; + indelCount++; + elements.add(ce); + break; + case S: + default: + break; + } + } + // make sure that there is at most only a single indel and it aligns appropriately! + if ( !ok_flag || indelCount != 1 || reference.length < refIdx ) + return null; + + for (int i = refIdx; i < reference.length; i++) + sb.append((char)reference[i]); + byte[] altConsensus = StringUtil.stringToBytes(sb.toString()); // alternative consensus sequence we just built from the current read + + return new Consensus(altConsensus, new Cigar(elements), indexOnRef); + } + + // create a Consensus from just the indel string that falls on the reference + private Consensus createAlternateConsensus(final int indexOnRef, final byte[] reference, final byte[] indelStr, final VariantContext indel) { + if ( indexOnRef < 0 || indexOnRef >= reference.length ) + return null; + + // create the new consensus + StringBuilder sb = new StringBuilder(); + Cigar cigar = new Cigar(); + int refIdx; + + for (refIdx = 0; refIdx < indexOnRef; refIdx++) + sb.append((char)reference[refIdx]); + if ( indexOnRef > 0 ) + cigar.add(new CigarElement(indexOnRef, CigarOperator.M)); + + if ( indel.isSimpleDeletion() ) { + refIdx += indelStr.length; + cigar.add(new CigarElement(indelStr.length, CigarOperator.D)); + } + else if ( indel.isSimpleInsertion() ) { + for ( byte b : indelStr ) + sb.append((char)b); + cigar.add(new CigarElement(indelStr.length, CigarOperator.I)); + } else { + throw new IllegalStateException("Creating an alternate consensus from a complex indel is not allows"); + } + + if ( reference.length - refIdx > 0 ) + cigar.add(new CigarElement(reference.length - refIdx, CigarOperator.M)); + for (; refIdx < reference.length; refIdx++) + 
sb.append((char)reference[refIdx]); + byte[] altConsensus = StringUtil.stringToBytes(sb.toString()); // alternative consensus sequence we just built from the current read + + return new Consensus(altConsensus, cigar, 0); + } + + private Pair findBestOffset(final byte[] ref, final AlignedRead read, final int leftmostIndex) { + + // optimization: try the most likely alignment first (to get a low score to beat) + int originalAlignment = read.getOriginalAlignmentStart() - leftmostIndex; + int bestScore = mismatchQualitySumIgnoreCigar(read, ref, originalAlignment, Integer.MAX_VALUE); + int bestIndex = originalAlignment; + + // optimization: we can't get better than 0, so we can quit now + if ( bestScore == 0 ) + return new Pair(bestIndex, 0); + + // optimization: the correct alignment shouldn't be too far from the original one (or else the read wouldn't have aligned in the first place) + for ( int i = 0; i < originalAlignment; i++ ) { + int score = mismatchQualitySumIgnoreCigar(read, ref, i, bestScore); + if ( score < bestScore ) { + bestScore = score; + bestIndex = i; + } + // optimization: we can't get better than 0, so we can quit now + if ( bestScore == 0 ) + return new Pair(bestIndex, 0); + } + + final int maxPossibleStart = ref.length - read.getReadLength(); + for ( int i = originalAlignment + 1; i <= maxPossibleStart; i++ ) { + int score = mismatchQualitySumIgnoreCigar(read, ref, i, bestScore); + if ( score < bestScore ) { + bestScore = score; + bestIndex = i; + } + // optimization: we can't get better than 0, so we can quit now + if ( bestScore == 0 ) + return new Pair(bestIndex, 0); + } + + return new Pair(bestIndex, bestScore); + } + + + private boolean updateRead(final Cigar altCigar, final int altPosOnRef, final int myPosOnAlt, final AlignedRead aRead, final int leftmostIndex) { + Cigar readCigar = new Cigar(); + + // special case: there is no indel + if ( altCigar.getCigarElements().size() == 1 ) { + aRead.setAlignmentStart(leftmostIndex + myPosOnAlt); + 
readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M)); + aRead.setCigar(readCigar); + return true; + } + + CigarElement altCE1 = altCigar.getCigarElement(0); + CigarElement altCE2 = altCigar.getCigarElement(1); + + int leadingMatchingBlockLength = 0; // length of the leading M element or 0 if the leading element is I + + CigarElement indelCE; + if ( altCE1.getOperator() == CigarOperator.I ) { + indelCE=altCE1; + if ( altCE2.getOperator() != CigarOperator.M ) { + logger.warn("When the first element of the alt consensus is I, the second one must be M. Actual: " + altCigar.toString() + ". Skipping this site..."); + return false; + } + } + else { + if ( altCE1.getOperator() != CigarOperator.M ) { + logger.warn("First element of the alt consensus cigar must be M or I. Actual: " + altCigar.toString() + ". Skipping this site..."); + return false; + } + if ( altCE2.getOperator() == CigarOperator.I || altCE2.getOperator() == CigarOperator.D ) { + indelCE=altCE2; + } else { + logger.warn("When first element of the alt consensus is M, the second one must be I or D. Actual: " + altCigar.toString() + ". 
Skipping this site..."); + return false; + } + leadingMatchingBlockLength = altCE1.getLength(); + } + + // the easiest thing to do is to take each case separately + int endOfFirstBlock = altPosOnRef + leadingMatchingBlockLength; + boolean sawAlignmentStart = false; + + // for reads starting before the indel + if ( myPosOnAlt < endOfFirstBlock) { + aRead.setAlignmentStart(leftmostIndex + myPosOnAlt); + sawAlignmentStart = true; + + // for reads ending before the indel + if ( myPosOnAlt + aRead.getReadLength() <= endOfFirstBlock) { + //readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M)); + //aRead.setCigar(readCigar); + aRead.setCigar(null); // reset to original alignment + return true; + } + readCigar.add(new CigarElement(endOfFirstBlock - myPosOnAlt, CigarOperator.M)); + } + + // forward along the indel + //int indelOffsetOnRef = 0, indelOffsetOnRead = 0; + if ( indelCE.getOperator() == CigarOperator.I ) { + // for reads that end in an insertion + if ( myPosOnAlt + aRead.getReadLength() < endOfFirstBlock + indelCE.getLength() ) { + int partialInsertionLength = myPosOnAlt + aRead.getReadLength() - endOfFirstBlock; + // if we also started inside the insertion, then we need to modify the length + if ( !sawAlignmentStart ) + partialInsertionLength = aRead.getReadLength(); + readCigar.add(new CigarElement(partialInsertionLength, CigarOperator.I)); + aRead.setCigar(readCigar); + return true; + } + + // for reads that start in an insertion + if ( !sawAlignmentStart && myPosOnAlt < endOfFirstBlock + indelCE.getLength() ) { + aRead.setAlignmentStart(leftmostIndex + endOfFirstBlock); + readCigar.add(new CigarElement(indelCE.getLength() - (myPosOnAlt - endOfFirstBlock), CigarOperator.I)); + //indelOffsetOnRead = myPosOnAlt - endOfFirstBlock; + sawAlignmentStart = true; + } else if ( sawAlignmentStart ) { + readCigar.add(indelCE); + //indelOffsetOnRead = indelCE.getLength(); + } + } else if ( indelCE.getOperator() == CigarOperator.D ) { + if ( 
sawAlignmentStart ) + readCigar.add(indelCE); + //indelOffsetOnRef = indelCE.getLength(); + } + + // for reads that start after the indel + if ( !sawAlignmentStart ) { + //aRead.setAlignmentStart(leftmostIndex + myPosOnAlt + indelOffsetOnRef - indelOffsetOnRead); + //readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M)); + //aRead.setCigar(readCigar); + aRead.setCigar(null); // reset to original alignment + return true; + } + + int readRemaining = aRead.getReadBases().length; + for ( CigarElement ce : readCigar.getCigarElements() ) { + if ( ce.getOperator() != CigarOperator.D ) + readRemaining -= ce.getLength(); + } + if ( readRemaining > 0 ) + readCigar.add(new CigarElement(readRemaining, CigarOperator.M)); + aRead.setCigar(readCigar); + + return true; + } + + private boolean alternateReducesEntropy(final List reads, final byte[] reference, final int leftmostIndex) { + final int[] originalMismatchBases = new int[reference.length]; + final int[] cleanedMismatchBases = new int[reference.length]; + final int[] totalOriginalBases = new int[reference.length]; + final int[] totalCleanedBases = new int[reference.length]; + + // set to 1 to prevent dividing by zero + for ( int i=0; i < reference.length; i++ ) + originalMismatchBases[i] = totalOriginalBases[i] = cleanedMismatchBases[i] = totalCleanedBases[i] = 0; + + for (final AlignedRead read : reads) { + if (read.getRead().getAlignmentBlocks().size() > 1) + continue; + + int refIdx = read.getOriginalAlignmentStart() - leftmostIndex; + final byte[] readStr = read.getReadBases(); + final byte[] quals = read.getBaseQualities(); + + for (int j = 0; j < readStr.length; j++, refIdx++) { + if (refIdx < 0 || refIdx >= reference.length) { + //System.out.println( "Read: "+read.getRead().getReadName() + "; length = " + readStr.length() ); + //System.out.println( "Ref left: "+ leftmostIndex +"; ref length=" + reference.length() + "; read alignment start: "+read.getOriginalAlignmentStart() ); + break; + } + 
totalOriginalBases[refIdx] += quals[j]; + if (readStr[j] != reference[refIdx]) + originalMismatchBases[refIdx] += quals[j]; + } + + // reset and now do the calculation based on the cleaning + refIdx = read.getAlignmentStart() - leftmostIndex; + int altIdx = 0; + Cigar c = read.getCigar(); + for (int j = 0; j < c.numCigarElements(); j++) { + CigarElement ce = c.getCigarElement(j); + int elementLength = ce.getLength(); + switch (ce.getOperator()) { + case M: + case EQ: + case X: + for (int k = 0; k < elementLength; k++, refIdx++, altIdx++) { + if (refIdx >= reference.length) + break; + totalCleanedBases[refIdx] += quals[altIdx]; + if (readStr[altIdx] != reference[refIdx]) + cleanedMismatchBases[refIdx] += quals[altIdx]; + } + break; + case I: + altIdx += elementLength; + break; + case D: + refIdx += elementLength; + break; + case S: + default: + break; + } + } + } + + int originalMismatchColumns = 0, cleanedMismatchColumns = 0; + StringBuilder sb = new StringBuilder(); + for ( int i=0; i < reference.length; i++ ) { + if ( cleanedMismatchBases[i] == originalMismatchBases[i] ) + continue; + boolean didMismatch = false, stillMismatches = false; + if ( originalMismatchBases[i] > totalOriginalBases[i] * MISMATCH_THRESHOLD ) { + didMismatch = true; + originalMismatchColumns++; + if ( totalCleanedBases[i] > 0 && ((double)cleanedMismatchBases[i] / (double)totalCleanedBases[i]) > ((double)originalMismatchBases[i] / (double)totalOriginalBases[i]) * (1.0 - MISMATCH_COLUMN_CLEANED_FRACTION) ) { + stillMismatches = true; + cleanedMismatchColumns++; + } + } else if ( cleanedMismatchBases[i] > totalCleanedBases[i] * MISMATCH_THRESHOLD ) { + cleanedMismatchColumns++; + } + if ( snpsOutput != null ) { + if ( didMismatch ) { + sb.append(reads.get(0).getRead().getReferenceName()).append(":").append(leftmostIndex + i); + if ( stillMismatches ) + sb.append(" SAME_SNP\n"); + else + sb.append(" NOT_SNP\n"); + } + } + } + + //logger.debug("Original mismatch columns = " + 
originalMismatchColumns + "; cleaned mismatch columns = " + cleanedMismatchColumns); + + final boolean reduces = (originalMismatchColumns == 0 || cleanedMismatchColumns < originalMismatchColumns); + if ( reduces && snpsOutput != null ) { + try { + snpsOutput.write(sb.toString()); + snpsOutput.flush(); + } catch (Exception e) { + throw new UserException.CouldNotCreateOutputFile("snpsOutput", "Failed to write SNPs output file", e); + } + } + return reduces; + } + + protected static Cigar unclipCigar(Cigar cigar) { + ArrayList elements = new ArrayList(cigar.numCigarElements()); + for ( CigarElement ce : cigar.getCigarElements() ) { + if ( !isClipOperator(ce.getOperator()) ) + elements.add(ce); + } + return new Cigar(elements); + } + + private static boolean isClipOperator(CigarOperator op) { + return op == CigarOperator.S || op == CigarOperator.H || op == CigarOperator.P; + } + + protected static Cigar reclipCigar(Cigar cigar, SAMRecord read) { + ArrayList elements = new ArrayList(); + + int i = 0; + int n = read.getCigar().numCigarElements(); + while ( i < n && isClipOperator(read.getCigar().getCigarElement(i).getOperator()) ) + elements.add(read.getCigar().getCigarElement(i++)); + + elements.addAll(cigar.getCigarElements()); + + i++; + while ( i < n && !isClipOperator(read.getCigar().getCigarElement(i).getOperator()) ) + i++; + + while ( i < n && isClipOperator(read.getCigar().getCigarElement(i).getOperator()) ) + elements.add(read.getCigar().getCigarElement(i++)); + + return new Cigar(elements); + } + + private class AlignedRead { + private final GATKSAMRecord read; + private byte[] readBases = null; + private byte[] baseQuals = null; + private Cigar newCigar = null; + private int newStart = -1; + private int mismatchScoreToReference = 0; + private long alignerMismatchScore = 0; + + public AlignedRead(GATKSAMRecord read) { + this.read = read; + mismatchScoreToReference = 0; + } + + public GATKSAMRecord getRead() { + return read; + } + + public int getReadLength() { 
+ return readBases != null ? readBases.length : read.getReadLength(); + } + + public byte[] getReadBases() { + if ( readBases == null ) + getUnclippedBases(); + return readBases; + } + + public byte[] getBaseQualities() { + if ( baseQuals == null ) + getUnclippedBases(); + return baseQuals; + } + + // pull out the bases that aren't clipped out + private void getUnclippedBases() { + readBases = new byte[getReadLength()]; + baseQuals = new byte[getReadLength()]; + byte[] actualReadBases = read.getReadBases(); + byte[] actualBaseQuals = read.getBaseQualities(); + int fromIndex = 0, toIndex = 0; + + for ( CigarElement ce : read.getCigar().getCigarElements() ) { + int elementLength = ce.getLength(); + switch ( ce.getOperator() ) { + case S: + fromIndex += elementLength; + break; + case M: + case EQ: + case X: + case I: + System.arraycopy(actualReadBases, fromIndex, readBases, toIndex, elementLength); + System.arraycopy(actualBaseQuals, fromIndex, baseQuals, toIndex, elementLength); + fromIndex += elementLength; + toIndex += elementLength; + default: + break; + } + } + + // if we got clipped, trim the array + if ( fromIndex != toIndex ) { + byte[] trimmedRB = new byte[toIndex]; + byte[] trimmedBQ = new byte[toIndex]; + System.arraycopy(readBases, 0, trimmedRB, 0, toIndex); + System.arraycopy(baseQuals, 0, trimmedBQ, 0, toIndex); + readBases = trimmedRB; + baseQuals = trimmedBQ; + } + } + + public Cigar getCigar() { + return (newCigar != null ? newCigar : read.getCigar()); + } + + public void setCigar(Cigar cigar) { + setCigar(cigar, true); + } + + // tentatively sets the new Cigar, but it needs to be confirmed later + public void setCigar(Cigar cigar, boolean fixClippedCigar) { + if ( cigar == null ) { + newCigar = null; + return; + } + + if ( fixClippedCigar && getReadBases().length < read.getReadLength() ) + cigar = reclipCigar(cigar); + + // no change? + if ( read.getCigar().equals(cigar) ) { + newCigar = null; + return; + } + + // no indel? 
+ String str = cigar.toString(); + if ( !str.contains("D") && !str.contains("I") ) { + logger.debug("Modifying a read with no associated indel; although this is possible, it is highly unlikely. Perhaps this region should be double-checked: " + read.getReadName() + " near " + read.getReferenceName() + ":" + read.getAlignmentStart()); + // newCigar = null; + // return; + } + + newCigar = cigar; + } + + // pull out the bases that aren't clipped out + private Cigar reclipCigar(Cigar cigar) { + return IndelRealigner.reclipCigar(cigar, read); + } + + // tentatively sets the new start, but it needs to be confirmed later + public void setAlignmentStart(int start) { + newStart = start; + } + + public int getAlignmentStart() { + return (newStart != -1 ? newStart : read.getAlignmentStart()); + } + + public int getOriginalAlignmentStart() { + return read.getAlignmentStart(); + } + + // finalizes the changes made. + // returns true if this record actually changes, false otherwise + public boolean finalizeUpdate() { + // if we haven't made any changes, don't do anything + if ( newCigar == null ) + return false; + if ( newStart == -1 ) + newStart = read.getAlignmentStart(); + else if ( Math.abs(newStart - read.getAlignmentStart()) > MAX_POS_MOVE_ALLOWED ) { + logger.debug(String.format("Attempting to realign read %s at %d more than %d bases to %d.", read.getReadName(), read.getAlignmentStart(), MAX_POS_MOVE_ALLOWED, newStart)); + return false; + } + + // store the old CIGAR and start in case we need to back out + final Cigar oldCigar = read.getCigar(); + final int oldStart = read.getAlignmentStart(); + + // try updating the read with the new CIGAR and start + read.setCigar(newCigar); + read.setAlignmentStart(newStart); + + // back out if necessary + if ( realignmentProducesBadAlignment(read) ) { + read.setCigar(oldCigar); + read.setAlignmentStart(oldStart); + return false; + } + + // annotate the record with the original cigar and start (if it changed) + if ( 
!NO_ORIGINAL_ALIGNMENT_TAGS ) { + read.setAttribute(ORIGINAL_CIGAR_TAG, oldCigar.toString()); + if ( newStart != oldStart ) + read.setAttribute(ORIGINAL_POSITION_TAG, oldStart); + } + + return true; + } + + public void setMismatchScoreToReference(int score) { + mismatchScoreToReference = score; + } + + public int getMismatchScoreToReference() { + return mismatchScoreToReference; + } + + public void setAlignerMismatchScore(long score) { + alignerMismatchScore = score; + } + + public long getAlignerMismatchScore() { + return alignerMismatchScore; + } + } + + /** + * Determines whether the read aligns off the end of the contig + * + * @param read the read to check + * @return true if it aligns off the end + */ + private boolean realignmentProducesBadAlignment(final GATKSAMRecord read) { + final int contigLength = referenceReader.getSequenceDictionary().getSequence(currentInterval.getContig()).getSequenceLength(); + return realignmentProducesBadAlignment(read, contigLength); + } + + /** + * Determines whether the read aligns off the end of the contig. + * Pulled out to make it testable. 
+ * + * @param read the read to check + * @return true if it aligns off the end + */ + protected static boolean realignmentProducesBadAlignment(final GATKSAMRecord read, final int contigLength) { + return read.getAlignmentEnd() > contigLength; + } + + private static class Consensus { + public final byte[] str; + public final ArrayList> readIndexes; + public final int positionOnReference; + public int mismatchSum; + public Cigar cigar; + + public Consensus(byte[] str, Cigar cigar, int positionOnReference) { + this.str = str; + this.cigar = cigar; + this.positionOnReference = positionOnReference; + mismatchSum = 0; + readIndexes = new ArrayList>(); + } + + @Override + public boolean equals(Object o) { + return ( this == o || (o instanceof Consensus && Arrays.equals(this.str,(((Consensus)o).str)) ) ); + } + + public boolean equals(Consensus c) { + return ( this == c || Arrays.equals(this.str,c.str) ) ; + } + + @Override + public int hashCode() { + return Arrays.hashCode(this.str); + } + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java new file mode 100644 index 000000000..aa8b46312 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -0,0 +1,530 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE 
AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.indels; + +import com.google.java.contract.Ensures; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM; +import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM; +import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.Map; + + +public class PairHMMIndelErrorModel { + public static final int BASE_QUAL_THRESHOLD = 20; + + private boolean DEBUG = false; + + private static final int 
MAX_CACHED_QUAL = 127; + + private static final double baseMatchArray[]; + private static final double baseMismatchArray[]; + + private static final int START_HRUN_GAP_IDX = 4; + private static final int MAX_HRUN_GAP_IDX = 20; + + private static final byte MIN_GAP_OPEN_PENALTY = 30; + private static final byte MIN_GAP_CONT_PENALTY = 10; + private static final byte GAP_PENALTY_HRUN_STEP = 1; // each increase in hrun decreases gap penalty by this. + + private final byte[] GAP_OPEN_PROB_TABLE; + private final byte[] GAP_CONT_PROB_TABLE; + + private final PairHMM pairHMM; + + ///////////////////////////// + // Private Member Variables + ///////////////////////////// + + static { + baseMatchArray = new double[MAX_CACHED_QUAL+1]; + baseMismatchArray = new double[MAX_CACHED_QUAL+1]; + for (int k=1; k <= MAX_CACHED_QUAL; k++) { + double baseProb = Math.pow(10, -k/10.); + + + baseMatchArray[k] = Math.log10(1-baseProb); + baseMismatchArray[k] = Math.log10(baseProb); + } + } + + public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, final PairHMM.HMM_IMPLEMENTATION hmmType ) { + this.DEBUG = deb; + + switch (hmmType) { + case EXACT: + pairHMM = new Log10PairHMM(true); + break; + case ORIGINAL: + pairHMM = new Log10PairHMM(false); + break; + case LOGLESS_CACHING: + pairHMM = new LoglessPairHMM(); + break; + case ARRAY_LOGLESS: + pairHMM = new ArrayLoglessPairHMM(); + break; + default: + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. 
Acceptable options are ORIGINAL, EXACT, LOGLESS_CACHING, or ARRAY_LOGLESS."); + } + + // fill gap penalty table, affine naive model: + this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; + this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; + + for (int i = 0; i < START_HRUN_GAP_IDX; i++) { + GAP_OPEN_PROB_TABLE[i] = indelGOP; + GAP_CONT_PROB_TABLE[i] = indelGCP; + } + + double step = GAP_PENALTY_HRUN_STEP/10.0; + + // initialize gop and gcp to their default values + byte gop = indelGOP; + byte gcp = indelGCP; + + // all of the following is computed in QUal-space + for (int i=START_HRUN_GAP_IDX; i < MAX_HRUN_GAP_IDX; i++) { + gop -= GAP_PENALTY_HRUN_STEP; + if (gop < MIN_GAP_OPEN_PENALTY) + gop = MIN_GAP_OPEN_PENALTY; + + gcp -= step; + if(gcp < MIN_GAP_CONT_PENALTY) + gcp = MIN_GAP_CONT_PENALTY; + GAP_OPEN_PROB_TABLE[i] = gop; + GAP_CONT_PROB_TABLE[i] = gcp; + } + + } + + static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) { + // compute forward hrun length, example: + // AGGTGACCCCCCTGAGAG + // 001000012345000000 + hrunArray[0] = 0; + int[] hforward = new int[hrunArray.length]; + int[] hreverse = new int[hrunArray.length]; + + for (int i = 1; i < refBytes.length; i++) { + if (refBytes[i] == refBytes[i-1]) + hforward[i] = hforward[i-1]+1; + else + hforward[i] = 0; + } + + // do similar thing for reverse length, example: + // AGGTGACCCCCCTGAGAG + // 021000543210000000 + // and then accumulate with forward values. 
+ // Total: + // AGGTGACCCCCCTGAGAG + // 022000555555000000 + for (int i=refBytes.length-1; i > 0; i--) { + if (refBytes[i-1] == refBytes[i]) + hreverse[i-1] += hreverse[i]+1; + } + + for (int i = 1; i < refBytes.length; i++) + hrunArray[i] = hforward[i]+hreverse[i]; + } + + + private void fillGapProbabilities(final int[] hrunProfile, + final byte[] contextLogGapOpenProbabilities, + final byte[] contextLogGapContinuationProbabilities) { + // fill based on lookup table + for (int i = 0; i < hrunProfile.length; i++) { + if (hrunProfile[i] >= MAX_HRUN_GAP_IDX) { + contextLogGapOpenProbabilities[i] = GAP_OPEN_PROB_TABLE[MAX_HRUN_GAP_IDX-1]; + contextLogGapContinuationProbabilities[i] = GAP_CONT_PROB_TABLE[MAX_HRUN_GAP_IDX-1]; + } + else { + contextLogGapOpenProbabilities[i] = GAP_OPEN_PROB_TABLE[hrunProfile[i]]; + contextLogGapContinuationProbabilities[i] = GAP_CONT_PROB_TABLE[hrunProfile[i]]; + } + } + } + + /** + * Trims the haplotypes in the given map to the provided start/stop. + * + * @param haplotypeMap the input map + * @param startLocationInRefForHaplotypes the start location of the trim + * @param stopLocationInRefForHaplotypes the stop location of the trim + * @param ref the reference context (used for debugging only, so can be null) + * @return a non-null mapping corresponding to the trimmed version of the original; + * some elements may be lost if trimming cannot be performed on them (e.g. 
they fall outside of the region to keep) + */ + protected static Map trimHaplotypes(final Map haplotypeMap, + long startLocationInRefForHaplotypes, + long stopLocationInRefForHaplotypes, + final ReferenceContext ref) { + if ( haplotypeMap == null ) throw new IllegalArgumentException("The input allele to haplotype map cannot be null"); + + final LinkedHashMap trimmedHaplotypeMap = new LinkedHashMap<>(); + for (final Allele a: haplotypeMap.keySet()) { + + final Haplotype haplotype = haplotypeMap.get(a); + + if (stopLocationInRefForHaplotypes > haplotype.getStopPosition()) + stopLocationInRefForHaplotypes = haplotype.getStopPosition(); + + if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) + startLocationInRefForHaplotypes = haplotype.getStartPosition(); + else if (startLocationInRefForHaplotypes > haplotype.getStopPosition()) + startLocationInRefForHaplotypes = haplotype.getStopPosition(); + + final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); + final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); + if ( indStart >= indStop ) + continue; + + // commented out here because we need to make this method static for unit testing + //if (DEBUG) + // System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d\n", + // indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); + + // get the trimmed haplotype-bases array and create a new haplotype based on it. 
Pack this into the new map + final byte[] trimmedHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); + final Haplotype trimmedHaplotype = new Haplotype(trimmedHaplotypeBases, haplotype.isReference()); + trimmedHaplotypeMap.put(a, trimmedHaplotype); + } + return trimmedHaplotypeMap; + } + + + public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup, + final LinkedHashMap haplotypeMap, + final ReferenceContext ref, + final int eventLength, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, + final double downsamplingFraction) { + final int numHaplotypes = haplotypeMap.size(); + + final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap); + perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); + return getDiploidHaplotypeLikelihoods(numHaplotypes, readLikelihoods); + + } + + /** + * Should we clip a downstream portion of a read because it spans off the end of a haplotype? + * + * @param read the read in question + * @param refWindowStop the end of the reference window + * @return true if the read needs to be clipped, false otherwise + */ + protected static boolean mustClipDownstream(final GATKSAMRecord read, final int refWindowStop) { + return ( !read.isEmpty() && read.getSoftStart() < refWindowStop && read.getSoftStart() + read.getReadLength() - 1 > refWindowStop ); + } + + /** + * Should we clip a upstream portion of a read because it spans off the end of a haplotype? 
+ * + * @param read the read in question + * @param refWindowStart the start of the reference window + * @return true if the read needs to be clipped, false otherwise + */ + protected static boolean mustClipUpstream(final GATKSAMRecord read, final int refWindowStart) { + return ( !read.isEmpty() && read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart ); + } + + @Ensures("result != null && result.length == pileup.getNumberOfElements()") + public synchronized double[][] computeGeneralReadHaplotypeLikelihoods(final ReadBackedPileup pileup, + final LinkedHashMap haplotypeMap, + final ReferenceContext ref, + final int eventLength, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) { + final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; + + final LinkedList readList = new LinkedList<>(); + final Map readGCPArrayMap = new LinkedHashMap<>(); + int readIdx=0; + for (PileupElement p: pileup) { + + // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location) + if (perReadAlleleLikelihoodMap.containsPileupElement(p)) { + Map el = perReadAlleleLikelihoodMap.getLikelihoodsAssociatedWithPileupElement(p); + int j=0; + for (Allele a: haplotypeMap.keySet()) { + readLikelihoods[readIdx][j++] = el.get(a); + } + } + else { + // extra padding on candidate haplotypes to make sure reads are always strictly contained + // in them - a value of 1 will in theory do but we use a slightly higher one just for safety sake, mostly + // in case bases at edge of reads have lower quality. 
+ final int trailingBases = 3; + final int refWindowStart = ref.getWindow().getStart() + trailingBases; + final int refWindowStop = ref.getWindow().getStop() - trailingBases; + + if (DEBUG) { + System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString()); + } + + GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); + + // if the read extends beyond the downstream (right) end of the reference window, clip it + if ( mustClipDownstream(read, refWindowStop) ) + read = ReadClipper.hardClipByReadCoordinates(read, refWindowStop - read.getSoftStart() + 1, read.getReadLength() - 1); + + // if the read extends beyond the upstream (left) end of the reference window, clip it + if ( mustClipUpstream(read, refWindowStart) ) + read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, refWindowStart); + + if (read.isEmpty()) + continue; + + // hard-clip low quality ends - this may introduce extra H elements in CIGAR string + read = ReadClipper.hardClipLowQualEnds(read, (byte) BASE_QUAL_THRESHOLD ); + + if (read.isEmpty()) + continue; + + // get bases of candidate haplotypes that overlap with reads + final long readStart = read.getSoftStart(); + final long readEnd = read.getSoftEnd(); + + // see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match, + // but they're actually consistent with the insertion! + // Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning. + // Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read. 
+ final long eventStartPos = ref.getLocus().getStart(); + + // compute total number of clipped bases (soft or hard clipped) and only use them if necessary + final boolean softClips = useSoftClippedBases(read, eventStartPos, eventLength); + final int numStartSoftClippedBases = softClips ? read.getAlignmentStart()- read.getSoftStart() : 0; + final int numEndSoftClippedBases = softClips ? read.getSoftEnd()- read.getAlignmentEnd() : 0 ; + final byte [] unclippedReadBases = read.getReadBases(); + final byte [] unclippedReadQuals = read.getBaseQualities(); + + /** + * Compute genomic locations that candidate haplotypes will span. + * Read start and stop locations (variables readStart and readEnd) are the original unclipped positions from SAMRecord, + * adjusted by hard clips from Cigar string and by qual-based soft-clipping performed above. + * We will propose haplotypes that overlap the read with some padding. + * True read start = readStart + numStartSoftClippedBases - ReadUtils.getFirstInsertionOffset(read) + * Last term is because if a read starts with an insertion then these bases are not accounted for in readStart. 
+ * trailingBases is a padding constant(=3) and we additionally add abs(eventLength) to both sides of read to be able to + * differentiate context between two haplotypes + */ + final int absEventLength = Math.abs(eventLength); + long startLocationInRefForHaplotypes = Math.max(readStart + numStartSoftClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read) - absEventLength, 0); + long stopLocationInRefForHaplotypes = readEnd - numEndSoftClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read) + absEventLength; + + if (DEBUG) + System.out.format("orig Start:%d orig stop: %d\n", startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); + + int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases; + + if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) { + startLocationInRefForHaplotypes = ref.getWindow().getStart(); // read starts before haplotype: read will have to be cut numStartSoftClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes; + } + else if (startLocationInRefForHaplotypes > ref.getWindow().getStop()) { + startLocationInRefForHaplotypes = ref.getWindow().getStop(); // read starts after haplotype: read will have to be clipped completely; + } + + // candidate haplotype cannot go beyond reference context + if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) { + stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context + } + + if (stopLocationInRefForHaplotypes <= startLocationInRefForHaplotypes + readLength) { + stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1; // if there's an insertion in the read, the read stop position will be less than start + read legnth, but we want to compute likelihoods in the whole region that a read might overlap + } + + // ok, we now figured out the total number of clipped bases on both ends. 
+ // Figure out where we want to place the haplotype to score read against + + if (DEBUG) + System.out.format("numStartSoftClippedBases: %d numEndSoftClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", + numStartSoftClippedBases, numEndSoftClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength()); + + // LinkedHashMap readEl = new LinkedHashMap(); + + /** + * Check if we'll end up with an empty read once all clipping is done + */ + if (numStartSoftClippedBases + numEndSoftClippedBases >= unclippedReadBases.length) { + int j=0; + for (Allele a: haplotypeMap.keySet()) { + perReadAlleleLikelihoodMap.add(p,a,0.0); + readLikelihoods[readIdx][j++] = 0.0; + } + } + else { + final int endOfCopy = unclippedReadBases.length - numEndSoftClippedBases; + final byte[] readBases = Arrays.copyOfRange(unclippedReadBases, numStartSoftClippedBases, endOfCopy); + final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals, numStartSoftClippedBases, endOfCopy); + + int j=0; + + final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; + final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length]; + + // get homopolymer length profile for current haplotype + final int[] hrunProfile = new int[readBases.length]; + getContextHomopolymerLength(readBases,hrunProfile); + fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); + + // get the base insertion and deletion qualities to use + final byte[] baseInsertionQualities, baseDeletionQualities; + if ( read.hasBaseIndelQualities() ) { + baseInsertionQualities = Arrays.copyOfRange(read.getBaseInsertionQualities(), numStartSoftClippedBases, endOfCopy); + baseDeletionQualities = Arrays.copyOfRange(read.getBaseDeletionQualities(), numStartSoftClippedBases, endOfCopy); + } else { + baseInsertionQualities = 
contextLogGapOpenProbabilities; + baseDeletionQualities = contextLogGapOpenProbabilities; + } + + // Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM + final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities); + readList.add(processedRead); + + // Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM + readGCPArrayMap.put(processedRead,contextLogGapContinuationProbabilities); + + // Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the appropriate genomic locations + final Map trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref); + + // Get the likelihoods for our clipped read against each of our trimmed haplotypes. + final PerReadAlleleLikelihoodMap singleReadRawLikelihoods = pairHMM.computeLikelihoods(readList, trimmedHaplotypeMap, readGCPArrayMap); + + // Pack the original pilup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array + for (Allele a: trimmedHaplotypeMap.keySet()){ + double readLikelihood = singleReadRawLikelihoods.getLikelihoodAssociatedWithReadAndAllele(processedRead, a); + perReadAlleleLikelihoodMap.add(p, a, readLikelihood); + readLikelihoods[readIdx][j++] = readLikelihood; + } + // The readList for sending to the HMM should only ever contain 1 read, as each must be clipped individually + readList.remove(processedRead); + + // The same is true for the read/GCP-array map + readGCPArrayMap.remove(processedRead); + } + } + readIdx++; + } + + if (DEBUG) { + System.out.println("\nLikelihood summary"); + for (readIdx=0; readIdx < pileup.getNumberOfElements(); readIdx++) { + System.out.format("Read Index: %d ",readIdx); + for (int i=0; i < readLikelihoods[readIdx].length; i++) + System.out.format("L%d: 
%f ",i,readLikelihoods[readIdx][i]); + System.out.println(); + } + + } + + return readLikelihoods; + } + + private boolean useSoftClippedBases(GATKSAMRecord read, long eventStartPos, int eventLength) { + return !((read.getAlignmentStart() >= eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)); + } + +// private int computeFirstDifferingPosition(byte[] b1, byte[] b2) { +// if (b1.length != b2.length) +// return 0; // sanity check +// +// for (int i=0; i < b1.length; i++ ){ +// if ( b1[i]!= b2[i] ) +// return i; +// } +// return b1.length; +// } + + private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final double readLikelihoods[][]) { + final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; + + // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix + for (int i=0; i < numHaplotypes; i++) { + for (int j=i; j < numHaplotypes; j++){ + // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] + // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2) + //readLikelihoods[k][j] has log10(Pr(R_k) | H[j] ) + for (int readIdx = 0; readIdx < readLikelihoods.length; readIdx++) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + // First term is approximated by Jacobian log with table lookup. 
+ if (Double.isInfinite(readLikelihoods[readIdx][i]) && Double.isInfinite(readLikelihoods[readIdx][j])) + continue; + final double li = readLikelihoods[readIdx][i]; + final double lj = readLikelihoods[readIdx][j]; + haplotypeLikehoodMatrix[i][j] += MathUtils.approximateLog10SumLog10(li, lj) + MathUtils.LOG_ONE_HALF; + } + } + } + + final double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2]; + int k=0; + for (int j=0; j < numHaplotypes; j++) { + for (int i=0; i <= j; i++){ + genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j]; + } + } + + // renormalize so that max element is zero. + return MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/ReadBin.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/ReadBin.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/ReadBin.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/ReadBin.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AllelePair.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/AllelePair.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AllelePair.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/AllelePair.java diff 
--git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/DisjointSet.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/DisjointSet.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/DisjointSet.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/DisjointSet.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java new file mode 100644 index 000000000..b39aa1b42 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java @@ -0,0 +1,112 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.phasing; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +class Haplotype extends BaseArray implements Cloneable { + public Haplotype(byte[] bases) { + super(bases); + } + + private Haplotype(Byte[] bases) { + super(bases); + } + + public Haplotype(Haplotype other) { + super(other); + } + + public Haplotype(BaseArray baseArr) { + super(baseArr.bases); + + if (baseArr.getNonNullIndices().length != baseArr.bases.length) + throw new ReviewedStingException("Should NEVER call Haplotype ctor with null bases!"); + } + + public void updateBase(int index, Byte base) { + if (base == null) { + throw new ReviewedStingException("Internal error: CANNOT have null for a missing Haplotype base!"); + } + super.updateBase(index, base); + } + + public Haplotype clone() { + try { + super.clone(); + } catch (CloneNotSupportedException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } + return new Haplotype(this); + } + + // Returns a new Haplotype containing the portion of this Haplotype between the specified fromIndex, inclusive, and toIndex, exclusive. 
+ + public Haplotype subHaplotype(int fromIndex, int toIndex) { + return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size()))); + } + + public Haplotype subHaplotype(Set inds) { + List basesList = new LinkedList(); + for (int i : inds) { + if (0 <= i && i < bases.length) + basesList.add(bases[i]); + } + + Byte[] newBases = new Byte[basesList.size()]; + int index = 0; + for (Byte b : basesList) + newBases[index++] = b; + + return new Haplotype(newBases); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java new file mode 100644 index 000000000..707bf2722 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -0,0 +1,998 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.phasing; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.vcf.*; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.*; + 
+import java.io.PrintStream; +import java.util.*; + +/** + * Computes the most likely genotype combination and phases trios and parent/child pairs + * + *

+ * PhaseByTransmission is a GATK tool that 1) computes the most likely genotype combination and phases trios and parent/child pairs given their genotype likelihoods and a mutation prior and 2) phases + * all sites where parent/child transmission can be inferred unambiguously. It reports the genotype combination (and hence phasing) probability. + * Ambiguous sites are: + *

    + *
  • Sites where all individuals are heterozygous
  • + *
  • Sites where there is a Mendelian violation
  • + *
+ * Missing genotypes are handled as follows: + *
    + *
  • In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.
  • + *
  • In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.
  • + *
  • In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous and a phasing probability is emitted.
  • + *
  • In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.
  • + *
+ * + *

Input

+ *

+ *

    + *
  • A VCF variant set containing trio(s) and/or parent/child pair(s).
  • + *
  • A PED pedigree file containing the description of the individuals' relationships.
  • + *
+ *

+ * + *

Options

+ *

+ *

    + *
  • MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that remain in mendelian violation after being assigned the most likely genotype + * combination will be reported there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission probability, + * and each individual genotype, depth, allelic depth and likelihoods.
  • + *
  • DeNovoPrior: Mutation prior; default is 1e-8
  • + *
+ *

+ * + *

Output

+ *

+ * A VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where non-ambiguous. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T PhaseByTransmission \
+ *   -V input.vcf \
+ *   -ped input.ped \
+ *   -o output.vcf
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) +public class PhaseByTransmission extends RodWalker, HashMap> { + + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + @Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.") + private PrintStream mvFile = null; + + @Argument(shortName = "prior",required = false,fullName = "DeNovoPrior", doc="Prior for de novo mutations. Default: 1e-8") + private double deNovoPrior=1e-8; + + @Argument(shortName = "fatherAlleleFirst",required = false,fullName = "FatherAlleleFirst", doc="Ouputs the father allele as the first allele in phased child genotype. i.e. father|mother rather than mother|father.") + private boolean fatherFAlleleFirst=false; + + @Output + protected VariantContextWriter vcfWriter = null; + + private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP"; + private final String SOURCE_NAME = "PhaseByTransmission"; + + public final double NO_TRANSMISSION_PROB = -1.0; + + private ArrayList trios = new ArrayList(); + + //Matrix of priors for all genotype combinations + private EnumMap>> mvCountMatrix; + + //Matrix of allele transmission + private EnumMap>> transmissionMatrix; + + //Metrics counters hash keys + private final Byte NUM_TRIO_GENOTYPES_CALLED = 0; + private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1; + private final Byte NUM_TRIO_GENOTYPES_PHASED = 2; + private final Byte NUM_TRIO_HET_HET_HET = 3; + private final Byte NUM_TRIO_VIOLATIONS = 4; + private final Byte NUM_TRIO_DOUBLE_VIOLATIONS = 10; + private final Byte NUM_PAIR_GENOTYPES_CALLED = 5; + private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6; + private final Byte NUM_PAIR_GENOTYPES_PHASED = 7; + private final Byte NUM_PAIR_HET_HET = 8; + private final Byte NUM_PAIR_VIOLATIONS = 9; + private final Byte 
NUM_GENOTYPES_MODIFIED = 11; + + //Random number generator + private Random rand = new Random(); + + private enum FamilyMember { + MOTHER, + FATHER, + CHILD + } + + //Stores a conceptual trio or parent/child pair genotype combination along with its phasing. + //This combination can then be "applied" to a given trio or pair using the getPhasedGenotypes method. + private class TrioPhase { + + //Create 2 fake alleles + //The actual bases will never be used but the Genotypes created using the alleles will be. + private final Allele REF = Allele.create("A",true); + private final Allele VAR = Allele.create("A",false); + private final Allele NO_CALL = Allele.create(".",false); + private final String DUMMY_NAME = "DummySample"; + + private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class); + + private ArrayList getAlleles(GenotypeType genotype){ + ArrayList alleles = new ArrayList(2); + if(genotype == GenotypeType.HOM_REF){ + alleles.add(REF); + alleles.add(REF); + } + else if(genotype == GenotypeType.HET){ + alleles.add(REF); + alleles.add(VAR); + } + else if(genotype == GenotypeType.HOM_VAR){ + alleles.add(VAR); + alleles.add(VAR); + } + else{ + return null; + } + return alleles; + } + + private boolean isPhasable(GenotypeType genotype){ + return genotype == GenotypeType.HOM_REF || genotype == GenotypeType.HET || genotype == GenotypeType.HOM_VAR; + } + + //Create a new Genotype based on information from a single individual + //Homozygous genotypes will be set as phased, heterozygous won't be + private void phaseSingleIndividualAlleles(GenotypeType genotype, FamilyMember familyMember){ + boolean phase = genotype == GenotypeType.HOM_REF || genotype == GenotypeType.HOM_VAR; + trioPhasedGenotypes.put(familyMember, makeGenotype(genotype, phase)); + } + + private Genotype makeGenotype(final GenotypeType type, boolean phase) { + return makeGenotype(getAlleles(type), phase); + } + + private Genotype makeGenotype(final List alleles, boolean phase) { + final 
GenotypeBuilder gb = new GenotypeBuilder(DUMMY_NAME, alleles); + gb.phased(phase); + return gb.make(); + } + + //Find the phase for a parent/child pair + private void phasePairAlleles(GenotypeType parentGenotype, GenotypeType childGenotype, FamilyMember parent){ + + //Special case for Het/Het as it is ambiguous + if(parentGenotype == GenotypeType.HET && childGenotype == GenotypeType.HET){ + trioPhasedGenotypes.put(parent, makeGenotype(parentGenotype, false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childGenotype, false)); + return; + } + + ArrayList parentAlleles = getAlleles(parentGenotype); + ArrayList childAlleles = getAlleles(childGenotype); + ArrayList parentPhasedAlleles = new ArrayList(2); + ArrayList childPhasedAlleles = new ArrayList(2); + + //If there is a possible phasing between the parent and child => phase + int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); + if(childTransmittedAlleleIndex > -1){ + trioPhasedGenotypes.put(parent, makeGenotype(parentAlleles, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + if(parent.equals(FamilyMember.MOTHER)) + childPhasedAlleles.add(childAlleles.get(0)); + else + childPhasedAlleles.add(0,childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAlleles, true)); + } + else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ + parentPhasedAlleles.add(parentAlleles.get(1)); + parentPhasedAlleles.add(parentAlleles.get(0)); + trioPhasedGenotypes.put(parent, makeGenotype(parentPhasedAlleles, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + if(parent.equals(FamilyMember.MOTHER)) + childPhasedAlleles.add(childAlleles.get(0)); + else + childPhasedAlleles.add(0,childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAlleles, true)); + } + //This is a Mendelian Violation => Do not phase + else{ + 
trioPhasedGenotypes.put(parent, makeGenotype(parentGenotype, false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childGenotype, false)); + } + } + + //Phases a family by transmission + private void phaseFamilyAlleles(GenotypeType mother, GenotypeType father, GenotypeType child){ + + Set> possiblePhasedChildGenotypes = new HashSet>(); + ArrayList motherAlleles = getAlleles(mother); + ArrayList fatherAlleles = getAlleles(father); + ArrayList childAlleles = getAlleles(child); + + //Build all possible child genotypes for the given parent's genotypes + for (Allele momAllele : motherAlleles) { + for (Allele fatherAllele : fatherAlleles) { + ArrayList possiblePhasedChildAlleles = new ArrayList(2); + possiblePhasedChildAlleles.add(momAllele); + possiblePhasedChildAlleles.add(fatherAllele); + possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles); + } + } + + for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) { + int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0)); + int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1)); + //If a possible combination has been found, create the genotypes + if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) { + //Create mother's genotype + ArrayList motherPhasedAlleles = new ArrayList(2); + motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0)); + if(motherAlleles.get(0) != motherPhasedAlleles.get(0)) + motherPhasedAlleles.add(motherAlleles.get(0)); + else + motherPhasedAlleles.add(motherAlleles.get(1)); + trioPhasedGenotypes.put(FamilyMember.MOTHER, makeGenotype(motherPhasedAlleles, true)); + + //Create father's genotype + ArrayList fatherPhasedAlleles = new ArrayList(2); + fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1)); + if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0)) + fatherPhasedAlleles.add(fatherAlleles.get(0)); + else + fatherPhasedAlleles.add(fatherAlleles.get(1)); + 
trioPhasedGenotypes.put(FamilyMember.FATHER, makeGenotype(fatherPhasedAlleles,true)); + + //Create child's genotype + trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAllelesAlleles,true)); + + //Once a phased combination is found; exit + return; + } + } + + //If this is reached then no phasing could be found + trioPhasedGenotypes.put(FamilyMember.MOTHER, makeGenotype(mother,false)); + trioPhasedGenotypes.put(FamilyMember.FATHER, makeGenotype(father,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(child,false)); + } + + /* Constructor: Creates a conceptual trio genotype combination from the given genotypes. + If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair + or single individual. + */ + public TrioPhase(GenotypeType mother, GenotypeType father, GenotypeType child){ + + //Take care of cases where one or more family members are no call + if(!isPhasable(child)){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + else if(!isPhasable(mother)){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + if(!isPhasable(father)){ + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + else + phasePairAlleles(father, child, FamilyMember.FATHER); + } + else if(!isPhasable(father)){ + phasePairAlleles(mother, child, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + } + //Special case for Het/Het/Het as it is ambiguous + else if(mother == GenotypeType.HET && father == GenotypeType.HET && child == GenotypeType.HET){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + //All family members have genotypes and at least one of them 
is not Het + else{ + phaseFamilyAlleles(mother, father, child); + } + + //If child should phased genotype should be father first, then swap the alleles + if(fatherFAlleleFirst && trioPhasedGenotypes.get(FamilyMember.CHILD).isPhased()){ + ArrayList childAlleles = new ArrayList(trioPhasedGenotypes.get(FamilyMember.CHILD).getAlleles()); + childAlleles.add(childAlleles.remove(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD,makeGenotype(childAlleles,true)); + } + + } + + /** + * Applies the trio genotype combination to the given trio. + * @param ref: Reference allele + * @param alt: Alternate allele + * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination + * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination + * @param childGenotype: Genotype of the child to phase using this trio genotype combination + * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable) + * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child + */ + public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ + phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); + } + + private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ + + int phredScoreTransmission = -1; + if(transmissionProb != NO_TRANSMISSION_PROB){ + double 
dphredScoreTransmission = QualityUtils.phredScaleLog10ErrorRate(Math.log10(1 - (transmissionProb))); + phredScoreTransmission = dphredScoreTransmission < Byte.MAX_VALUE ? (byte)dphredScoreTransmission : Byte.MAX_VALUE; + } + //Handle null, missing and unavailable genotypes + //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable + //genotype so it is safe to return the original genotype in this case. + //In addition, if the phasing confidence is 0, then return the unphased, original genotypes. + if(phredScoreTransmission ==0 || genotype == null || !isPhasable(genotype.getType())) + return genotype; + + //Add the transmission probability + Map genotypeAttributes = new HashMap(); + genotypeAttributes.putAll(genotype.getExtendedAttributes()); + if(transmissionProb>NO_TRANSMISSION_PROB) + genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission); + + ArrayList phasedAlleles = new ArrayList(2); + for(Allele allele : phasedGenotype.getAlleles()){ + if(allele.isReference()) + phasedAlleles.add(refAllele); + else if(allele.isNonReference()) + phasedAlleles.add(altAllele); + //At this point there should not be any other alleles left + else + throw new UserException(String.format("BUG: Unexpected allele: %s. 
Please report.",allele.toString())); + + } + + //Compute the new Log10Error if the genotype is different from the original genotype + double log10Error; + if(genotype.getType() == phasedGenotype.getType()) + log10Error = genotype.getLog10PError(); + else + log10Error = genotype.getLikelihoods().getLog10GQ(phasedGenotype.getType()); + + return new GenotypeBuilder(genotype).alleles(phasedAlleles) + .log10PError(log10Error) + .attributes(genotypeAttributes) + .phased(phasedGenotype.isPhased()).make(); + } + + + } + + /** + * Parse the familial relationship specification, build the transmission matrices and initialize VCF writer + */ + public void initialize() { + ArrayList rodNames = new ArrayList(); + rodNames.add(variantCollection.variants.getName()); + Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); + Set vcfSamples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + + //Get the trios from the families passed as ped + setTrios(vcfSamples); + if(trios.size()<1) + throw new UserException.BadInput("No PED file passed or no *non-skipped* trios found in PED file. 
Aborted."); + + + Set headerLines = new HashSet(); + headerLines.addAll(GATKVCFUtils.getHeaderFields(this.getToolkit())); + headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the genotype combination and phase given that the genotypes are correct")); + headerLines.add(new VCFHeaderLine("source", SOURCE_NAME)); + vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); + + buildMatrices(); + + if(mvFile != null) + mvFile.println("CHROM\tPOS\tAC\tFAMILY\tTP\tMOTHER_GT\tMOTHER_DP\tMOTHER_AD\tMOTHER_PL\tFATHER_GT\tFATHER_DP\tFATHER_AD\tFATHER_PL\tCHILD_GT\tCHILD_DP\tCHILD_AD\tCHILD_PL"); + + } + + /** + * Select trios and parent/child pairs only + */ + private void setTrios(Set vcfSamples){ + + Map> families = this.getSampleDB().getFamilies(vcfSamples); + Set family; + ArrayList parents; + for(Map.Entry> familyEntry : families.entrySet()){ + family = familyEntry.getValue(); + + // Since getFamilies(vcfSamples) above still returns parents of samples in the VCF even if those parents are not in the VCF, need to subset down here: + Set familyMembersInVCF = new TreeSet(); + for(Sample familyMember : family){ + if (vcfSamples.contains(familyMember.getID())) { + familyMembersInVCF.add(familyMember); + } + } + family = familyMembersInVCF; + + if(family.size()<2 || family.size()>3){ + logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyEntry.getKey(),family.size())); + } + else{ + for(Sample familyMember : family){ + parents = familyMember.getParents(); + if(parents.size()>0){ + if(family.containsAll(parents)) + this.trios.add(familyMember); + else + logger.info(String.format("Caution: Child %s of family %s skipped as info is not provided as a complete trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. 
Child skipped.", familyMember.getID(), familyEntry.getKey())); + } + } + } + + } + + + + } + + //Create the transmission matrices + private void buildMatrices(){ + mvCountMatrix = new EnumMap>>(GenotypeType.class); + transmissionMatrix = new EnumMap>>(GenotypeType.class); + for(GenotypeType mother : GenotypeType.values()){ + mvCountMatrix.put(mother,new EnumMap>(GenotypeType.class)); + transmissionMatrix.put(mother,new EnumMap>(GenotypeType.class)); + for(GenotypeType father : GenotypeType.values()){ + mvCountMatrix.get(mother).put(father,new EnumMap(GenotypeType.class)); + transmissionMatrix.get(mother).put(father,new EnumMap(GenotypeType.class)); + for(GenotypeType child : GenotypeType.values()){ + mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child)); + transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child)); + } + } + } + } + + //Returns the number of Mendelian Violations for a given genotype combination. + //If one of the parents genotype is missing, it will consider it as a parent/child pair + //If the child genotype or both parents genotypes are missing, 0 is returned. 
+ private int getCombinationMVCount(GenotypeType mother, GenotypeType father, GenotypeType child){ + + //Child is no call => No MV + if(child == GenotypeType.NO_CALL || child == GenotypeType.UNAVAILABLE) + return 0; + //Add parents with genotypes for the evaluation + ArrayList parents = new ArrayList(); + if (!(mother == GenotypeType.NO_CALL || mother == GenotypeType.UNAVAILABLE)) + parents.add(mother); + if (!(father == GenotypeType.NO_CALL || father == GenotypeType.UNAVAILABLE)) + parents.add(father); + + //Both parents no calls => No MV + if (parents.isEmpty()) + return 0; + + //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed + int parentsNumRefAlleles = 0; + int parentsNumAltAlleles = 0; + + for(GenotypeType parent : parents){ + if(parent == GenotypeType.HOM_REF){ + parentsNumRefAlleles++; + } + else if(parent == GenotypeType.HET){ + parentsNumRefAlleles++; + parentsNumAltAlleles++; + } + else if(parent == GenotypeType.HOM_VAR){ + parentsNumAltAlleles++; + } + } + + //Case Child is HomRef + if(child == GenotypeType.HOM_REF){ + if(parentsNumRefAlleles == parents.size()) + return 0; + else return (parents.size()-parentsNumRefAlleles); + } + + //Case child is HomVar + if(child == GenotypeType.HOM_VAR){ + if(parentsNumAltAlleles == parents.size()) + return 0; + else return parents.size()-parentsNumAltAlleles; + } + + //Case child is Het + if(child == GenotypeType.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2)) + return 0; + + //MV + return 1; + } + + //Given two trio genotypes combinations, returns the number of different genotypes between the two combinations. 
+ private int countFamilyGenotypeDiff(GenotypeType motherOriginal,GenotypeType fatherOriginal,GenotypeType childOriginal,GenotypeType motherNew,GenotypeType fatherNew,GenotypeType childNew){ + int count = 0; + if(motherOriginal!=motherNew) + count++; + if(fatherOriginal!=fatherNew) + count++; + if(childOriginal!=childNew) + count++; + return count; + } + + //Get a Map of genotype likelihoods. + //In case of null, unavailable or no call, all likelihoods are 1/3. + private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ + if(genotype == null || !genotype.isCalled() || genotype.getLikelihoods() == null){ + EnumMap likelihoods = new EnumMap(GenotypeType.class); + likelihoods.put(GenotypeType.HOM_REF,1.0/3.0); + likelihoods.put(GenotypeType.HET,1.0/3.0); + likelihoods.put(GenotypeType.HOM_VAR,1.0/3.0); + return likelihoods; + } + return genotype.getLikelihoods().getAsMap(true); + } + + //Returns the GenotypeType; returns UNVAILABLE if given null + private GenotypeType getTypeSafeNull(Genotype genotype){ + if(genotype == null) + return GenotypeType.UNAVAILABLE; + return genotype.getType(); + } + + + /** + * Phases the genotypes of the given trio. If one of the parents is null, it is considered a parent/child pair. 
+ * @param ref: Reference allele + * @param alt: Alternative allele + * @param mother: Mother's genotype + * @param father: Father's genotype + * @param child: Child's genotype + * @param finalGenotypes: An ArrayList that will be added the genotypes phased by transmission in the following order: Mother, Father, Child + * @return + */ + private int phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { + + //Check whether it is a pair or trio + //Always assign the first parent as the parent having genotype information in pairs + //Always assign the mother as the first parent in trios + int parentsCalled = 0; + Map firstParentLikelihoods; + Map secondParentLikelihoods; + ArrayList bestFirstParentGenotype = new ArrayList(); + ArrayList bestSecondParentGenotype = new ArrayList(); + ArrayList bestChildGenotype = new ArrayList(); + GenotypeType pairSecondParentGenotype = null; + if(mother == null || !mother.isCalled()){ + firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father); + secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); + bestFirstParentGenotype.add(getTypeSafeNull(father)); + bestSecondParentGenotype.add(getTypeSafeNull(mother)); + pairSecondParentGenotype = mother == null ? GenotypeType.UNAVAILABLE : mother.getType(); + if(father != null && father.isCalled()) + parentsCalled = 1; + } + else{ + firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); + secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father); + bestFirstParentGenotype.add(getTypeSafeNull(mother)); + bestSecondParentGenotype.add(getTypeSafeNull(father)); + if(father == null || !father.isCalled()){ + parentsCalled = 1; + pairSecondParentGenotype = father == null ? 
GenotypeType.UNAVAILABLE : father.getType(); + }else{ + parentsCalled = 2; + } + } + Map childLikelihoods = getLikelihoodsAsMapSafeNull(child); + bestChildGenotype.add(getTypeSafeNull(child)); + + //Prior vars + double bestConfigurationLikelihood = 0.0; + double norm = 0.0; + int configuration_index =0; + ArrayList bestMVCount = new ArrayList(); + bestMVCount.add(0); + + //Get the most likely combination + //Only check for most likely combination if at least a parent and the child have genotypes + if(child.isCalled() && parentsCalled > 0){ + int mvCount; + int cumulativeMVCount = 0; + double configurationLikelihood = 0; + for(Map.Entry childGenotype : childLikelihoods.entrySet()){ + for(Map.Entry firstParentGenotype : firstParentLikelihoods.entrySet()){ + for(Map.Entry secondParentGenotype : secondParentLikelihoods.entrySet()){ + mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey()); + //For parent/child pairs, sum over the possible genotype configurations of the missing parent + if(parentsCalled<2){ + cumulativeMVCount += mvCount; + configurationLikelihood += mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); + } + //Evaluate configurations of trios + else{ + configurationLikelihood = mvCount>0 ? 
Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); + norm += configurationLikelihood; + //Keep this combination if + //It has a better likelihood + //Or it has the same likelihood but requires less changes from original genotypes + if (configurationLikelihood > bestConfigurationLikelihood){ + bestConfigurationLikelihood = configurationLikelihood; + bestMVCount.clear(); + bestMVCount.add(mvCount); + bestFirstParentGenotype.clear(); + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.clear(); + bestSecondParentGenotype.add(secondParentGenotype.getKey()); + bestChildGenotype.clear(); + bestChildGenotype.add(childGenotype.getKey()); + } + else if(configurationLikelihood == bestConfigurationLikelihood) { + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(secondParentGenotype.getKey()); + bestChildGenotype.add(childGenotype.getKey()); + bestMVCount.add(mvCount); + } + } + } + //Evaluate configurations of parent/child pairs + if(parentsCalled<2){ + norm += configurationLikelihood; + //Keep this combination if + //It has a better likelihood + //Or it has the same likelihood but requires less changes from original genotypes + if (configurationLikelihood > bestConfigurationLikelihood){ + bestConfigurationLikelihood = configurationLikelihood; + bestMVCount.clear(); + bestMVCount.add(cumulativeMVCount/3); + bestChildGenotype.clear(); + bestFirstParentGenotype.clear(); + bestSecondParentGenotype.clear(); + bestChildGenotype.add(childGenotype.getKey()); + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(pairSecondParentGenotype); + } + else if(configurationLikelihood == bestConfigurationLikelihood) { + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + 
bestSecondParentGenotype.add(pairSecondParentGenotype); + bestChildGenotype.add(childGenotype.getKey()); + bestMVCount.add(cumulativeMVCount/3); + } + configurationLikelihood = 0; + } + } + } + + //normalize the best configuration probability + bestConfigurationLikelihood = bestConfigurationLikelihood / norm; + + //In case of multiple equally likely combinations, take a random one + if(bestFirstParentGenotype.size()>1){ + configuration_index = rand.nextInt(bestFirstParentGenotype.size()-1); + } + + } + else{ + bestConfigurationLikelihood = NO_TRANSMISSION_PROB; + } + + TrioPhase phasedTrioGenotypes; + if(parentsCalled < 2 && mother == null || !mother.isCalled()) + phasedTrioGenotypes = transmissionMatrix.get(bestSecondParentGenotype.get(configuration_index)).get(bestFirstParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); + else + phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); + + //Return the phased genotypes + phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); + return bestMVCount.get(configuration_index); + + } + + + private void updatePairMetricsCounters(Genotype parent, Genotype child, int mvCount, HashMap counters){ + + //Increment metrics counters + if(parent.isCalled() && child.isCalled()){ + counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1); + if(parent.isPhased()) + counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1); + else{ + counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount); + if(parent.isHet() && child.isHet()) + counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1); + } + }else{ + counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1); + } + + } + + private void 
updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, int mvCount, HashMap counters){ + + //Increment metrics counters + if(mother.isCalled() && father.isCalled() && child.isCalled()){ + counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1); + if(mother.isPhased()) + counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1); + + else{ + if(mvCount > 0){ + if(mvCount >1) + counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1); + else + counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1); + } + else if(mother.isHet() && father.isHet() && child.isHet()) + counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1); + + } + }else{ + counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1); + } + } + + /** + * For each variant in the file, determine the phasing for the child and replace the child's genotype with the trio's genotype + * + * @param tracker the reference meta-data tracker + * @param ref the reference context + * @param context the alignment context + * @return null + */ + @Override + public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + HashMap metricsCounters = new HashMap(10); + metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_TRIO_HET_HET_HET,0); + metricsCounters.put(NUM_TRIO_VIOLATIONS,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_PAIR_HET_HET,0); + metricsCounters.put(NUM_PAIR_VIOLATIONS,0); + metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); + + String mvfLine; + + if (tracker == null) + return metricsCounters; + + final 
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); + if ( vc == null ) + return metricsCounters; + + if ( !vc.isBiallelic() ) { + vcfWriter.add(vc); + return metricsCounters; + } + + final VariantContextBuilder builder = new VariantContextBuilder(vc); + + final GenotypesContext genotypesContext = GenotypesContext.copy(vc.getGenotypes()); + for (Sample sample : trios) { + Genotype mother = vc.getGenotype(sample.getMaternalID()); + Genotype father = vc.getGenotype(sample.getPaternalID()); + Genotype child = vc.getGenotype(sample.getID()); + + //Keep only trios and parent/child pairs + if(mother == null && father == null || child == null) + continue; + + ArrayList trioGenotypes = new ArrayList(3); + final int mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes); + + Genotype phasedMother = trioGenotypes.get(0); + Genotype phasedFather = trioGenotypes.get(1); + Genotype phasedChild = trioGenotypes.get(2); + + //Fill the genotype map with the new genotypes and increment metrics counters + genotypesContext.replace(phasedChild); + if(mother != null){ + genotypesContext.replace(phasedMother); + if(father != null){ + genotypesContext.replace(phasedFather); + updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s", + vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(), + phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),printAD(phasedMother.getAD()), + phasedMother.getLikelihoodsString(), phasedFather.getGenotypeString(),phasedFather.getDP(),printAD(phasedFather.getAD()),phasedFather.getLikelihoodsString(), + 
phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString()); + if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); + } + else{ + updatePairMetricsCounters(phasedMother,phasedChild,mvCount,metricsCounters); + if(!(phasedMother.getType()==mother.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s:%s:%s:%s\t.\t.\t.\t.\t%s\t%s\t%s\t%s", + vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(), + phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),printAD(phasedMother.getAD()),phasedMother.getLikelihoodsString(), + phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString()); + } + } + else{ + genotypesContext.replace(phasedFather); + updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters); + if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t.\t.\t.\t.\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s", + vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(), + phasedFather.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getDP(),printAD(phasedFather.getAD()),phasedFather.getLikelihoodsString(), + phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString()); + } + + //Report violation if set so + //TODO: ADAPT 
FOR PAIRS TOO!! + if(mvCount>0 && mvFile != null && !vc.isFiltered()) + mvFile.println(mvfLine); + } + + builder.genotypes(genotypesContext); + vcfWriter.add(builder.make()); + + return metricsCounters; + } + + private static String printAD(final int[] AD) { + if ( AD == null || AD.length == 0 ) + return "."; + final StringBuilder sb = new StringBuilder(); + sb.append(AD[0]); + for ( int i = 1; i < AD.length; i++) { + sb.append(","); + sb.append(AD[i]); + } + return sb.toString(); + } + + /** + * Initializes the reporting counters. + * + * @return All counters initialized to 0 + */ + @Override + public HashMap reduceInit() { + HashMap metricsCounters = new HashMap(10); + metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_TRIO_HET_HET_HET,0); + metricsCounters.put(NUM_TRIO_VIOLATIONS,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_PAIR_HET_HET,0); + metricsCounters.put(NUM_PAIR_VIOLATIONS,0); + metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); + + return metricsCounters; + } + + /** + * Adds the value of the site phased to the reporting counters. + * + * @param value Site values + * @param sum accumulator for the reporting counters + * @return accumulator with result of the map taken into account. 
+ */ + @Override + public HashMap reduce(HashMap value, HashMap sum) { + sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED)); + sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL)); + sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED)); + sum.put(NUM_TRIO_HET_HET_HET,value.get(NUM_TRIO_HET_HET_HET)+sum.get(NUM_TRIO_HET_HET_HET)); + sum.put(NUM_TRIO_VIOLATIONS,value.get(NUM_TRIO_VIOLATIONS)+sum.get(NUM_TRIO_VIOLATIONS)); + sum.put(NUM_PAIR_GENOTYPES_CALLED,value.get(NUM_PAIR_GENOTYPES_CALLED)+sum.get(NUM_PAIR_GENOTYPES_CALLED)); + sum.put(NUM_PAIR_GENOTYPES_NOCALL,value.get(NUM_PAIR_GENOTYPES_NOCALL)+sum.get(NUM_PAIR_GENOTYPES_NOCALL)); + sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED)); + sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET)); + sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS)); + sum.put(NUM_TRIO_DOUBLE_VIOLATIONS,value.get(NUM_TRIO_DOUBLE_VIOLATIONS)+sum.get(NUM_TRIO_DOUBLE_VIOLATIONS)); + sum.put(NUM_GENOTYPES_MODIFIED,value.get(NUM_GENOTYPES_MODIFIED)+sum.get(NUM_GENOTYPES_MODIFIED)); + + return sum; + } + + + /** + * Reports statistics on the phasing by transmission process. + * @param result Accumulator with all counters. 
+ */ + @Override + public void onTraversalDone(HashMap result) { + logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED)); + logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL)); + logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED)); + logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_TRIO_HET_HET_HET)); + logger.info("Number of remaining single mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS)); + logger.info("Number of remaining double mendelian violations in trios: " + result.get(NUM_TRIO_DOUBLE_VIOLATIONS)); + logger.info("Number of complete pair-genotypes: " + result.get(NUM_PAIR_GENOTYPES_CALLED)); + logger.info("Number of pair-genotypes containing no call(s): " + result.get(NUM_PAIR_GENOTYPES_NOCALL)); + logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED)); + logger.info("Number of resulting Het/Het pairs: " + result.get(NUM_PAIR_HET_HET)); + logger.info("Number of remaining mendelian violations in pairs: " + result.get(NUM_PAIR_VIOLATIONS)); + logger.info("Number of genotypes updated: " + result.get(NUM_GENOTYPES_MODIFIED)); + + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java similarity index 100% rename from 
protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java new file mode 100644 index 000000000..7ed77b845 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java @@ -0,0 +1,1870 @@ +/* +* By downloading the 
PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.phasing; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.HasGenomeLocation; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.variant.vcf.*; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import 
org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; + +import java.io.*; +import java.util.*; + +import static org.broadinstitute.sting.utils.variant.GATKVCFUtils.getVCFHeadersFromRods; + +/** + * Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads). + * + * The current implementation works for diploid SNPs, and will transparently (but properly) ignore other sites. + * + * The underlying algorithm is based on building up 2^n local haplotypes, + * where n is the number of heterozygous SNPs in the local region we expected to find phase-informative reads (and assumes a maximum value of maxPhaseSites, a user parameter). + * Then, these 2^n haplotypes are used to determine, with sufficient certainty (the assigned PQ score), to which haplotype the alleles of a genotype at a particular locus belong (denoted by the HP tag). + * + *

+ * <p>
+ * Performs physical phasing of SNP calls, based on sequencing reads.
+ * </p>
+ *
+ * <h3>Input</h3>
+ * <p>
+ * VCF file of SNP calls, BAM file of sequence reads.
+ * </p>

+ *
+ * <h3>Output</h3>
+ * <p>
+ * Phased VCF file.
+ * </p>

+ *
+ * <h3>Examples</h3>
+ * <pre>
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T ReadBackedPhasing
+ *      -R reference.fasta
+ *      -I reads.bam
+ *      --variant SNPs.vcf
+ *      -L SNPs.vcf
+ *      -o phased_SNPs.vcf
+ *      --phaseQualityThresh 20.0
+ * </pre>
+ * + * @author Menachem Fromer + * @since July 2010 + */ +@Allows(value = {DataSource.READS, DataSource.REFERENCE}) +@Requires(value = {DataSource.READS, DataSource.REFERENCE}) +@By(DataSource.READS) + +// Filter out all reads with zero mapping quality +@ReadFilters({MappingQualityZeroFilter.class}) + +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) +public class ReadBackedPhasing extends RodWalker { + @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information (if -l DEBUG is also specified)", required = false) + protected boolean DEBUG = false; + /** + * The VCF file we are phasing variants from. + * + * All heterozygous variants found in this VCF file will be phased, where possible + */ + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + @Output(doc = "File to which variants should be written") + protected VariantContextWriter writer = null; + + @Argument(fullName = "cacheWindowSize", shortName = "cacheWindow", doc = "The window size (in bases) to cache variant sites and their reads for the phasing procedure", required = false) + protected Integer cacheWindow = 20000; + + @Argument(fullName = "maxPhaseSites", shortName = "maxSites", doc = "The maximum number of successive heterozygous sites permitted to be used by the phasing algorithm", required = false) + protected Integer maxPhaseSites = 10; // 2^10 == 10^3 diploid haplotypes + + @Argument(fullName = "phaseQualityThresh", shortName = "phaseThresh", doc = "The minimum phasing quality score required to output phasing", required = false) + protected Double phaseQualityThresh = 20.0; // PQ = 20.0 <=> P(error) = 10^(-20/10) = 0.01, P(correct) = 0.99 + + @Hidden + @Argument(fullName = "variantStatsFilePrefix", shortName = "variantStats", doc = "The prefix of the VCF/phasing statistics files [For DEBUGGING 
purposes only - DO NOT USE!]", required = false) + protected String variantStatsFilePrefix = null; + private PhasingQualityStatsWriter statsWriter = null; + + @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for phasing", required = false) + public int MIN_BASE_QUALITY_SCORE = 17; + + @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for phasing", required = false) + public int MIN_MAPPING_QUALITY_SCORE = 20; + + @Argument(fullName = "sampleToPhase", shortName = "sampleToPhase", doc = "Only include these samples when phasing", required = false) + protected Set samplesToPhase = null; + + @Hidden + @Argument(fullName = "permitNoSampleOverlap", shortName = "permitNoSampleOverlap", doc = "Don't exit (just WARN) when the VCF and BAMs do not overlap in samples", required = false) + private boolean permitNoSampleOverlap = false; + + private GenomeLoc mostDownstreamLocusReached = null; + + private LinkedList unphasedSiteQueue = null; + private CloneableIteratorLinkedList partiallyPhasedSites = null; // the phased VCs to be emitted, and the alignment bases at these positions + + private static PreciseNonNegativeDouble ZERO = new PreciseNonNegativeDouble(0.0); + + public static final String PQ_KEY = "PQ"; + public static final String HP_KEY = "HP"; + + // In order to detect phase inconsistencies: + private static final double FRACTION_OF_MEAN_PQ_CHANGES = 0.1; // If the PQ decreases by this fraction of the mean PQ changes (thus far), then this read is inconsistent with previous reads + private static final double MAX_FRACTION_OF_INCONSISTENT_READS = 0.1; // If there are more than this fraction of inconsistent reads, then flag this site + + public static final String PHASING_INCONSISTENT_KEY = "PhasingInconsistent"; + + @Argument(fullName = "enableMergePhasedSegregatingPolymorphismsToMNP", shortName = "enableMergeToMNP", 
doc = "Merge consecutive phased sites into MNP records", required = false) + protected boolean enableMergePhasedSegregatingPolymorphismsToMNP = false; + + @Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record", required = false) + protected int maxGenomicDistanceForMNP = 1; + + @Hidden + @Argument(fullName = "outputMultipleBaseCountsFile", shortName = "outputMultipleBaseCountsFile", doc = "File to output cases where a single read has multiple bases at the same position [For DEBUGGING purposes only - DO NOT USE!]", required = false) + protected File outputMultipleBaseCountsFile = null; + private MultipleBaseCountsWriter outputMultipleBaseCountsWriter = null; + + public void initialize() { + if (maxPhaseSites <= 2) + maxPhaseSites = 2; // by definition, must phase a site relative to previous site [thus, 2 in total] + + /* + Since we cap each base quality (BQ) by its read's mapping quality (MQ) [in Read.updateBaseAndQuality()], then: + if minBQ > minMQ, then we require that MQ be >= minBQ as well. + [Otherwise, we end up capping BQ by MQ only AFTER we tried removing bases with BQ < minBQ, which is WRONG!] 
+ + To do this properly, we set: minMQ = max(minMQ, minBQ) + */ + MIN_MAPPING_QUALITY_SCORE = Math.max(MIN_MAPPING_QUALITY_SCORE, MIN_BASE_QUALITY_SCORE); + + unphasedSiteQueue = new LinkedList(); + partiallyPhasedSites = new CloneableIteratorLinkedList(); + + initializeVcfWriter(); + + if (variantStatsFilePrefix != null) + statsWriter = new PhasingQualityStatsWriter(variantStatsFilePrefix); + + if (outputMultipleBaseCountsFile != null) + outputMultipleBaseCountsWriter = new MultipleBaseCountsWriter(outputMultipleBaseCountsFile); + } + + private void initializeVcfWriter() { + // Wrapper VCFWriters will take ownership of inner writers iff: inner writer != origWriter [which wasn't created here] + VariantContextWriter origWriter = writer; + + if (enableMergePhasedSegregatingPolymorphismsToMNP) + writer = new MergeSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, writer != origWriter); + + /* Due to discardIrrelevantPhasedSites(), the startDistance spanned by [partiallyPhasedSites.peek(), unphasedSiteQueue.peek()] is <= cacheWindow + Due to processQueue(), the startDistance spanned by [unphasedSiteQueue.peek(), mostDownstreamLocusReached] is <= cacheWindow + Hence, the startDistance between: partiallyPhasedSites.peek() --> mostDownstreamLocusReached is <= 2 * cacheWindow + + Therefore, can write the filtered records located at mostDownstreamLocusReached (if any) to SortingVCFWriter, even though partiallyPhasedSites.peek() has not yet been written. + + But, NOTE that map() is careful to pass out a list of records to be written that FIRST includes any records discarded due to having reached mostDownstreamLocusReached, + and only THEN records located at mostDownstreamLocusReached. The opposite order in map() would violate the startDistance limits imposed when contracting SortingVCFWriter with (2 * cacheWindow). 
+ */ + writer = VariantContextWriterFactory.sortOnTheFly(writer, 2 * cacheWindow, writer != origWriter); + + // setup the header fields: + Set hInfo = new HashSet(); + hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit())); + hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); + + // Phasing-specific INFO fields: + hInfo.add(new VCFFormatHeaderLine(PQ_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); + hInfo.add(new VCFFormatHeaderLine(HP_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Read-backed phasing haplotype identifiers")); + hInfo.add(new VCFInfoHeaderLine(PHASING_INCONSISTENT_KEY, 0, VCFHeaderLineType.Flag, "Are the reads significantly haplotype-inconsistent?")); + + // todo -- fix samplesToPhase + String trackName = variantCollection.variants.getName(); + Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); + Set vcfSamples = new TreeSet(samplesToPhase == null ? rodNameToHeader.get(trackName).getGenotypeSamples() : samplesToPhase); + writer.writeHeader(new VCFHeader(hInfo, vcfSamples)); + + Set readSamples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + readSamples.retainAll(vcfSamples); + if (readSamples.isEmpty()) { + String noPhaseString = "No common samples in VCF and BAM headers" + (samplesToPhase == null ? "" : " (limited to sampleToPhase parameters)") + ", so nothing could possibly be phased!"; + if (permitNoSampleOverlap) + logger.warn(noPhaseString); + else + throw new UserException(noPhaseString); + } + } + + public PhasingStats reduceInit() { + return new PhasingStats(); + } + + /** + * For each site of interest, cache the current site and then use the cache to phase all sites + * for which "sufficient" information has already been observed. 
+ * + * @param tracker the meta-data tracker + * @param ref the reference base + * @param context the context for the given locus + * @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range. + */ + public PhasingStatsAndOutput map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if (tracker == null) + return null; + + mostDownstreamLocusReached = ref.getLocus(); + if (DEBUG) logger.debug("map() at: " + mostDownstreamLocusReached); + + PhasingStats phaseStats = new PhasingStats(); + List unprocessedList = new LinkedList(); + + for (VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) { + if (samplesToPhase != null) vc = reduceVCToSamples(vc, samplesToPhase); + + if (ReadBackedPhasing.processVariantInPhasing(vc)) { + VariantAndReads vr = new VariantAndReads(vc, context); + unphasedSiteQueue.add(vr); + + if (DEBUG) + logger.debug("Added variant to queue = " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant)); + } + else { + unprocessedList.add(vc); // Finished with the unprocessed variant, and writer can enforce sorting on-the-fly + + if (DEBUG) + logger.debug("Unprocessed variant = " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); + } + + int numReads = context.getBasePileup().getNumberOfElements(); + PhasingStats addInPhaseStats = new PhasingStats(numReads, 1); + phaseStats.addIn(addInPhaseStats); + } + + List completedList = processQueue(phaseStats, false); + completedList.addAll(unprocessedList); // add unprocessedList on to the END of completedList so that the processQueue() results, which are necessarily more upstream, are first! 
+ + return new PhasingStatsAndOutput(phaseStats, completedList); + } + + private static final Set KEYS_TO_KEEP_IN_REDUCED_VCF = new HashSet(Arrays.asList(PQ_KEY)); + + private VariantContext reduceVCToSamples(VariantContext vc, Set samplesToPhase) { +// for ( String sample : samplesToPhase ) +// logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() )); + VariantContext subvc = vc.subContextFromSamples(samplesToPhase); +// logger.debug("original VC = " + vc); +// logger.debug("sub VC = " + subvc); + return GATKVariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF); + } + + // Phase all "waiting" genotypes in the unphasedSiteQueue, but only if we have sufficient downstream genotypes with which to phase them + private List processQueue(PhasingStats phaseStats, boolean processAll) { + List oldPhasedList = new LinkedList(); + + while (!unphasedSiteQueue.isEmpty()) { + if (!processAll) { // otherwise, phase until the end of unphasedSiteQueue + VariantContext nextToPhaseVc = unphasedSiteQueue.peek().variant; + if (startDistancesAreInWindowRange(mostDownstreamLocusReached, GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextToPhaseVc))) { + /* mostDownstreamLocusReached is still not far enough ahead of nextToPhaseVc to have all phasing information for nextToPhaseVc + (note that we ASSUME that the VCF is ordered by ). + Note that this will always leave at least one entry (the last one), since mostDownstreamLocusReached is in range of itself. 
+ */ + break; + } + // Already saw all variant positions within cacheWindow startDistance ahead of vc (on its contig) + } + // Update partiallyPhasedSites before it's used in phaseSite: + oldPhasedList.addAll(discardIrrelevantPhasedSites()); + if (DEBUG) logger.debug("oldPhasedList(1st) = " + toStringVCL(oldPhasedList)); + + VariantAndReads vr = unphasedSiteQueue.remove(); + if (DEBUG) + logger.debug("Performing phasing for " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant)); + phaseSite(vr, phaseStats); + } + + // Update partiallyPhasedSites after phaseSite is done: + oldPhasedList.addAll(discardIrrelevantPhasedSites()); + if (DEBUG) logger.debug("oldPhasedList(2nd) = " + toStringVCL(oldPhasedList)); + + if (outputMultipleBaseCountsWriter != null) + outputMultipleBaseCountsWriter.outputMultipleBaseCounts(); + + return oldPhasedList; + } + + // Flush out sites with (possibly) phased genotypes, if those sites are no longer needed to phase other downstream sites + private List discardIrrelevantPhasedSites() { + List vcList = new LinkedList(); + + GenomeLoc nextToPhaseLoc = null; + if (!unphasedSiteQueue.isEmpty()) + nextToPhaseLoc = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), unphasedSiteQueue.peek().variant); + + while (!partiallyPhasedSites.isEmpty()) { + if (nextToPhaseLoc != null) { // otherwise, unphasedSiteQueue.isEmpty(), and therefore no need to keep any of the "past" + UnfinishedVariantAndReads partPhasedVr = partiallyPhasedSites.peek(); + + if (startDistancesAreInWindowRange(partPhasedVr.unfinishedVariant.getLocation(), nextToPhaseLoc)) + // nextToPhaseLoc is still not far enough ahead of partPhasedVr to exclude partPhasedVr from calculations + break; + } + UnfinishedVariantAndReads uvr = partiallyPhasedSites.remove(); + vcList.add(uvr.unfinishedVariant.toVariantContext()); + } + + return vcList; + } + + /* Phase vc (removed head of unphasedSiteQueue) using all VariantContext objects in + 
partiallyPhasedSites, and all in unphasedSiteQueue that are within cacheWindow startDistance ahead of vc (on its contig). + + ASSUMES: All VariantContexts in unphasedSiteQueue are in positions downstream of vc (head of queue). + */ + + private void phaseSite(VariantAndReads vr, PhasingStats phaseStats) { + VariantContext vc = vr.variant; + logger.debug("Will phase vc = " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); + + UnfinishedVariantAndReads uvr = new UnfinishedVariantAndReads(vr); + UnfinishedVariantContext uvc = uvr.unfinishedVariant; + + // Perform per-sample phasing: + GenotypesContext sampGenotypes = vc.getGenotypes(); + Map samplePhaseStats = new TreeMap(); + for (final Genotype gt : sampGenotypes) { + String samp = gt.getSampleName(); + + if (DEBUG) logger.debug("sample = " + samp); + if (isUnfilteredCalledDiploidGenotype(gt)) { + if (gt.isHet()) { // Attempt to phase this het genotype relative to *SOME* previous het genotype: + + // Create the list of all het genotypes preceding this one (and in the phasing window as contained in partiallyPhasedSites): + List prevHetGenotypes = new LinkedList(); + CloneableIteratorLinkedList.CloneableIterator phasedIt = partiallyPhasedSites.iterator(); + while (phasedIt.hasNext()) { + UnfinishedVariantAndReads phasedVr = phasedIt.next(); + Genotype prevGt = phasedVr.unfinishedVariant.getGenotype(samp); + if (prevGt != null && isUnfilteredCalledDiploidGenotype(prevGt) && prevGt.isHet()) { + GenotypeAndReadBases grb = new GenotypeAndReadBases(prevGt, phasedVr.sampleReadBases.get(samp), phasedVr.unfinishedVariant.getLocation()); + prevHetGenotypes.add(grb); + if (DEBUG) logger.debug("Using UPSTREAM het site = " + grb.loc); + } + } + + SNPallelePair allelePair = new SNPallelePair(gt); + if (DEBUG) logger.debug("Want to phase TOP vs. 
BOTTOM for: " + "\n" + allelePair); + + boolean phasedCurGenotypeRelativeToPrevious = false; + for (int goBackFromEndOfPrevHets = 0; goBackFromEndOfPrevHets < prevHetGenotypes.size(); goBackFromEndOfPrevHets++) { + PhasingWindow phaseWindow = new PhasingWindow(vr, samp, prevHetGenotypes, goBackFromEndOfPrevHets); + + PhaseResult pr = phaseSampleAtSite(phaseWindow); + phasedCurGenotypeRelativeToPrevious = passesPhasingThreshold(pr.phaseQuality); + + if (pr.phasingContainsInconsistencies) { + if (DEBUG) + logger.debug("MORE than " + (MAX_FRACTION_OF_INCONSISTENT_READS * 100) + "% of the reads are inconsistent for phasing of " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); + uvc.setPhasingInconsistent(); + } + + if (phasedCurGenotypeRelativeToPrevious) { + Genotype prevHetGenotype = phaseWindow.phaseRelativeToGenotype(); + SNPallelePair prevAllelePair = new SNPallelePair(prevHetGenotype); + if (!prevHetGenotype.hasAnyAttribute(HP_KEY)) + throw new ReviewedStingException("Internal error: missing haplotype markings for previous genotype, even though we put it there..."); + String[] prevPairNames = (String[]) prevHetGenotype.getAnyAttribute(HP_KEY); + + String[] curPairNames = ensurePhasing(allelePair, prevAllelePair, prevPairNames, pr.haplotype); + Genotype phasedGt = new GenotypeBuilder(gt) + .alleles(allelePair.getAllelesAsList()) + .attribute(PQ_KEY, pr.phaseQuality) + .attribute(HP_KEY, curPairNames) + .make(); + uvc.setGenotype(samp, phasedGt); + + if (DEBUG) { + logger.debug("PREVIOUS CHROMOSOME NAMES: Top= " + prevPairNames[0] + ", Bot= " + prevPairNames[1]); + logger.debug("PREVIOUS CHROMOSOMES:\n" + prevAllelePair + "\n"); + + logger.debug("CURRENT CHROMOSOME NAMES: Top= " + curPairNames[0] + ", Bot= " + curPairNames[1]); + logger.debug("CURRENT CHROMOSOMES:\n" + allelePair + "\n"); + logger.debug("\n"); + } + } + + if (statsWriter != null) { + GenomeLoc prevLoc = null; + int curIndex = 0; + for (GenotypeAndReadBases grb : 
prevHetGenotypes) { + if (curIndex == prevHetGenotypes.size() - 1 - goBackFromEndOfPrevHets) { + prevLoc = grb.loc; + break; + } + ++curIndex; + } + statsWriter.addStat(samp, GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc), startDistance(prevLoc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length); + } + + PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp); + if (sampPhaseCounts == null) { + sampPhaseCounts = new PhaseCounts(); + samplePhaseStats.put(samp, sampPhaseCounts); + } + sampPhaseCounts.numTestedSites++; + + if (pr.phasingContainsInconsistencies) { + if (phasedCurGenotypeRelativeToPrevious) + sampPhaseCounts.numInconsistentSitesPhased++; + else + sampPhaseCounts.numInconsistentSitesNotPhased++; + } + + if (phasedCurGenotypeRelativeToPrevious) + sampPhaseCounts.numPhased++; + + // Phased current relative to *SOME* previous het genotype, so break out of loop: + if (phasedCurGenotypeRelativeToPrevious) + break; + } + + if (!phasedCurGenotypeRelativeToPrevious) { // Either no previous hets, or unable to phase relative to any previous het: + String locStr = Integer.toString(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc).getStart()); + + Genotype startNewHaplotypeGt = new GenotypeBuilder(gt) + .attribute(HP_KEY, new String[]{locStr + "-1", locStr + "-2"}) + .make(); + + uvc.setGenotype(samp, startNewHaplotypeGt); + } + } + } + } + + partiallyPhasedSites.add(uvr); // only add it in now, since don't want it to be there during phasing + phaseStats.addIn(new PhasingStats(samplePhaseStats)); + } + + public boolean passesPhasingThreshold(double PQ) { + return PQ >= phaseQualityThresh; + } + + // A genotype and the base pileup that supports it + private static class GenotypeAndReadBases { + public Genotype genotype; + public ReadBasesAtPosition readBases; + public GenomeLoc loc; + + public GenotypeAndReadBases(Genotype genotype, ReadBasesAtPosition readBases, GenomeLoc 
loc) { + this.genotype = genotype; + this.readBases = readBases; + this.loc = loc; + } + } + + // Object to represent the local window of het genotypes for which haplotypes are being scored and ranked + private class PhasingWindow { + private Genotype[] hetGenotypes = null; + + private int phaseRelativeToIndex = -1; + private int phasingSiteIndex = -1; + + private Map readsAtHetSites = null; + + public Genotype phaseRelativeToGenotype() { + return hetGenotypes[phaseRelativeToIndex]; + } + + // ASSUMES that: isUnfilteredCalledDiploidGenotype(vrGt) && vrGt.isHet() [vrGt = vr.variant.getGenotype(sample)] + + public PhasingWindow(VariantAndReads vr, String sample, List prevHetGenotypes, int goBackFromEndOfPrevHets) { + if (prevHetGenotypes.isEmpty() || goBackFromEndOfPrevHets >= prevHetGenotypes.size()) // no previous sites against which to phase + throw new ReviewedStingException("Should never get empty set of previous sites to phase against"); + + // Include these previously phased sites in the phasing computation: + List listHetGenotypes = new LinkedList(prevHetGenotypes); + + phaseRelativeToIndex = listHetGenotypes.size() - 1 - goBackFromEndOfPrevHets; + phasingSiteIndex = listHetGenotypes.size(); + + // Add the (het) position to be phased [at phasingSiteIndex]: + GenomeLoc phaseLocus = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant); + GenotypeAndReadBases grbPhase = new GenotypeAndReadBases(vr.variant.getGenotype(sample), vr.sampleReadBases.get(sample), phaseLocus); + listHetGenotypes.add(grbPhase); + if (DEBUG) logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]"); + + // Include as-of-yet unphased sites in the phasing computation: + for (VariantAndReads nextVr : unphasedSiteQueue) { + if (!startDistancesAreInWindowRange(vr.variant, nextVr.variant)) //nextVr too far ahead of the range used for phasing vc + break; + Genotype gt = nextVr.variant.getGenotype(sample); + if (gt != null 
&& isUnfilteredCalledDiploidGenotype(gt) && gt.isHet()) { + GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, nextVr.sampleReadBases.get(sample), GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextVr.variant)); + listHetGenotypes.add(grb); + if (DEBUG) logger.debug("Using DOWNSTREAM het site = " + grb.loc); + } + } + + // First, assemble the "sub-reads" from the COMPLETE WINDOW-BASED SET of heterozygous positions for this sample: + buildReadsAtHetSites(listHetGenotypes, sample, grbPhase.loc); + + // Remove extraneous reads (those that do not "connect" the two core phasing sites): + Set onlyKeepReads = removeExtraneousReads(listHetGenotypes.size()); + + // Dynamically modify the window to only include sites which have a non-empty set of reads: + listHetGenotypes = removeExtraneousSites(listHetGenotypes); + + // In any case, must still trim the window size to be "feasible" + // [**NOTE**: May want to do this to try maximize the preservation of paths from phaseRelativeToIndex to phasingSiteIndex]: + if (listHetGenotypes.size() > maxPhaseSites) { + listHetGenotypes = trimWindow(listHetGenotypes, sample, phaseLocus); + + // Can now remove any extra reads (and then sites): + buildReadsAtHetSites(listHetGenotypes, onlyKeepReads); + onlyKeepReads = removeExtraneousReads(listHetGenotypes.size()); + listHetGenotypes = removeExtraneousSites(listHetGenotypes); + } + + // Lastly, assemble the "sub-reads" from the FINAL SET of heterozygous positions for this sample: + buildReadsAtHetSites(listHetGenotypes, onlyKeepReads); + + // Copy to a fixed-size array: + if (DEBUG) logger.debug("FINAL phasing window of " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes)); + hetGenotypes = new Genotype[listHetGenotypes.size()]; + int index = 0; + for (GenotypeAndReadBases copyGrb : listHetGenotypes) + hetGenotypes[index++] = copyGrb.genotype; + } + + // Build the read sub-sequences at the het genomic positions: + private void 
buildReadsAtHetSites(List listHetGenotypes, String sample, GenomeLoc phasingLoc) { + buildReadsAtHetSites(listHetGenotypes, sample, phasingLoc, null); + } + + private void buildReadsAtHetSites(List listHetGenotypes, Set onlyKeepReads) { + buildReadsAtHetSites(listHetGenotypes, null, null, onlyKeepReads); + } + + private void buildReadsAtHetSites(List listHetGenotypes, String sample, GenomeLoc phasingLoc, Set onlyKeepReads) { + readsAtHetSites = new HashMap(); + + int index = 0; + for (GenotypeAndReadBases grb : listHetGenotypes) { + ReadBasesAtPosition readBases = grb.readBases; + if (readBases != null) { + for (ReadBase rb : readBases) { + String readName = rb.readName; + if (onlyKeepReads != null && !onlyKeepReads.contains(readName)) // if onlyKeepReads exists, ignore reads not in onlyKeepReads + continue; + + PhasingRead rd = readsAtHetSites.get(readName); + if (rd == null) { + rd = new PhasingRead(listHetGenotypes.size(), rb.mappingQual); + readsAtHetSites.put(readName, rd); + } + else if (outputMultipleBaseCountsWriter != null && rd.getBase(index) != null // rd already has a base at index + && sample != null && phasingLoc != null) { + outputMultipleBaseCountsWriter.setMultipleBases(new SampleReadLocus(sample, readName, grb.loc), phasingLoc, rd.getBase(index), rb.base); + } + + // Arbitrarily updates to the last base observed for this sample and read (rb.base): + rd.updateBaseAndQuality(index, rb.base, rb.baseQual); + } + } + index++; + } + if (DEBUG) logger.debug("Number of sites in window = " + index); + + if (DEBUG && logger.isDebugEnabled()) { + logger.debug("ALL READS [phasingSiteIndex = " + phasingSiteIndex + "]:"); + for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { + String rdName = nameToReads.getKey(); + PhasingRead rd = nameToReads.getValue(); + logger.debug(rd + "\t" + rdName); + } + } + } + + // Object to represent a pair of genomic sites, and all reads overlapping those 2 sites (though possibly others) + private class EdgeToReads { + 
private TreeMap> edgeReads; + + public EdgeToReads() { + this.edgeReads = new TreeMap>(); // implemented GraphEdge.compareTo() + } + + public void addRead(PhasingGraphEdge e, String readName) { + List reads = edgeReads.get(e); + if (reads == null) { + reads = new LinkedList(); + edgeReads.put(e, reads); + } + reads.add(readName); + } + + public List getReads(PhasingGraphEdge e) { + return edgeReads.get(e); + } + } + + private class IntegerSet implements Iterable { + private Set list; + + public IntegerSet(Set list) { + this.list = list; + } + + public boolean contains(int i) { + return list.contains(i); + } + + public Iterator iterator() { + return list.iterator(); + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + for (int i : this) { + sb.append(i + ", "); + } + return sb.toString(); + } + } + + // Remove any reads that add no "connections" (PhasingGraphEdge) between pairs of het sites: + public Set removeExtraneousReads(int numHetSites) { + PhasingGraph readGraph = new PhasingGraph(numHetSites); + EdgeToReads edgeToReads = new EdgeToReads(); + Set sitesWithEdges = new TreeSet(); + + for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { + String rdName = nameToReads.getKey(); + PhasingRead rd = nameToReads.getValue(); + + int[] siteInds = rd.getNonNullIndices(); + // Connect each pair of non-null sites in rd: + for (int i = 0; i < siteInds.length; i++) { + for (int j = i + 1; j < siteInds.length; j++) { + PhasingGraphEdge e = new PhasingGraphEdge(siteInds[i], siteInds[j]); + if (DEBUG) logger.debug("Read = " + rdName + " is adding edge: " + e); + readGraph.addEdge(e); + + edgeToReads.addRead(e, rdName); + + sitesWithEdges.add(e.getV1()); + sitesWithEdges.add(e.getV2()); + } + } + } + if (DEBUG) logger.debug("Read graph:\n" + readGraph); + Set keepReads = new HashSet(); + + /* Check which Reads are involved in acyclic paths from phaseRelativeToIndex to (phasingSiteIndex): + + In detail: + Every Read links EACH pair of sites for 
which it contains bases. Then, each such edge is added to a "site connectivity graph". + A read provides non-trivial bias toward the final haplotype decision if it participates in a path from prev ---> cur. This is tested by + considering each edge that the read contributes. For edge e=(v1,v2), if there exists a path from prev ---> v1 [that doesn't include v2] and + cur ---> v2 [that doesn't include v1], then there is a path from prev ---> cur that uses e, hence making the read significant. + By excluding each vertex's edges and then calculating connected components, we are able to make the determination, for example, + if a path exists from prev ---> v1 that excludes v2. + + Furthermore, if the path DOES use other edges that exist solely due to the read, then that's fine, since adding in the read will give those edges as well. + And, if the path uses edges from other reads, then keeping all other reads that contribute those edges + [which will happen since those edges are also in paths from prev ---> cur] is sufficient for this path to exist. + + NOTE: + If we would use NON-UNIFORM priors for the various haplotypes consistent with a margnialized haplotype, then this calculation would not be correct, since the equivalence of: + 1. The read affects the final marginal haplotype posterior probability (for general mapping and base quality values). + 2. The read has edges involved in a path from prev ---> cur. + DEPENDS STRONGLY on the fact that all haplotypes have the same EXACT prior. + + This is due to the following: + [We denote: + R = set of all reads + r = a single read + "AA + CC" = AA on top chromosome, CC on bottom chromosome] + + Note that since there are only two haplotype possibilities: + P(AA + CC | R) + P(AC + CA | R) = 1 + + Now, if we assume that all haplotypes consistent with AA + CC have the same prior probability [P(AA + CC | R)], then: + P(AA + CC | R) + = P(AAAA + CCCC | R) + ... + P(AACC + CCAA | R) + = [P(AAAA + CCCC , R) + ... 
+ P(AACC + CCAA , R)] / P(R) + \propto P(AAAA + CCCC , R) + ... + P(AACC + CCAA , R) + = P(R | AAAA + CCCC)*P(AAAA + CCCC) + ... + P(R | AACC + CCAA)*P(AACC + CCAA) + = P(AA + CC | R) * [P(R | AAAA + CCCC) + ... + P(R | AACC + CCAA)] + + Since we assume independence between reads given a particular haplotype [P(R | AAAA + CCCC) = \prod_r P(r | AAAA + CCCC)], + a new read r affects P(AA + CC | R) by multiplying each of the terms in the sum by, e.g., P(r | AAAA + CCCC). + Therefore, if these values do not affect the ratio of: + (I) [P(R | AAAA + CCCC) + ... + P(R | AACC + CCAA)] / [P(R | ACAA + CACC) + ... + P(R | ACCC + CAAA)] + then they do not affect the value of: + (II) P(AA + CC | R) / P(AC + CA | R) [which uniquely defines their values, since they sum to 1] + + And, the P(r | AAAA + CCCC), ..., P(r | ACCC + CAAA) do not affect ratio (I) iff r's edges do not take part in a path from prev to cur in combination with the other reads in R. + */ + int prev = phaseRelativeToIndex; + int cur = phasingSiteIndex; + + if (!readGraph.getConnectedComponents().inSameSet(prev, cur)) { // There is NO path between cur and prev + if (DEBUG) + logger.debug("NO READ PATH between PHASE site [" + cur + "] and UPSTREAM site [" + prev + "]"); + readsAtHetSites.clear(); + return keepReads; + } + + /* Check the connected components of prev and cur when removing each individual vertex's edges: + [Total run-time: for each vertex, calculate connected components after removing it's edges: O(V * E)] + */ + IntegerSet[] removedSiteSameCCAsPrev = new IntegerSet[numHetSites]; + IntegerSet[] removedSiteSameCCAsCur = new IntegerSet[numHetSites]; + for (int i : sitesWithEdges) { + if (DEBUG) logger.debug("Calculating CC after removing edges of site: " + i); + + // Remove all edges incident to i and see which positions have paths to prev and cur: + Collection removedEdges = readGraph.removeAllIncidentEdges(i); + + // Run-time for efficiently calculating connected components using DisjointSet: O(E) 
+ DisjointSet ccAfterRemove = readGraph.getConnectedComponents(); + removedSiteSameCCAsPrev[i] = new IntegerSet(ccAfterRemove.inSameSetAs(prev, sitesWithEdges)); + removedSiteSameCCAsCur[i] = new IntegerSet(ccAfterRemove.inSameSetAs(cur, sitesWithEdges)); + + if (DEBUG) logger.debug("Same CC as previous [" + prev + "]: " + removedSiteSameCCAsPrev[i]); + if (DEBUG) logger.debug("Same CC as current [" + cur + "]: " + removedSiteSameCCAsCur[i]); + + // Add the removed edges back in: + readGraph.addEdges(removedEdges); + } + + for (PhasingGraphEdge e : readGraph) { + if (DEBUG) logger.debug("Testing the path-connectivity of Edge: " + e); + + /* Edge e={v1,v2} contributes a path between prev and cur for testRead iff: + testRead[v1] != null, testRead[v2] != null, and there is a path from prev ---> v1 -> v2 ---> cur [or vice versa]. + Note that the path from prev ---> v1 will NOT contain v2, since we removed all of v2's edges, + and the path from v2 ---> cur will NOT contain v1. + */ + boolean prevTo2and1ToCur = removedSiteSameCCAsPrev[e.getV1()].contains(e.getV2()) && removedSiteSameCCAsCur[e.getV2()].contains(e.getV1()); + boolean prevTo1and2ToCur = removedSiteSameCCAsPrev[e.getV2()].contains(e.getV1()) && removedSiteSameCCAsCur[e.getV1()].contains(e.getV2()); + + if (prevTo2and1ToCur || prevTo1and2ToCur) { + for (String readName : edgeToReads.getReads(e)) { + keepReads.add(readName); + + if (DEBUG && logger.isDebugEnabled()) { + if (prevTo2and1ToCur) + logger.debug("Keep read " + readName + " due to path: " + prev + " ---> " + e.getV2() + " -> " + e.getV1() + " ---> " + cur); + else + logger.debug("Keep read " + readName + " due to path: " + prev + " ---> " + e.getV1() + " -> " + e.getV2() + " ---> " + cur); + } + } + } + } + + // Retain only the reads that contain an edge in a path connecting prev and cur: + Iterator> readIt = readsAtHetSites.entrySet().iterator(); + while (readIt.hasNext()) { + Map.Entry nameToReads = readIt.next(); + String rdName = 
nameToReads.getKey(); + if (!keepReads.contains(rdName)) { + readIt.remove(); + if (DEBUG) logger.debug("Removing extraneous read: " + rdName); + } + } + + return keepReads; + } + + // Remove all het sites that have no reads (which may occur if all of the reads supporting the original call don't contain an additional het site and were thus removed above): + private List removeExtraneousSites(List listHetGenotypes) { + Set sitesWithReads = new HashSet(); + for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { + PhasingRead rd = nameToReads.getValue(); + for (int i : rd.getNonNullIndices()) + sitesWithReads.add(i); + } + + // Remove all sites that have no read bases: + List keepHetSites = new LinkedList(); + int index = 0; + int numPrecedingPhaseRelativeToSiteRemoved = 0; + int numPrecedingPhasingSiteRemoved = 0; + for (GenotypeAndReadBases grb : listHetGenotypes) { + boolean keepSite = sitesWithReads.contains(index); + if (DEBUG && logger.isDebugEnabled() && !keepSite) + logger.debug("Removing read-less site " + grb.loc); + + if (keepSite || index == phasingSiteIndex || index == phaseRelativeToIndex) { + keepHetSites.add(grb); + if (!keepSite) + if (DEBUG) + logger.debug("Although current or previous sites have no relevant reads, continuing empty attempt to phase them [for sake of program flow]..."); + } + else { + if (index <= phaseRelativeToIndex) + numPrecedingPhaseRelativeToSiteRemoved++; + if (index <= phasingSiteIndex) + numPrecedingPhasingSiteRemoved++; + } + + index++; + } + + phaseRelativeToIndex -= numPrecedingPhaseRelativeToSiteRemoved; + phasingSiteIndex -= numPrecedingPhasingSiteRemoved; + return keepHetSites; + } + + /* Auxilary object to sort candidate het sites with which to phase the index site, + where sorting is performed based on distance to the index site + (since presumably closer sites will have greater numbers of overlapping reads) + */ + private class SortSitesBySumOfDist implements Comparator { + private Vector grb; + + public 
SortSitesBySumOfDist(List listHetGenotypes) { + grb = new Vector(listHetGenotypes); + } + + public int compare(Integer i1, Integer i2) { + int d1 = calcGenomicDist(i1); + int d2 = calcGenomicDist(i2); + + if (d1 != d2) + return d1 - d2; + + int id1 = calcIndexDist(i1); + int id2 = calcIndexDist(i2); + if (id1 != id2) + return id1 - id2; + + return i1 - i2; + } + + private int calcGenomicDist(int i) { + int d1 = grb.get(i).loc.distance(grb.get(phaseRelativeToIndex).loc); + int d2 = grb.get(i).loc.distance(grb.get(phasingSiteIndex).loc); + + return d1 + d2; + } + + private int calcIndexDist(int i) { + int d1 = Math.abs(i - phaseRelativeToIndex); + int d2 = Math.abs(i - phasingSiteIndex); + + return d1 + d2; + } + } + + // Create a "phasing window" of het sites to use for phasing the index site, but limiting to only maxPhaseSites het sites to incorporate [as specified by the user] + private List trimWindow(List listHetGenotypes, String sample, GenomeLoc phaseLocus) { + if (DEBUG) + logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes)); + + Set scoreAllIndices = new TreeSet(new SortSitesBySumOfDist(listHetGenotypes)); + for (int i = 0; i < listHetGenotypes.size(); ++i) { + if (i != phaseRelativeToIndex && i != phasingSiteIndex) + scoreAllIndices.add(i); + } + + Set keepIndices = new TreeSet(); + // always keep these two indices: + keepIndices.add(phaseRelativeToIndex); + keepIndices.add(phasingSiteIndex); + for (int addInd : scoreAllIndices) { + if (keepIndices.size() >= maxPhaseSites) + break; + else // keepIndices.size() < maxPhaseSites + keepIndices.add(addInd); + } + + List newListHetGenotypes = new LinkedList(); + int newPhaseRelativeToIndex = -1; + int newPhasingSiteIndex = -1; + int oldIndex = 0; + int newIndex = 0; + for (GenotypeAndReadBases grb : listHetGenotypes) { + if 
(keepIndices.contains(oldIndex)) { + newListHetGenotypes.add(grb); + + if (oldIndex == phaseRelativeToIndex) + newPhaseRelativeToIndex = newIndex; + if (oldIndex == phasingSiteIndex) + newPhasingSiteIndex = newIndex; + + ++newIndex; + } + ++oldIndex; + } + + phaseRelativeToIndex = newPhaseRelativeToIndex; + phasingSiteIndex = newPhasingSiteIndex; + listHetGenotypes = newListHetGenotypes; + if (DEBUG) + logger.warn("NAIVELY REDUCED to " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes)); + + return listHetGenotypes; + } + } + + // Phase a particular sample's het genotype using a constructed PhasingWindow: + private PhaseResult phaseSampleAtSite(PhasingWindow phaseWindow) { + /* Will map a phase and its "complement" to a single representative phase, + and marginalizeAsNewTable() marginalizes to 2 positions [starting at the previous position, and then the current position]: + */ + int[] marginalizeInds = {phaseWindow.phaseRelativeToIndex, phaseWindow.phasingSiteIndex}; + HaplotypeTableCreator tabCreator = new TableCreatorOfHaplotypeAndComplementForDiploidAlleles(phaseWindow.hetGenotypes, marginalizeInds); + PhasingTable sampleHaps = tabCreator.getNewTable(); + + if (DEBUG && logger.isDebugEnabled()) { + logger.debug("Number of USED reads [connecting the two positions to be phased] at sites: " + phaseWindow.readsAtHetSites.size()); + logger.debug("USED READS:"); + for (Map.Entry nameToReads : phaseWindow.readsAtHetSites.entrySet()) { + String rdName = nameToReads.getKey(); + PhasingRead rd = nameToReads.getValue(); + logger.debug(rd + "\t" + rdName); + } + } + + // Update the phasing table based on each of the sub-reads for this sample: + MaxHaplotypeAndQuality prevMaxHapAndQual = null; + + int numHighQualityIterations = 0; + int numInconsistentIterations = 0; + + double totalAbsPQchange = 0; + int numPQchangesObserved = 0; + + for (Map.Entry nameToReads : phaseWindow.readsAtHetSites.entrySet()) { + PhasingRead rd = nameToReads.getValue(); + if 
(DEBUG) logger.debug("\nrd = " + rd + "\tname = " + nameToReads.getKey()); + + for (PhasingTable.PhasingTableEntry pte : sampleHaps) { + PhasingScore score = rd.matchHaplotypeClassScore(pte.getHaplotypeClass()); + pte.getScore().integrateReadScore(score); + if (DEBUG) logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score); + } + + // Check the current best haplotype assignment and compare it to the previous one: + MaxHaplotypeAndQuality curMaxHapAndQual = new MaxHaplotypeAndQuality(sampleHaps, false); + if (DEBUG) + logger.debug("CUR MAX hap:\t" + curMaxHapAndQual.maxEntry.getHaplotypeClass() + "\tcurPhaseQuality:\t" + curMaxHapAndQual.phaseQuality); + if (prevMaxHapAndQual != null) { + double changeInPQ = prevMaxHapAndQual.phaseQuality - curMaxHapAndQual.phaseQuality; + + if (passesPhasingThreshold(prevMaxHapAndQual.phaseQuality)) { + numHighQualityIterations++; + if (!curMaxHapAndQual.hasSameRepresentativeHaplotype(prevMaxHapAndQual) || // switched phase + (numPQchangesObserved > 0 && changeInPQ > FRACTION_OF_MEAN_PQ_CHANGES * (totalAbsPQchange / numPQchangesObserved))) { // a "significant" decrease in PQ + if (DEBUG) logger.debug("Inconsistent read found!"); + numInconsistentIterations++; + } + } + + totalAbsPQchange += Math.abs(changeInPQ); + numPQchangesObserved++; + } + prevMaxHapAndQual = curMaxHapAndQual; + } + + if (DEBUG) logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n"); + MaxHaplotypeAndQuality maxHapQual = new MaxHaplotypeAndQuality(sampleHaps, DEBUG); + double posteriorProb = maxHapQual.maxEntry.getScore().getValue(); + + if (DEBUG) + logger.debug("MAX hap:\t" + maxHapQual.maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + maxHapQual.phaseQuality); + if (DEBUG) + logger.debug("Number of used reads " + phaseWindow.readsAtHetSites.size() + "; number of high PQ iterations " + numHighQualityIterations + "; number of inconsistencies " + numInconsistentIterations); + + 
boolean phasingContainsInconsistencies = false; + if (numInconsistentIterations / (double) numHighQualityIterations > MAX_FRACTION_OF_INCONSISTENT_READS) + phasingContainsInconsistencies = true; + + return new PhaseResult(maxHapQual.getRepresentative(), maxHapQual.phaseQuality, phasingContainsInconsistencies); + } + + // Object represents the maximum-scoring haplotype and its corresponding quality score + private static class MaxHaplotypeAndQuality { + public PhasingTable.PhasingTableEntry maxEntry; + public double phaseQuality; + + public MaxHaplotypeAndQuality(PhasingTable hapTable, boolean printDebug) { + // Marginalize each haplotype to its first 2 positions: + hapTable = HaplotypeTableCreator.marginalizeAsNewTable(hapTable); + if (printDebug) + logger.debug("\nPhasing table [AFTER MAPPING]:\n" + hapTable + "\n"); + + calculateMaxHapAndPhasingQuality(hapTable, printDebug); + } + + // Calculates maxEntry and its PQ (within table hapTable): + private void calculateMaxHapAndPhasingQuality(PhasingTable hapTable, boolean printDebug) { + hapTable.normalizeScores(); + if (printDebug) + logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + hapTable + "\n"); + + // Determine the phase at this position: + this.maxEntry = hapTable.maxEntry(); + + // convert posteriorProb to PHRED scale, but do NOT cap the quality as in QualityUtils.trueProbToQual(posteriorProb): + PreciseNonNegativeDouble sumErrorProbs = new PreciseNonNegativeDouble(ZERO); + for (PhasingTable.PhasingTableEntry pte : hapTable) { + if (pte != maxEntry) + sumErrorProbs.plusEqual(pte.getScore()); + } + this.phaseQuality = -10.0 * (sumErrorProbs.getLog10Value()); + } + + // Comparator that compares if 2 haplotypes map back to the same "representative" haplotype (accounts for reverse complementarity) + public boolean hasSameRepresentativeHaplotype(MaxHaplotypeAndQuality that) { + return this.getRepresentative().equals(that.getRepresentative()); + } + + private Haplotype getRepresentative() { + return 
maxEntry.getHaplotypeClass().getRepresentative(); + } + } + + /* + Ensure that curAllelePair is phased relative to prevAllelePair as specified by hap. + */ + + public static String[] ensurePhasing(SNPallelePair curAllelePair, SNPallelePair prevAllelePair, String[] prevPairNames, Haplotype hap) { + if (hap.size() < 2) + throw new ReviewedStingException("LOGICAL ERROR: Only considering haplotypes of length > 2!"); + + String[] curPairNames = prevPairNames; + + byte prevBase = hap.getBase(0); // The 1st base in the haplotype + byte curBase = hap.getBase(1); // The 2nd base in the haplotype + + boolean chosePrevTopChrom = prevAllelePair.matchesTopBase(prevBase); + boolean choseCurTopChrom = curAllelePair.matchesTopBase(curBase); + if (chosePrevTopChrom != choseCurTopChrom) { + //curAllelePair.swapAlleles(); + + /* Instead of swapping the alleles (as we used to above), + we swap the haplotype names to fit the unswapped alleles as they are ordered in the Genotype: + */ + curPairNames = new String[]{prevPairNames[1], prevPairNames[0]}; + } + + return curPairNames; + } + + private boolean startDistancesAreInWindowRange(VariantContext vc1, VariantContext vc2) { + return startDistancesAreInWindowRange(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc1), GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc2)); + } + + private boolean startDistancesAreInWindowRange(GenomeLoc loc1, GenomeLoc loc2) { + return loc1.distance(loc2) <= cacheWindow; // distance() checks: loc1.onSameContig(loc2) + } + + private int startDistance(GenomeLoc gl1, VariantContext vc2) { + return gl1.distance(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc2)); + } + + public PhasingStats reduce(PhasingStatsAndOutput statsAndList, PhasingStats stats) { + if (statsAndList != null) { + writeVcList(statsAndList.output); + stats.addIn(statsAndList.ps); + } + return stats; + } + + /** + * Phase anything left in the cached unphasedSiteQueue, 
and report the number of reads and VariantContexts processed. + * + * @param result the number of reads and VariantContexts seen. + */ + public void onTraversalDone(PhasingStats result) { + List finalList = processQueue(result, true); // process all remaining data + writeVcList(finalList); + writer.close(); + + if (statsWriter != null) + statsWriter.close(); + + if (outputMultipleBaseCountsWriter != null) + outputMultipleBaseCountsWriter.close(); + + System.out.println("Coverage over ALL samples:"); + System.out.println("Number of reads observed: " + result.getNumReads()); + System.out.println("Number of variant sites observed: " + result.getNumVarSites()); + System.out.println("Average coverage: " + ((double) result.getNumReads() / result.getNumVarSites())); + + System.out.println("\n--- Phasing summary [minimal haplotype quality (PQ): " + phaseQualityThresh + ", maxPhaseSites: " + maxPhaseSites + ", cacheWindow: " + cacheWindow + "] ---"); + for (Map.Entry sampPhaseCountEntry : result.getPhaseCounts()) { + PhaseCounts pc = sampPhaseCountEntry.getValue(); + System.out.print("Sample: " + sampPhaseCountEntry.getKey() + "\tSites tested: " + pc.numTestedSites + "\tSites phased: " + pc.numPhased); + System.out.println("\tPhase-inconsistent sites: " + (pc.numInconsistentSitesPhased + pc.numInconsistentSitesNotPhased) + " [phased: " + pc.numInconsistentSitesPhased + ", unphased:" + pc.numInconsistentSitesNotPhased + "]"); + } + System.out.println(""); + } + + private void writeVcList(List varContList) { + for (VariantContext vc : varContList) + writeVCF(vc); + } + + private void writeVCF(VariantContext vc) { + if (samplesToPhase == null || vc.isNotFiltered()) + //if ( samplesToPhase == null || (vc.isVariant() && vc.isNotFiltered())) // if we are only operating on specific samples, don't write out all sites, just those where the VC is variant + writer.add(vc); + } + + public static boolean processVariantInPhasing(VariantContext vc) { + return vc.isNotFiltered() && 
((vc.isSNP() && vc.isBiallelic()) || !vc.isVariant()); // we can handle the non-variant case as well + //return isUnfilteredBiallelicSNP(vc); + } + + + /* + Inner classes: + */ + + // A variant and the reads for each sample at that site: + private class VariantAndReads { + public VariantContext variant; + public HashMap sampleReadBases; + + public VariantAndReads(VariantContext variant, HashMap sampleReadBases) { + this.variant = variant; + this.sampleReadBases = sampleReadBases; + } + + public VariantAndReads(VariantContext variant, AlignmentContext alignment) { + this.variant = variant; + this.sampleReadBases = new HashMap(); + + if (alignment != null) { + ReadBackedPileup pileup = alignment.getBasePileup(); + if (pileup != null) { + // filter the read-base pileup based on min base and mapping qualities: + pileup = pileup.getBaseAndMappingFilteredPileup(MIN_BASE_QUALITY_SCORE, MIN_MAPPING_QUALITY_SCORE); + if (pileup != null) { + for (final String sample : pileup.getSamples()) { + ReadBackedPileup samplePileup = pileup.getPileupForSample(sample); + ReadBasesAtPosition readBases = new ReadBasesAtPosition(); + for (PileupElement p : samplePileup) { + if (!p.isDeletion()) // IGNORE deletions for now + readBases.putReadBase(p); + } + sampleReadBases.put(sample, readBases); + } + } + } + } + } + } + + // Object to represent a variant that has yet to be phased, along with its underlying base pileups: + private class UnfinishedVariantAndReads { + public UnfinishedVariantContext unfinishedVariant; + public HashMap sampleReadBases; + + public UnfinishedVariantAndReads(VariantAndReads vr) { + this.unfinishedVariant = new UnfinishedVariantContext(vr.variant); + this.sampleReadBases = vr.sampleReadBases; + } + } + + // COULD replace with MutableVariantContext if it worked [didn't throw exceptions when trying to call its set() methods]... 
+ + private class UnfinishedVariantContext implements HasGenomeLocation { + private String name; + private String contig; + private int start; + private int stop; + private Collection alleles; + private Map genotypes; + private double log10PError; + private Set filters; + private Map attributes; + private String id; + + public UnfinishedVariantContext(VariantContext vc) { + this.name = vc.getSource(); + this.id = vc.getID(); + this.contig = vc.getChr(); + this.start = vc.getStart(); + this.stop = vc.getEnd(); + this.alleles = vc.getAlleles(); + + this.genotypes = new HashMap(); + for ( final Genotype g : vc.getGenotypes() ) { + this.genotypes.put(g.getSampleName(), g); + } + + this.log10PError = vc.getLog10PError(); + this.filters = vc.filtersWereApplied() ? vc.getFilters() : null; + this.attributes = new HashMap(vc.getAttributes()); + } + + public VariantContext toVariantContext() { + GenotypesContext gc = GenotypesContext.copy(this.genotypes.values()); + return new VariantContextBuilder(name, contig, start, stop, alleles).id(id) + .genotypes(gc).log10PError(log10PError).filters(filters).attributes(attributes).make(); + } + + public GenomeLoc getLocation() { + return getToolkit().getGenomeLocParser().createGenomeLoc(contig, start, stop); + } + + public Genotype getGenotype(String sample) { + return genotypes.get(sample); + } + + public void setGenotype(String sample, Genotype newGt) { + this.genotypes.put(sample, newGt); + } + + public void setPhasingInconsistent() { + attributes.put(PHASING_INCONSISTENT_KEY, true); + } + } + + private static String toStringGRL(List grbList) { + boolean first = true; + StringBuilder sb = new StringBuilder(); + for (GenotypeAndReadBases grb : grbList) { + if (first) + first = false; + else + sb.append(" -- "); + + sb.append(grb.loc); + } + return sb.toString(); + } + + private String toStringVCL(List vcList) { + boolean first = true; + StringBuilder sb = new StringBuilder(); + for (VariantContext vc : vcList) { + if (first) + first 
= false; + else + sb.append(" -- "); + + sb.append(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); + } + return sb.toString(); + } + +// +// THIS IMPLEMENTATION WILL FAIL WHEN NOT DEALING WITH SNP Alleles (e.g., MNP or INDEL), SINCE THEN THE Allele.getBases() +// FUNCTION WILL RETURN VARIABLE-LENGTH Byte ARRAYS. IN THAT CASE, BaseArray/Haplotype/Read WILL NEED TO BE REPLACED WITH +// AN ArrayList OF Allele [OR SIMILAR OBJECT], and WON'T USE: getSingleBase(alleleI) +// + + /* Creates table of all 2^n local haplotypes, + where n is the number of heterozygous SNPs in the local region we expected to find phase-informative reads + */ + private static abstract class HaplotypeTableCreator { + protected Genotype[] genotypes; + + public HaplotypeTableCreator(Genotype[] hetGenotypes) { + this.genotypes = hetGenotypes; + } + + abstract public PhasingTable getNewTable(); + + protected List getAllHaplotypes() { + int numSites = genotypes.length; + int[] genotypeCards = new int[numSites]; + for (int i = 0; i < numSites; i++) + genotypeCards[i] = genotypes[i].getPloidy(); + + LinkedList allHaps = new LinkedList(); + CardinalityCounter alleleCounter = new CardinalityCounter(genotypeCards); + for (int[] alleleInds : alleleCounter) { + byte[] hapBases = new byte[numSites]; + for (int i = 0; i < numSites; i++) { + Allele alleleI = genotypes[i].getAllele(alleleInds[i]); + hapBases[i] = SNPallelePair.getSingleBase(alleleI); + } + allHaps.add(new Haplotype(hapBases)); + } + return allHaps; + } + + /* For phasing site X relative to site X-1, we sum the probabilities over all haplotypes of the phases of [X-1, X]. + That is, we aggregate probability mass over all haplotypes consistent with a particular phase at the [X-1, X] pair. 
+ */ + public static PhasingTable marginalizeAsNewTable(PhasingTable table) { + TreeMap hapMap = new TreeMap(); + for (PhasingTable.PhasingTableEntry pte : table) { + Haplotype rep = pte.getHaplotypeClass().getRepresentative(); + PreciseNonNegativeDouble score = hapMap.get(rep); + if (score == null) { + score = new PreciseNonNegativeDouble(ZERO); + hapMap.put(rep, score); + } + score.plusEqual(pte.getScore()); + } + + PhasingTable margTable = new PhasingTable(); + for (Map.Entry hapClassAndScore : hapMap.entrySet()) { + Haplotype rep = hapClassAndScore.getKey(); + ArrayList hapList = new ArrayList(); + hapList.add(rep); + + HaplotypeClass hc = new HaplotypeClass(hapList, rep); + margTable.addEntry(hc, hapClassAndScore.getValue()); + } + return margTable; + } + } + + // Implementation for diploid alleles (thus assuming 2^n haplotypes): + private static class TableCreatorOfHaplotypeAndComplementForDiploidAlleles extends HaplotypeTableCreator { + private SNPallelePair[] SNPallelePairs; + Set marginalizeInds; + + public TableCreatorOfHaplotypeAndComplementForDiploidAlleles(Genotype[] hetGenotypes, int[] marginalizeInds) { + super(hetGenotypes); + + this.SNPallelePairs = new SNPallelePair[genotypes.length]; + for (int i = 0; i < genotypes.length; i++) + SNPallelePairs[i] = new SNPallelePair(genotypes[i]); + + this.marginalizeInds = new TreeSet(); + for (int mind : marginalizeInds) + this.marginalizeInds.add(mind); + } + + public PhasingTable getNewTable() { + int startIndex = marginalizeInds.iterator().next(); + + PhasingTable table = new PhasingTable(); + for (Haplotype hap : getAllHaplotypes()) { + if (SNPallelePairs[startIndex].matchesTopBase(hap.getBase(startIndex))) { + /* hap is the "representative" haplotype [DEFINED here to be + the one with the top base at the startIndex position. + NOTE that it is CRITICAL that this definition be consistent with the representative sub-haplotypes defined below!] 
+ */ + ArrayList hapList = new ArrayList(); + hapList.add(hap); + hapList.add(complement(hap)); + + Haplotype rep = hap.subHaplotype(marginalizeInds); + double hapClassPrior = getHaplotypeRepresentativePrior(rep); // Note that prior is ONLY a function of the representative haplotype + + HaplotypeClass hapClass = new HaplotypeClass(hapList, rep); + table.addEntry(hapClass, hapClassPrior); + } + } + return table; + } + + // Can change later to weight the representative Haplotypes differently: + + private double getHaplotypeRepresentativePrior(Haplotype rep) { + return 1.0; + } + + /* Since assuming biallelic genotypes, we use this to map a haplotype to the corresponding haplotype, + where the other allele is chosen at each site + */ + private Haplotype complement(Haplotype hap) { + int numSites = SNPallelePairs.length; + if (hap.size() != numSites) + throw new ReviewedStingException("INTERNAL ERROR: hap.size() != numSites"); + + // Take the other base at EACH position of the Haplotype: + byte[] complementBases = new byte[numSites]; + for (int i = 0; i < numSites; i++) + complementBases[i] = SNPallelePairs[i].getOtherBase(hap.getBase(i)); + + return new Haplotype(complementBases); + } + } + + // Table to represent the list of all haplotypes and their scores: + private static class PhasingTable implements Iterable { + private LinkedList table; + + public PhasingTable() { + this.table = new LinkedList(); + } + + public PhasingTableEntry addEntry(HaplotypeClass haplotypeClass, PreciseNonNegativeDouble initialScore) { + PhasingTableEntry pte = new PhasingTableEntry(haplotypeClass, new PhasingScore(initialScore)); + table.add(pte); + return pte; + } + + public PhasingTableEntry addEntry(HaplotypeClass haplotypeClass, double initialScore) { + return addEntry(haplotypeClass, new PreciseNonNegativeDouble(initialScore)); + } + + public Iterator iterator() { + return table.iterator(); + } + + public boolean isEmpty() { + return table.isEmpty(); + } + + public PhasingTableEntry 
maxEntry() { + if (table.isEmpty()) + return null; + + PhasingTableEntry maxPte = null; + for (PhasingTableEntry pte : table) { + if (maxPte == null || pte.getScore().gt(maxPte.getScore())) { + maxPte = pte; + } + } + return maxPte; + } + + // Normalize all the scores of the phasing table by their sum total: + public void normalizeScores() { + PreciseNonNegativeDouble normalizeBy = new PreciseNonNegativeDouble(ZERO); + for (PhasingTableEntry pte : table) + normalizeBy.plusEqual(pte.getScore()); + + if (!normalizeBy.equals(ZERO)) { // prevent precision problems + for (PhasingTableEntry pte : table) + pte.getScore().divEqual(normalizeBy); + } + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("-------------------\n"); + for (PhasingTableEntry pte : this) { + sb.append("Haplotypes:\t" + pte.getHaplotypeClass() + "\tScore:\t" + pte.getScore() + "\n"); + } + sb.append("-------------------\n"); + return sb.toString(); + } + + // An entry in the phasing table for a particular set of equivalent haplotypes (e.g., a haplotype and its "complement" -- see above) + public static class PhasingTableEntry implements Comparable { + private HaplotypeClass haplotypeClass; + private PhasingScore score; + + public PhasingTableEntry(HaplotypeClass haplotypeClass, PhasingScore score) { + this.haplotypeClass = haplotypeClass; + this.score = score; + } + + public HaplotypeClass getHaplotypeClass() { + return haplotypeClass; + } + + public PhasingScore getScore() { + return score; + } + + public int compareTo(PhasingTableEntry that) { + return this.getScore().compareTo(that.getScore()); + } + } + } + + private static class PhaseResult { + public Haplotype haplotype; + public double phaseQuality; + public boolean phasingContainsInconsistencies; + + public PhaseResult(Haplotype haplotype, double phaseQuality, boolean phasingContainsInconsistencies) { + this.haplotype = haplotype; + this.phaseQuality = phaseQuality; + this.phasingContainsInconsistencies = 
phasingContainsInconsistencies; + } + } + + public static boolean isUnfilteredBiallelicSNP(VariantContext vc) { + return (vc.isNotFiltered() && vc.isSNP() && vc.isBiallelic()); + } + + public static boolean isUnfilteredCalledDiploidGenotype(Genotype gt) { + return (! gt.isFiltered() && gt.isCalled() && gt.getPloidy() == 2); + } + + // Class to output verbose information on instances where a single read has multiple bases at the same position (e.g., from paired-end overlap with a base error): + private class MultipleBaseCountsWriter { + private BufferedWriter writer = null; + private TreeMap<SampleReadLocus, MultipleBaseCounts> multipleBaseCounts = null; + + public MultipleBaseCountsWriter(File outputMultipleBaseCountsFile) { + FileOutputStream output; + try { + output = new FileOutputStream(outputMultipleBaseCountsFile); + } catch (FileNotFoundException e) { + throw new RuntimeException("Unable to create multiple base count file at location: " + outputMultipleBaseCountsFile); + } + this.writer = new BufferedWriter(new OutputStreamWriter(output)); + + this.multipleBaseCounts = new TreeMap<SampleReadLocus, MultipleBaseCounts>(); // implemented SampleReadLocus.compareTo() + } + + public void setMultipleBases(SampleReadLocus srl, GenomeLoc phasingLoc, byte prevBase, byte newBase) { + MultipleBaseCounts mbc = multipleBaseCounts.get(srl); + if (mbc == null) { + mbc = new MultipleBaseCounts(phasingLoc); + mbc.incrementBaseCount(prevBase); // only now, do we know to note this + multipleBaseCounts.put(srl, mbc); + } + if (mbc.samePhasingLocAs(phasingLoc)) // otherwise, don't want to count these multiple base counts again + mbc.incrementBaseCount(newBase); + + } + + public void outputMultipleBaseCounts() { + GenomeLoc nextToPhaseLoc = null; + if (!unphasedSiteQueue.isEmpty()) + nextToPhaseLoc = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), unphasedSiteQueue.peek().variant); + + outputMultipleBaseCounts(nextToPhaseLoc); + } + + private void outputMultipleBaseCounts(GenomeLoc nextToPhaseLoc) { + try { + Iterator<Map.Entry<SampleReadLocus, MultipleBaseCounts>>
multBaseCountIt = multipleBaseCounts.entrySet().iterator(); + while (multBaseCountIt.hasNext()) { + Map.Entry sampleReadLocBaseCountsEntry = multBaseCountIt.next(); + SampleReadLocus srl = sampleReadLocBaseCountsEntry.getKey(); + if (nextToPhaseLoc == null || !startDistancesAreInWindowRange(srl.getLocus(), nextToPhaseLoc)) { + // Done with entry, so print it and remove it from map: + writer.write(srl + "\t" + sampleReadLocBaseCountsEntry.getValue() + "\n"); + multBaseCountIt.remove(); + } + } + writer.flush(); + } catch (IOException e) { + throw new RuntimeException("Unable to write to outputMultipleBaseCountsFile", e); + } + } + + public void close() { + outputMultipleBaseCounts(null); + + try { + writer.flush(); + writer.close(); + } catch (IOException e) { + throw new RuntimeException("Unable to close outputMultipleBaseCountsFile"); + } + } + } +} + + +class PhasingScore extends PreciseNonNegativeDouble { + public PhasingScore(double score) { + super(score); + } + + public PhasingScore(PreciseNonNegativeDouble val) { + super(val); + } + + public PhasingScore integrateReadScore(PhasingScore score) { + timesEqual(score); + return this; + } +} + +class HaplotypeClass implements Iterable { + private ArrayList haps; + private Haplotype rep; + + public HaplotypeClass(ArrayList haps, Haplotype rep) { + this.haps = haps; + this.rep = rep; + } + + public Iterator iterator() { + return haps.iterator(); + } + + public Haplotype getRepresentative() { + return rep; + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + boolean isFirst = true; + for (Haplotype h : haps) { + if (isFirst) + isFirst = false; + else + sb.append(" + "); + + sb.append(h); + } + sb.append(" [").append(rep).append("]"); + return sb.toString(); + } +} + +// Summary statistics about phasing rates, for each sample +class PhasingStats { + private int numReads; + private int numVarSites; + + // Map of: sample -> PhaseCounts: + private Map samplePhaseStats; + + public PhasingStats() 
{ + this(new TreeMap()); + } + + public PhasingStats(int numReads, int numVarSites) { + this.numReads = numReads; + this.numVarSites = numVarSites; + this.samplePhaseStats = new TreeMap(); + } + + public PhasingStats(Map samplePhaseStats) { + this.numReads = 0; + this.numVarSites = 0; + this.samplePhaseStats = samplePhaseStats; + } + + public void addIn(PhasingStats other) { + this.numReads += other.numReads; + this.numVarSites += other.numVarSites; + + for (Map.Entry sampPhaseEntry : other.samplePhaseStats.entrySet()) { + String sample = sampPhaseEntry.getKey(); + PhaseCounts otherCounts = sampPhaseEntry.getValue(); + PhaseCounts thisCounts = this.samplePhaseStats.get(sample); + if (thisCounts == null) { + thisCounts = new PhaseCounts(); + this.samplePhaseStats.put(sample, thisCounts); + } + thisCounts.addIn(otherCounts); + } + } + + public int getNumReads() { + return numReads; + } + + public int getNumVarSites() { + return numVarSites; + } + + public Collection> getPhaseCounts() { + return samplePhaseStats.entrySet(); + } +} + +class PhaseCounts { + public int numTestedSites; // number of het sites directly succeeding het sites + public int numInconsistentSitesPhased; + public int numInconsistentSitesNotPhased; + public int numPhased; + + public PhaseCounts() { + this.numTestedSites = 0; + this.numInconsistentSitesPhased = 0; + this.numInconsistentSitesNotPhased = 0; + this.numPhased = 0; + } + + public void addIn(PhaseCounts other) { + this.numTestedSites += other.numTestedSites; + this.numInconsistentSitesPhased += other.numInconsistentSitesPhased; + this.numInconsistentSitesNotPhased += other.numInconsistentSitesNotPhased; + this.numPhased += other.numPhased; + } +} + +class PhasingStatsAndOutput { + public PhasingStats ps; + public List output; + + public PhasingStatsAndOutput(PhasingStats ps, List output) { + this.ps = ps; + this.output = output; + } +} + +class PhasingQualityStatsWriter { + private String variantStatsFilePrefix; + private HashMap 
sampleToStatsWriter = new HashMap(); + + public PhasingQualityStatsWriter(String variantStatsFilePrefix) { + this.variantStatsFilePrefix = variantStatsFilePrefix; + } + + public void addStat(String sample, GenomeLoc locus, int startDistanceFromPrevious, double phasingQuality, int numReads, int windowSize) { + BufferedWriter sampWriter = sampleToStatsWriter.get(sample); + if (sampWriter == null) { + String fileName = variantStatsFilePrefix + "." + sample + ".locus_distance_PQ_numReads_windowSize.txt"; + + FileOutputStream output; + try { + output = new FileOutputStream(fileName); + } catch (FileNotFoundException e) { + throw new RuntimeException("Unable to create phasing quality stats file at location: " + fileName); + } + sampWriter = new BufferedWriter(new OutputStreamWriter(output)); + sampleToStatsWriter.put(sample, sampWriter); + } + try { + sampWriter.write(locus + "\t" + startDistanceFromPrevious + "\t" + phasingQuality + "\t" + numReads + "\t" + windowSize + "\n"); + sampWriter.flush(); + } catch (IOException e) { + throw new RuntimeException("Unable to write to per-sample phasing quality stats file", e); + } + } + + public void close() { + for (Map.Entry sampWriterEntry : sampleToStatsWriter.entrySet()) { + BufferedWriter sampWriter = sampWriterEntry.getValue(); + try { + sampWriter.flush(); + sampWriter.close(); + } catch (IOException e) { + throw new RuntimeException("Unable to close per-sample phasing quality stats file"); + } + } + } +} + +class SampleReadLocus implements Comparable { + private String sample; + private String read; + private GenomeLoc locus; + + public SampleReadLocus(String sample, String read, GenomeLoc locus) { + this.sample = sample; + this.read = read; + this.locus = locus; + } + + public GenomeLoc getLocus() { + return locus; + } + + public int compareTo(SampleReadLocus that) { + int comp = this.sample.compareTo(that.sample); + if (comp != 0) + return comp; + + comp = this.read.compareTo(that.read); + if (comp != 0) + return comp; 
+ + return this.locus.compareTo(that.locus); + } + + public String toString() { + return "Sample " + sample + ", read " + read + ", locus " + locus; + } +} + +class MultipleBaseCounts { + private Map<Integer, Integer> baseCounts; + private GenomeLoc phasingLocus; + + public MultipleBaseCounts(GenomeLoc phasingLoc) { + this.baseCounts = new HashMap<Integer, Integer>(); + this.phasingLocus = phasingLoc; + } + + public boolean samePhasingLocAs(GenomeLoc loc) { + return phasingLocus.equals(loc); + } + + public void incrementBaseCount(byte base) { + int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); + Integer cnt = baseCounts.get(baseIndex); + if (cnt == null) + cnt = 0; + + baseCounts.put(baseIndex, cnt + 1); + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + + sb.append("Base counts"); + for (Map.Entry<Integer, Integer> baseCountEntry : baseCounts.entrySet()) { + byte base = BaseUtils.baseIndexToSimpleBase(baseCountEntry.getKey()); + int cnt = baseCountEntry.getValue(); + sb.append("\t" + (char) base + ": " + cnt); + } + + return sb.toString(); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java diff --git
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/OverhangFixingManager.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/OverhangFixingManager.java new file mode 100644 index 000000000..581a9e426 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/OverhangFixingManager.java @@ -0,0 +1,380 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.rnaseq; + +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMRecordCoordinateComparator; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.*; + +/** + * The class manages reads and splices and tries to apply overhang clipping when appropriate. + * Important note: although for efficiency the manager does try to send reads to the underlying writer in coordinate + * sorted order, it does NOT guarantee that it will do so in every case! So unless there's a good reason not to, + * methods that instantiate this manager should pass in a writer that does not assume the reads are pre-sorted. + */ +public class OverhangFixingManager { + + protected static final Logger logger = Logger.getLogger(OverhangFixingManager.class); + private static final boolean DEBUG = false; + + // how many reads should we store in memory before flushing the queue? + private final int MAX_RECORDS_IN_MEMORY; + + // how many mismatches do we tolerate in the overhangs? + private final int MAX_MISMATCHES_IN_OVERHANG; + + // how many bases do we tolerate in the overhang before deciding not to clip? + private final int MAX_BASES_IN_OVERHANG; + + // should we not bother fixing overhangs? 
+ private final boolean doNotFixOverhangs; + + // where we ultimately write out our records + private final SAMFileWriter writer; + + // fasta reference reader to check overhanging edges in the exome reference sequence + private final CachingIndexedFastaSequenceFile referenceReader; + + // the genome loc parser + private final GenomeLocParser genomeLocParser; + + // the read cache + private final static int initialCapacity = 5000; + private PriorityQueue waitingReads = new PriorityQueue<>(initialCapacity, new SplitReadComparator()); + + // the set of current splices to use + private final Set splices = new TreeSet<>(new SpliceComparator()); + + protected static final int MAX_SPLICES_TO_KEEP = 1000; + + + /** + * + * @param writer actual writer + * @param genomeLocParser the GenomeLocParser object + * @param referenceReader the reference reader + * @param maxRecordsInMemory max records to keep in memory + * @param maxMismatchesInOverhangs max number of mismatches permitted in the overhangs before requiring clipping + * @param maxBasesInOverhangs max number of bases permitted in the overhangs before deciding not to clip + * @param doNotFixOverhangs if true, don't clip overhangs at all + */ + public OverhangFixingManager(final SAMFileWriter writer, + final GenomeLocParser genomeLocParser, + final CachingIndexedFastaSequenceFile referenceReader, + final int maxRecordsInMemory, + final int maxMismatchesInOverhangs, + final int maxBasesInOverhangs, + final boolean doNotFixOverhangs) { + this.writer = writer; + this.genomeLocParser = genomeLocParser; + this.referenceReader = referenceReader; + this.MAX_RECORDS_IN_MEMORY = maxRecordsInMemory; + this.MAX_MISMATCHES_IN_OVERHANG = maxMismatchesInOverhangs; + this.MAX_BASES_IN_OVERHANG = maxBasesInOverhangs; + this.doNotFixOverhangs = doNotFixOverhangs; + } + + public final int getNReadsInQueue() { return waitingReads.size(); } + + /** + * For testing purposes only + * + * @return the list of reads currently in the queue + */ 
+ protected List getReadsInQueueForTesting() { + return new ArrayList<>(waitingReads); + } + + /** + * For testing purposes only + * + * @return the list of splices currently in the queue + */ + protected List getSplicesForTesting() { + return new ArrayList<>(splices); + } + + /** + * Add a new observed split to the list to use + * + * @param contig the contig + * @param start the start of the split, inclusive + * @param end the end of the split, inclusive + */ + public void addSplicePosition(final String contig, final int start, final int end) { + if ( doNotFixOverhangs ) + return; + + // is this a new splice? if not, we are done + final Splice splice = new Splice(contig, start, end); + if ( splices.contains(splice) ) + return; + + // initialize it with the reference context + // we don't want to do this until we know for sure that it's a new splice position + splice.initialize(referenceReader); + + // clear the set of old split positions seen if we hit a new contig + final boolean sameContig = splices.isEmpty() || splices.iterator().next().loc.getContig().equals(contig); + if ( !sameContig ) + splices.clear(); + + // run this position against the existing reads + for ( final SplitRead read : waitingReads ) + fixSplit(read, splice); + + splices.add(splice); + + if ( splices.size() > MAX_SPLICES_TO_KEEP ) + cleanSplices(); + } + + /** + * Add a read to the manager + * + * @param read the read to add + */ + public void addRead(final GATKSAMRecord read) { + if ( read == null ) throw new IllegalArgumentException("read added to manager is null, which is not allowed"); + + // if the new read is on a different contig or we have too many reads, then we need to flush the queue and clear the map + final boolean tooManyReads = getNReadsInQueue() >= MAX_RECORDS_IN_MEMORY; + final boolean encounteredNewContig = getNReadsInQueue() > 0 && !waitingReads.peek().read.getReferenceIndex().equals(read.getReferenceIndex()); + + if ( tooManyReads || encounteredNewContig ) { + if ( DEBUG 
) logger.warn("Flushing queue on " + (tooManyReads ? "too many reads" : ("move to new contig: " + read.getReferenceName() + " from " + waitingReads.peek().read.getReferenceName())) + " at " + read.getAlignmentStart()); + + final int targetQueueSize = encounteredNewContig ? 0 : MAX_RECORDS_IN_MEMORY / 2; + + // write the required number of waiting reads to disk + while ( getNReadsInQueue() > targetQueueSize ) + writer.addAlignment(waitingReads.poll().read); + } + + final SplitRead splitRead = new SplitRead(read); + + // fix overhangs, as needed + for ( final Splice splice : splices) + fixSplit(splitRead, splice); + + // add the new read to the queue + waitingReads.add(splitRead); + } + + /** + * Clean up the list of splices + */ + private void cleanSplices() { + final int targetQueueSize = splices.size() / 2; + final Iterator iter = splices.iterator(); + for ( int i = 0; i < targetQueueSize; i++ ) { + iter.next(); + iter.remove(); + } + } + + /** + * Try to fix the given read using the given split + * + * @param read the read to fix + * @param splice the split (bad region to clip out) + */ + private void fixSplit(final SplitRead read, final Splice splice) { + // if the read doesn't even overlap the split position then we can just exit + if ( !splice.loc.overlapsP(read.loc) ) + return; + + if ( isLeftOverhang(read.loc, splice.loc) ) { + final int overhang = splice.loc.getStop() - read.loc.getStart() + 1; + if ( overhangingBasesMismatch(read.read.getReadBases(), 0, splice.reference, splice.reference.length - overhang, overhang) ) { + final GATKSAMRecord clippedRead = ReadClipper.hardClipByReadCoordinates(read.read, 0, overhang - 1); + read.setRead(clippedRead); + } + } + else if ( isRightOverhang(read.loc, splice.loc) ) { + final int overhang = read.loc.getStop() - splice.loc.getStart() + 1; + if ( overhangingBasesMismatch(read.read.getReadBases(), read.read.getReadLength() - overhang, splice.reference, 0, overhang) ) { + final GATKSAMRecord clippedRead = 
ReadClipper.hardClipByReadCoordinates(read.read, read.read.getReadLength() - overhang, read.read.getReadLength() - 1); + read.setRead(clippedRead); + } + } + } + + /** + * Is this a proper overhang on the left side of the read? + * + * @param readLoc the read's loc + * @param spliceLoc the split's loc + * @return true if it's a left side overhang + */ + protected static boolean isLeftOverhang(final GenomeLoc readLoc, final GenomeLoc spliceLoc) { + return readLoc.getStart() <= spliceLoc.getStop() && readLoc.getStart() > spliceLoc.getStart() && readLoc.getStop() > spliceLoc.getStop(); + } + + /** + * Is this a proper overhang on the right side of the read? + * + * @param readLoc the read's loc + * @param spliceLoc the split's loc + * @return true if it's a right side overhang + */ + protected static boolean isRightOverhang(final GenomeLoc readLoc, final GenomeLoc spliceLoc) { + return readLoc.getStop() >= spliceLoc.getStart() && readLoc.getStop() < spliceLoc.getStop() && readLoc.getStart() < spliceLoc.getStart(); + } + + /** + * Are there too many mismatches to the reference among the overhanging bases? 
+ * + * @param read the read bases + * @param readStartIndex where to start on the read + * @param reference the reference bases + * @param referenceStartIndex where to start on the reference + * @param spanToTest how many bases to test + * @return true if too many overhanging bases mismatch, false otherwise + */ + protected boolean overhangingBasesMismatch(final byte[] read, + final int readStartIndex, + final byte[] reference, + final int referenceStartIndex, + final int spanToTest) { + // don't process too small a span, too large a span, or a span that is most of a read + if ( spanToTest < 1 || spanToTest > MAX_BASES_IN_OVERHANG || spanToTest > read.length / 2 ) + return false; + + int numMismatchesSeen = 0; + for ( int i = 0; i < spanToTest; i++ ) { + if ( read[readStartIndex + i] != reference[referenceStartIndex + i] ) { + if ( ++numMismatchesSeen > MAX_MISMATCHES_IN_OVERHANG ) + return true; + } + } + + // we can still mismatch overall if at least half of the bases mismatch + return numMismatchesSeen >= ((spanToTest+1)/2); + } + + /** + * Close out the manager stream by clearing the read cache + */ + public void close() { + // write out all of the remaining reads + while ( ! 
waitingReads.isEmpty() ) + writer.addAlignment(waitingReads.poll().read); + } + + // class to represent the reads with their soft-clip-included GenomeLocs + protected final class SplitRead { + + public GATKSAMRecord read; + public GenomeLoc loc; + + public SplitRead(final GATKSAMRecord read) { + setRead(read); + } + + public void setRead(final GATKSAMRecord read) { + if ( !read.isEmpty() ) { + this.read = read; + loc = genomeLocParser.createGenomeLoc(read.getReferenceName(), read.getSoftStart(), read.getSoftEnd()); + } + } + } + + // class to represent the comparator for the split reads + private final class SplitReadComparator implements Comparator { + + private final SAMRecordCoordinateComparator readComparator; + + public SplitReadComparator() { + readComparator = new SAMRecordCoordinateComparator(); + } + + public int compare(final SplitRead read1, final SplitRead read2) { + return readComparator.compare(read1.read, read2.read); + } + } + + // class to represent the split positions + protected final class Splice { + + public final GenomeLoc loc; + public byte[] reference; + + public Splice(final String contig, final int start, final int end) { + loc = genomeLocParser.createGenomeLoc(contig, start, end); + } + + public void initialize(final CachingIndexedFastaSequenceFile referenceReader) { + reference = referenceReader.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases(); + } + + @Override + public boolean equals(final Object other) { + return other != null && (other instanceof Splice) && this.loc.equals(((Splice)other).loc); + } + + @Override + public int hashCode() { + return loc.hashCode(); + } + } + + // class to represent the comparator for the split reads + private final class SpliceComparator implements Comparator { + + public int compare(final Splice position1, final Splice position2) { + return position1.loc.compareTo(position2.loc); + } + } +} diff --git 
a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java new file mode 100644 index 000000000..6b9fca312 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java @@ -0,0 +1,242 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.rnaseq; + +import net.sf.samtools.*; +import org.broadinstitute.sting.commandline.Advanced; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.sam.CigarUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.io.FileNotFoundException; + +/** + * + * Splits reads that contain Ns in their cigar string (e.g. spanning splicing events). + * + * Identifies all N cigar elements and creates k+1 new reads (where k is the number of N cigar elements). + * The first read includes the bases that are to the left of the first N element, while the part of the read that is to the right of the N + * (including the Ns) is hard clipped and so on for the rest of the new reads. 
+ * + * + * User: ami + * Date: 11/14/13 + * Time: 11:52 AM + */ + +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class SplitNCigarReads extends ReadWalker { + + // The name that will go in the @PG tag + public static final String PROGRAM_RECORD_NAME = "GATK SplitNCigarReads"; + + + @Output(doc="Write output to this BAM filename instead of STDOUT") + protected StingSAMFileWriter writer; + + /** + * For expert users only! To minimize memory consumption you can lower this number, but then the tool may skip + * overhang fixing in regions with too much coverage. Just make sure to give Java enough memory! 4Gb should be + * enough with the default value. + */ + @Advanced + @Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the BAM writer", required=false) + protected int MAX_RECORDS_IN_MEMORY = 150000; + + /** + * If there are more than this many mismatches within the overhang regions, the whole overhang will get hard-clipped out. + * It is still possible in some cases that the overhang could get clipped if the number of mismatches do not exceed this + * value, e.g. if most of the overhang mismatches. 
+ */ + @Advanced + @Argument(fullName="maxMismatchesInOverhang", shortName="maxMismatches", doc="max number of mismatches allowed in the overhang", required=false) + protected int MAX_MISMATCHES_IN_OVERHANG = 1; + + /** + * If there are more than this many bases in the overhang, we won't try to hard-clip them out + */ + @Advanced + @Argument(fullName="maxBasesInOverhang", shortName="maxOverhang", doc="max number of bases allowed in the overhang", required=false) + protected int MAX_BASES_TO_CLIP = 40; + + @Argument(fullName="doNotFixOverhangs", shortName="doNotFixOverhangs", doc="do not have the walker hard-clip overhanging sections of the reads", required=false) + protected boolean doNotFixOverhangs = false; + + @Hidden + @Argument(fullName = "no_pg_tag", shortName = "npt", doc = "Necessary for integration tests", required = false) + protected boolean NO_PG_TAG = false; + + /** + * This stores all of the already-split reads and manages any processing (e.g. clipping overhangs) that happens to them. + * It will emit reads to the underlying writer as needed so we don't need to worry about any of that in this class. 
+ */ + protected OverhangFixingManager overhangManager; + + + @Override + public void initialize() { + final GenomeAnalysisEngine toolkit = getToolkit(); + + if ( !NO_PG_TAG ) { + // we don't want to assume that reads will be written in order by the manager because in deep, deep pileups it won't work + Utils.setupWriter(writer, toolkit, toolkit.getSAMFileHeader(), false, this, PROGRAM_RECORD_NAME); + } + + try { + final CachingIndexedFastaSequenceFile referenceReader = new CachingIndexedFastaSequenceFile(toolkit.getArguments().referenceFile); + overhangManager = new OverhangFixingManager(writer, toolkit.getGenomeLocParser(), referenceReader, MAX_RECORDS_IN_MEMORY, MAX_MISMATCHES_IN_OVERHANG, MAX_BASES_TO_CLIP, doNotFixOverhangs); + } + catch (FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(toolkit.getArguments().referenceFile, ex); + } + } + + @Override + public GATKSAMRecord map(final ReferenceContext ref, final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker) { + return read; + } + + @Override + public OverhangFixingManager reduceInit() { + return overhangManager; + } + + @Override + public OverhangFixingManager reduce(final GATKSAMRecord read, final OverhangFixingManager manager) { + splitNCigarRead(read, manager); + return manager; + } + + @Override + public void onTraversalDone(final OverhangFixingManager manager) { + manager.close(); + } + + /** + * Goes through the cigar string of the read and create new reads for each consecutive non-N elements (while hard clipping the rest of the read). 
+ * For example: for a read with cigar '1H2M2D1M2N1M2I1N1M2S' 3 new reads will be created with cigar strings: 1H2M2D1M, 1M2I and 1M2S + * + * @param read the read to split + * @param manager the output manager + */ + public static void splitNCigarRead(final GATKSAMRecord read, final OverhangFixingManager manager) { + final int numCigarElements = read.getCigar().numCigarElements(); + + int firstCigarIndex = 0; + for ( int i = 0; i < numCigarElements; i++ ) { + final CigarElement cigarElement = read.getCigar().getCigarElement(i); + if (cigarElement.getOperator() == CigarOperator.N) { + manager.addRead(splitReadBasedOnCigar(read, firstCigarIndex, i, manager)); + firstCigarIndex = i+1; + } + } + + // if there are no N's in the read + if (firstCigarIndex == 0) { + manager.addRead(read); + } + //add the last section of the read: from the last N to the the end of the read + // (it will be done for all the usual cigar string that does not end with N) + else if (firstCigarIndex < numCigarElements) { + manager.addRead(splitReadBasedOnCigar(read, firstCigarIndex, numCigarElements, null)); + } + } + + /** + * Pull out an individual split position for a read + * + * @param read the read being split + * @param cigarStartIndex the index of the first cigar element to keep + * @param cigarEndIndex the index of the last cigar element to keep + * @param forSplitPositions the manager for keeping track of split positions; can be null + * @return a non-null read representing the section of the original read being split out + */ + private static GATKSAMRecord splitReadBasedOnCigar(final GATKSAMRecord read, final int cigarStartIndex, final int cigarEndIndex, final OverhangFixingManager forSplitPositions) { + int cigarFirstIndex = cigarStartIndex; + int cigarSecondIndex = cigarEndIndex; + + //in case a section of the read ends or starts with D (for example the first section in 1M1D1N1M is 1M1D), we should trim this cigar element + // it can be 'if', but it was kept as 'while' to make sure 
the code can work with Cigar strings that were not "cleaned" + while(read.getCigar().getCigarElement(cigarFirstIndex).getOperator().equals(CigarOperator.D)) + cigarFirstIndex++; + while(read.getCigar().getCigarElement(cigarSecondIndex-1).getOperator().equals(CigarOperator.D)) + cigarSecondIndex--; + if(cigarFirstIndex > cigarSecondIndex) + throw new UserException.BadInput("Cannot split this read (might be an empty section between Ns, for example 1N1D1N): "+read.getCigarString()); + + // we keep only the section of the read that is aligned to the reference between startRefIndex and stopRefIndex (inclusive). + // the other sections of the read are clipped: + final int startRefIndex = read.getOriginalAlignmentStart() + CigarUtils.countRefBasesBasedOnCigar(read,0,cigarFirstIndex); //goes through the prefix of the cigar (up to cigarStartIndex) and move the reference index. + final int stopRefIndex = startRefIndex + CigarUtils.countRefBasesBasedOnCigar(read,cigarFirstIndex,cigarSecondIndex)-1; //goes through a consecutive non-N section of the cigar (up to cigarEndIndex) and move the reference index. + + if ( forSplitPositions != null ) { + final String contig = read.getReferenceName(); + final int splitStart = startRefIndex + CigarUtils.countRefBasesBasedOnCigar(read,cigarFirstIndex,cigarEndIndex); //we use cigarEndIndex instead of cigarSecondIndex so we won't take into account the D's at the end. 
+ final int splitEnd = splitStart + read.getCigar().getCigarElement(cigarEndIndex).getLength() - 1; + forSplitPositions.addSplicePosition(contig, splitStart, splitEnd); + } + + return ReadClipper.hardClipToRegionIncludingClippedBases(read, startRefIndex, stopRefIndex); + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java new file mode 100644 index 000000000..7054d78cd --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java @@ -0,0 +1,395 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.simulatereads; + +import cern.jet.random.Poisson; +import cern.jet.random.engine.MersenneTwister; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMProgramRecord; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.variant.vcf.*; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.text.TextFormattingUtils; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; + +import java.util.*; + +/** + * Generates simulated reads for variants + * + *

Given a set of variants, this tool will generate simulated reads that support the input variants.

+ * + *

Caveats

+ *

For practical reasons, only bi-allelic variants that are not too close to the ends of contigs (< 1/2 read length) are supported; all others will simply be ignored.

+ * + *

Input

+ *

A VCF file containing variants.

+ * + *

Output

+ *

A BAM file containing simulated sequence reads that support the input variants, with the requested error rate and coverage depth.

+ * + *

Example

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -T SimulateReadsForVariants \
+ *   -R reference.fasta \
+ *   -V input_variants.vcf \
+ *   -o simulated_reads.bam \
+ *   --readDepth 50 \
+ *   --errorRate 25
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class}, gotoDev = HelpConstants.EB) + +@Reference(window=@Window(start=-200,stop=200)) +public class SimulateReadsForVariants extends RodWalker { + + @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + /** + * The simulated reads will be written to a BAM file. + */ + @Output(doc="Reads corresponding to variants", required=true) + protected StingSAMFileWriter readWriter; + /** + * Use this argument to set the desired target read depth. See the readSamplingMode argument for options that + * determine whether coverage distribution will be exactly this value or an approximation. + */ + @Argument(fullName="readDepth", shortName="DP", doc="Read depth to generate", required=false, minValue = 0, minRecommendedValue = 1, maxRecommendedValue = 1000, maxValue = Integer.MAX_VALUE) + public int readDepth = 20; + /** + * Errors will be generated at this rate in the simulated reads. Base qualities are therefore also assigned this value. + */ + @Argument(fullName="errorRate", shortName="ER", doc="Base error rate (Phred-scaled)", required=false, minValue = 0, maxValue = Integer.MAX_VALUE) + public int phredErrorRate = 20; + /** + * All simulated reads will be exactly this length. + */ + @Argument(fullName="readLength", shortName="RL", doc="Read lengths (bp)", required=false, minValue = 1, maxValue = Integer.MAX_VALUE) + public int readLength = 101; + /** + * The corresponding platform identifier will be specified in the simulated read group PL tag. This setting does not + * affect the properties of the simulated reads. 
+ */ + @Advanced + @Argument(fullName="rgPlatform", shortName="RGPL", doc="Sequencing platform", required=false) + public NGSPlatform rgPlatform = NGSPlatform.ILLUMINA; + /** + * This determines how read sampling is achieved, and affects the coverage distribution of simulated reads. + * CONSTANT sampling will produce uniform depth at all positions, while POISSON sampling will produce a + * distribution of coverages around the requested value. + */ + @Advanced + @Argument(fullName="readSamplingMode", shortName="RSM", doc="Sampling mode", required=false) + public ReadSamplingMode samplingMode = ReadSamplingMode.CONSTANT; + public enum ReadSamplingMode { CONSTANT, POISSON }; + + @Hidden + @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="Discard program tags, for integration tests", required=false) + public boolean NO_PG_TAG = false; + + @Hidden + @Argument(fullName="verbose", shortName="verbose", doc="Verbose", required=false) + public boolean verbose = false; + + public static final String PROGRAM_RECORD_NAME = "GATK SimulateReadsForVariants"; + + // variables used to store state + private long readNameCounter = 1; + private int halfReadLength; + private double errorRate; + private byte[] readQuals; + private SAMFileHeader header = null; + + // randomness related variables + private static final long RANDOM_SEED = 1252863495; + private static final Random ran = GenomeAnalysisEngine.getRandomGenerator(); + private Poisson poissonRandom = null; + + // samples and read groups + private final Map sample2RG = new HashMap(); + + private SAMReadGroupRecord sampleRG(String name) { return sample2RG.get(name); } + + private SAMReadGroupRecord createRG(String name) { + SAMReadGroupRecord rg = new SAMReadGroupRecord(name); + rg.setPlatform(rgPlatform.getDefaultPlatform()); + rg.setSample(name); + return rg; + } + + // class to store the bases, offset, and representative CIGAR of a haplotype + private static class ArtificialHaplotype { + public final byte[] bases; + 
public final int offset; + public final String cigar; + + public ArtificialHaplotype(final byte[] bases, final int offset, final String cigar) { + this.bases = bases; + this.offset = offset; + this.cigar = cigar; + } + } + + @Override + public void initialize() { + + // initialize sample -> read group map + final List sampleRGs = new ArrayList(); + for ( final String sample : SampleUtils.getUniqueSamplesFromRods(getToolkit(), Arrays.asList(variantCollection.variants.getName())) ) { + final SAMReadGroupRecord rg = createRG(sample); + sampleRGs.add(rg); + sample2RG.put(sample, rg); + } + + // initialize BAM headers + header = new SAMFileHeader(); + header.setSequenceDictionary(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary()); + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); + header.setReadGroups(sampleRGs); + + final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); + if ( !NO_PG_TAG ) { + final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); + programRecord.setProgramVersion(headerInfo.getString("org.broadinstitute.sting.gatk.version")); + programRecord.setCommandLine(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this)); + } + header.setProgramRecords(Arrays.asList(programRecord)); + + readWriter.setPresorted(false); + readWriter.writeHeader(header); + + halfReadLength = readLength / 2; + errorRate = QualityUtils.qualToErrorProb((byte)phredErrorRate); + readQuals = new byte[readLength]; + Arrays.fill(readQuals, (byte)phredErrorRate); + if ( samplingMode == ReadSamplingMode.POISSON ) + poissonRandom = new Poisson(readDepth, new MersenneTwister((int)RANDOM_SEED)); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) // RodWalkers can make funky map calls + return 0; + + if ( ref.getLocus().getStart() < readLength || ! 
BaseUtils.isRegularBase(ref.getBase()) ) + return 0; + + final VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); + if ( vc == null || !vc.isBiallelic() ) + return 0; + + if ( verbose ) logger.info(String.format("Generating reads for %s", vc)); + + generateReadsForVariant(vc, ref); + + return 1; + } + + /** + * Constructs an artificial haplotype given an allele and original reference context + * + * @param allele the allele to model (can be reference) + * @param refLength the length of the reference allele + * @param ref the original reference context + * @return a non-null ArtificialHaplotype + */ + private ArtificialHaplotype constructHaplotype(final Allele allele, final int refLength, final ReferenceContext ref) { + + final byte[] haplotype = new byte[readLength]; + + final int alleleLength = allele.getBases().length; + final int halfAlleleLength = (alleleLength + 1) / 2; + + // this is how far back to move from the event to start copying bases + final int offset = halfReadLength - halfAlleleLength; + + // copy bases before the event + final int locusPosOnRefContext = (int)(ref.getLocus().getStart() - ref.getWindow().getStart()); + int posOnRefContext = locusPosOnRefContext - offset; + System.arraycopy(ref.getBases(), posOnRefContext, haplotype, 0, offset); + int copiedCount = offset; + + // copy the event bases + System.arraycopy(allele.getBases(), 0, haplotype, copiedCount, alleleLength); + copiedCount += alleleLength; + + // copy bases after the event + posOnRefContext = locusPosOnRefContext + refLength; + final int remainder = readLength - copiedCount; + System.arraycopy(ref.getBases(), posOnRefContext, haplotype, copiedCount, remainder); + + final String cigar; + if ( refLength == alleleLength ) + cigar = readLength + "M"; + else + cigar = (offset + 1) + "M" + Math.abs(refLength - alleleLength) + (refLength > alleleLength ? 
"D" : "I") + remainder + "M"; + + return new ArtificialHaplotype(haplotype, offset, cigar); + } + + /** + * Generates the artificial reads for a given variant + * + * @param vc the (bi-allelic) variant context for which to generate artificial reads + * @param ref the original reference context + */ + private void generateReadsForVariant(final VariantContext vc, final ReferenceContext ref) { + + final int refLength = vc.getReference().getBases().length; + final ArtificialHaplotype refHap = constructHaplotype(vc.getReference(), refLength, ref); + final ArtificialHaplotype altHap = constructHaplotype(vc.getAlternateAllele(0), refLength, ref); + + int gi = 0; + for ( final Genotype g : vc.getGenotypes() ) { + final int myDepth = sampleDepth(); + for ( int d = 0; d < myDepth; d++ ) { + + final ArtificialHaplotype haplotype = chooseRefHaplotype(g) ? refHap : altHap; + final byte[] readBases = Arrays.copyOf(haplotype.bases, readLength); + + addMachineErrors(readBases, errorRate); + writeRead(readBases, vc.getChr(), vc.getStart() - haplotype.offset, haplotype.cigar, g.getSampleName(), gi++ % 2 == 0); + } + } + } + + /** + * Decides whether or not to choose the reference haplotype, depending on the given genotype + * + * @param g the genotype of the given sample + * @return true if one should use the reference haplotype, false otherwise + */ + private boolean chooseRefHaplotype(final Genotype g) { + final double refP; + if ( g.isHomRef() ) refP = 1; + else if ( g.isHet() ) refP = 0.5; + else refP = 0.0; + + return ran.nextDouble() < refP; + } + + /** + * Generates the artificial read depth + * + * @return a non-negative int + */ + private int sampleDepth() { + switch ( samplingMode ) { + case CONSTANT: return readDepth; + case POISSON: return poissonRandom.nextInt(); + default: + throw new IllegalStateException("Unexpected DepthSamplingType " + samplingMode); + } + } + + /** + * Creates and writes an artificial read given the appropriate data + * + * @param readBases the 
bases + * @param contig the contig + * @param start the read start + * @param cigar the cigar string + * @param sample the sample name (used to get the right read group) + * @param isNegStrand should this read be on the negative strand? + */ + private void writeRead(final byte[] readBases, final String contig, final int start, + final String cigar, final String sample, final boolean isNegStrand) { + final GATKSAMRecord read = new GATKSAMRecord(header); + read.setBaseQualities(readQuals); + read.setReadBases(readBases); + read.setReadName("" + readNameCounter++); + read.setCigarString(cigar); + read.setReadPairedFlag(false); + read.setAlignmentStart(start); + read.setMappingQuality(60); + read.setReferenceName(contig); + read.setReadNegativeStrandFlag(isNegStrand); + read.setAttribute("RG", sampleRG(sample).getReadGroupId()); + + readWriter.addAlignment(read); + } + + /** + * Adds machine errors at the appropriate rate to the provided read bases + * + * @param readBases the read bases + * @param errorRate the rate at which to produce errors + */ + private void addMachineErrors(final byte[] readBases, final double errorRate) { + for ( int i = 0; i < readBases.length; i++ ) { + final double r = ran.nextDouble(); + if ( r < errorRate ) { + byte errorBase = BaseUtils.baseIndexToSimpleBase(BaseUtils.getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(readBases[i]))); + if ( errorBase == readBases[i] ) throw new IllegalStateException("Read and error bases are the same"); + readBases[i] = errorBase; + } + } + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer counter, Integer sum) { + return counter + sum; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java similarity index 100% rename from 
protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GenomeEvent.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GenomeEvent.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GenomeEvent.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GenomeEvent.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java similarity index 100% rename from 
protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java new file mode 100644 index 000000000..3741ce12d --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java @@ -0,0 +1,215 @@ +/* +* By downloading the PROGRAM you agree to the following terms 
of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; + +import java.io.*; +import java.util.*; + +/** + * Created by IntelliJ IDEA. 
+ * User: rpoplin + * Date: Mar 10, 2011 + */ + +public class Tranche { + private static final int CURRENT_VERSION = 5; + + public double ts, minVQSLod, knownTiTv, novelTiTv; + public int numKnown,numNovel; + public String name; + public VariantRecalibratorArgumentCollection.Mode model; + + int accessibleTruthSites = 0; + int callsAtTruthSites = 0; + + public Tranche(double ts, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites, VariantRecalibratorArgumentCollection.Mode model) { + this(ts, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, model, "anonymous"); + } + + public Tranche(double ts, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites, VariantRecalibratorArgumentCollection.Mode model, String name ) { + this.ts = ts; + this.minVQSLod = minVQSLod; + this.novelTiTv = novelTiTv; + this.numNovel = numNovel; + this.knownTiTv = knownTiTv; + this.numKnown = numKnown; + this.model = model; + this.name = name; + + this.accessibleTruthSites = accessibleTruthSites; + this.callsAtTruthSites = callsAtTruthSites; + + if ( ts < 0.0 || ts > 100.0) + throw new UserException("Target FDR is unreasonable " + ts); + + if ( numKnown < 0 || numNovel < 0) + throw new ReviewedStingException("Invalid tranche - no. variants is < 0 : known " + numKnown + " novel " + numNovel); + + if ( name == null ) + throw new ReviewedStingException("BUG -- name cannot be null"); + } + + private double getTruthSensitivity() { + return accessibleTruthSites > 0 ? 
callsAtTruthSites / (1.0*accessibleTruthSites) : 0.0; + } + + public static class TrancheTruthSensitivityComparator implements Comparator, Serializable { + @Override + public int compare(final Tranche tranche1, final Tranche tranche2) { + return Double.compare(tranche1.ts, tranche2.ts); + } + } + + @Override + public String toString() { + return String.format("Tranche ts=%.2f minVQSLod=%.4f known=(%d @ %.4f) novel=(%d @ %.4f) truthSites(%d accessible, %d called), name=%s]", + ts, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, name); + } + + /** + * Returns an appropriately formatted string representing the raw tranches file on disk. + * + * @param tranches + * @return + */ + public static String tranchesString( final List tranches ) { + final ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + final PrintStream stream = new PrintStream(bytes); + + if( tranches.size() > 1 ) + Collections.sort( tranches, new TrancheTruthSensitivityComparator() ); + + stream.println("# Variant quality score tranches file"); + stream.println("# Version number " + CURRENT_VERSION); + stream.println("targetTruthSensitivity,numKnown,numNovel,knownTiTv,novelTiTv,minVQSLod,filterName,model,accessibleTruthSites,callsAtTruthSites,truthSensitivity"); + + Tranche prev = null; + for ( Tranche t : tranches ) { + stream.printf("%.2f,%d,%d,%.4f,%.4f,%.4f,VQSRTranche%s%.2fto%.2f,%s,%d,%d,%.4f%n", + t.ts, t.numKnown, t.numNovel, t.knownTiTv, t.novelTiTv, t.minVQSLod, t.model.toString(), + (prev == null ? 0.0 : prev.ts), t.ts, t.model.toString(), t.accessibleTruthSites, t.callsAtTruthSites, t.getTruthSensitivity()); + prev = t; + } + + return bytes.toString(); + } + + private static double getDouble(Map bindings, String key, boolean required) { + if ( bindings.containsKey(key) ) { + String val = bindings.get(key); + return Double.valueOf(val); + } + else if ( required ) { + throw new UserException.MalformedFile("Malformed tranches file. 
Missing required key " + key); + } + else + return -1; + } + + private static int getInteger(Map bindings, String key, boolean required) { + if ( bindings.containsKey(key) ) + return Integer.valueOf(bindings.get(key)); + else if ( required ) { + throw new UserException.MalformedFile("Malformed tranches file. Missing required key " + key); + } + else + return -1; + } + + /** + * Returns a list of tranches, sorted from most to least specific, read in from file f + * + * @param f + * @return + */ + public static List readTranches(File f) { + String[] header = null; + List tranches = new ArrayList(); + + try { + for( final String line : new XReadLines(f) ) { + if ( line.startsWith("#") ) + continue; + + final String[] vals = line.split(","); + if( header == null ) { + header = vals; + if ( header.length == 5 || header.length == 8 || header.length == 10 ) + // old style tranches file, throw an error + throw new UserException.MalformedFile(f, "Unfortunately your tranches file is from a previous version of this tool and cannot be used with the latest code. Please rerun VariantRecalibrator"); + if ( header.length != 11 ) + throw new UserException.MalformedFile(f, "Expected 11 elements in header line " + line); + } else { + if ( header.length != vals.length ) + throw new UserException.MalformedFile(f, "Line had too few/many fields. Header = " + header.length + " vals " + vals.length + ". 
The line was: " + line); + + Map bindings = new HashMap(); + for ( int i = 0; i < vals.length; i++ ) bindings.put(header[i], vals[i]); + tranches.add(new Tranche(getDouble(bindings,"targetTruthSensitivity", true), + getDouble(bindings,"minVQSLod", true), + getInteger(bindings,"numKnown", false), + getDouble(bindings,"knownTiTv", false), + getInteger(bindings,"numNovel", true), + getDouble(bindings,"novelTiTv", true), + getInteger(bindings,"accessibleTruthSites", false), + getInteger(bindings,"callsAtTruthSites", false), + VariantRecalibratorArgumentCollection.parseString(bindings.get("model")), + bindings.get("filterName"))); + } + } + + Collections.sort( tranches, new TrancheTruthSensitivityComparator() ); + return tranches; + } catch( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(f, e); + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java new file mode 100644 index 000000000..1f355359d --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -0,0 +1,433 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement 
is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import it.unimi.dsi.fastutil.booleans.BooleanLists; +import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.collections.ExpandingArrayList; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. 
+ * User: rpoplin + * Date: Mar 4, 2011 + */ + +public class VariantDataManager { + private List data = Collections.emptyList(); + private double[] meanVector; + private double[] varianceVector; // this is really the standard deviation + public List annotationKeys; + private final VariantRecalibratorArgumentCollection VRAC; + protected final static Logger logger = Logger.getLogger(VariantDataManager.class); + protected final List trainingSets; + + public VariantDataManager( final List annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) { + this.data = Collections.emptyList(); + this.annotationKeys = new ArrayList<>( annotationKeys ); + this.VRAC = VRAC; + meanVector = new double[this.annotationKeys.size()]; + varianceVector = new double[this.annotationKeys.size()]; + trainingSets = new ArrayList<>(); + } + + public void setData( final List data ) { + this.data = data; + } + + public List getData() { + return data; + } + + public void normalizeData() { + boolean foundZeroVarianceAnnotation = false; + for( int iii = 0; iii < meanVector.length; iii++ ) { + final double theMean = mean(iii, true); + final double theSTD = standardDeviation(theMean, iii, true); + logger.info( annotationKeys.get(iii) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) ); + if( Double.isNaN(theMean) ) { + throw new UserException.BadInput("Values for " + annotationKeys.get(iii) + " annotation not detected for ANY training variant in the input callset. VariantAnnotator may be used to add these annotations. See " + HelpConstants.forumPost("discussion/49/using-variant-annotator")); + } + + foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-5); + meanVector[iii] = theMean; + varianceVector[iii] = theSTD; + for( final VariantDatum datum : data ) { + // Transform each data point via: (x - mean) / standard deviation + datum.annotations[iii] = ( datum.isNull[iii] ? 
0.1 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD ); + } + } + if( foundZeroVarianceAnnotation ) { + throw new UserException.BadInput( "Found annotations with zero variance. They must be excluded before proceeding." ); + } + + // trim data by standard deviation threshold and mark failing data for exclusion later + for( final VariantDatum datum : data ) { + boolean remove = false; + for( final double val : datum.annotations ) { + remove = remove || (Math.abs(val) > VRAC.STD_THRESHOLD); + } + datum.failingSTDThreshold = remove; + } + + // re-order the data by increasing standard deviation so that the results don't depend on the order things were specified on the command line + // standard deviation over the training points is used as a simple proxy for information content, perhaps there is a better thing to use here + final List theOrder = calculateSortOrder(meanVector); + annotationKeys = reorderList(annotationKeys, theOrder); + varianceVector = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(varianceVector), theOrder)); + meanVector = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(meanVector), theOrder)); + for( final VariantDatum datum : data ) { + datum.annotations = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(datum.annotations), theOrder)); + datum.isNull = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(datum.isNull), theOrder)); + } + logger.info("Annotations are now ordered by their information content: " + annotationKeys.toString()); + } + + /** + * Get a list of indices which give the ascending sort order of the data array + * @param inputVector the data to consider + * @return a non-null list of integers with length matching the length of the input array + */ + protected List calculateSortOrder(final double[] inputVector) { + final List theOrder = new ArrayList<>(inputVector.length); + final List toBeSorted = new ArrayList<>(inputVector.length); + int count = 
0; + for( int iii = 0; iii < inputVector.length; iii++ ) { + toBeSorted.add(new MyDoubleForSorting(-1.0 * Math.abs(inputVector[iii] - mean(iii, false)), count++)); + } + Collections.sort(toBeSorted); + for( final MyDoubleForSorting d : toBeSorted ) { + theOrder.add(d.originalIndex); // read off the sort order by looking at the index field + } + return theOrder; + } + + // small private class to assist in reading off the new ordering of the annotation array + private class MyDoubleForSorting implements Comparable { + final Double myData; + final int originalIndex; + + public MyDoubleForSorting(final double myData, final int originalIndex) { + this.myData = myData; + this.originalIndex = originalIndex; + } + + @Override + public int compareTo(final MyDoubleForSorting other) { + return myData.compareTo(other.myData); + } + } + + /** + * Convenience connector method to work with arrays instead of lists. See ##reorderList## + */ + private T[] reorderArray(final T[] data, final List order) { + return reorderList(Arrays.asList(data), order).toArray(data); + } + + /** + * Reorder the given data list to be in the specified order + * @param data the data to reorder + * @param order the new order to use + * @return a reordered list of data + */ + private List reorderList(final List data, final List order) { + final List returnList = new ArrayList<>(data.size()); + for( final int index : order ) { + returnList.add( data.get(index) ); + } + return returnList; + } + + /** + * Convert a normalized point to it's original annotation value + * + * norm = (orig - mu) / sigma + * orig = norm * sigma + mu + * + * @param normalizedValue the normalized value of the ith annotation + * @param annI the index of the annotation value + * @return the denormalized value for the annotation + */ + public double denormalizeDatum(final double normalizedValue, final int annI) { + final double mu = meanVector[annI]; + final double sigma = varianceVector[annI]; + return normalizedValue * sigma + mu; + 
} + + public void addTrainingSet( final TrainingSet trainingSet ) { + trainingSets.add( trainingSet ); + } + + public List getAnnotationKeys() { + return annotationKeys; + } + + public boolean checkHasTrainingSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isTraining ) { return true; } + } + return false; + } + + public boolean checkHasTruthSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isTruth ) { return true; } + } + return false; + } + + public List getTrainingData() { + final List trainingData = new ExpandingArrayList<>(); + for( final VariantDatum datum : data ) { + if( datum.atTrainingSite && !datum.failingSTDThreshold ) { + trainingData.add( datum ); + } + } + logger.info( "Training with " + trainingData.size() + " variants after standard deviation thresholding." ); + if( trainingData.size() < VRAC.MIN_NUM_BAD_VARIANTS ) { + logger.warn( "WARNING: Training with very few variant sites! Please check the model reporting PDF to ensure the quality of the model is reliable." ); + } else if( trainingData.size() > VRAC.MAX_NUM_TRAINING_DATA ) { + logger.warn( "WARNING: Very large training set detected. Downsampling to " + VRAC.MAX_NUM_TRAINING_DATA + " training variants." ); + Collections.shuffle(trainingData); + return trainingData.subList(0, VRAC.MAX_NUM_TRAINING_DATA); + } + return trainingData; + } + + public List selectWorstVariants() { + final List trainingData = new ExpandingArrayList<>(); + + for( final VariantDatum datum : data ) { + if( datum != null && !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) && datum.lod < VRAC.BAD_LOD_CUTOFF ) { + datum.atAntiTrainingSite = true; + trainingData.add( datum ); + } + } + + logger.info( "Training with worst " + trainingData.size() + " scoring variants --> variants with LOD <= " + String.format("%.4f", VRAC.BAD_LOD_CUTOFF) + "." 
); + + return trainingData; + } + + public List getEvaluationData() { + final List evaluationData = new ExpandingArrayList<>(); + + for( final VariantDatum datum : data ) { + if( datum != null && !datum.failingSTDThreshold && !datum.atTrainingSite && !datum.atAntiTrainingSite ) { + evaluationData.add( datum ); + } + } + + return evaluationData; + } + + /** + * Remove all VariantDatum's from the data list which are marked as aggregate data + */ + public void dropAggregateData() { + final Iterator iter = data.iterator(); + while (iter.hasNext()) { + final VariantDatum datum = iter.next(); + if( datum.isAggregate ) { + iter.remove(); + } + } + } + + public List getRandomDataForPlotting( final int numToAdd, final List trainingData, final List antiTrainingData, final List evaluationData ) { + final List returnData = new ExpandingArrayList<>(); + Collections.shuffle(trainingData); + Collections.shuffle(antiTrainingData); + Collections.shuffle(evaluationData); + returnData.addAll(trainingData.subList(0, Math.min(numToAdd, trainingData.size()))); + returnData.addAll(antiTrainingData.subList(0, Math.min(numToAdd, antiTrainingData.size()))); + returnData.addAll(evaluationData.subList(0, Math.min(numToAdd, evaluationData.size()))); + Collections.shuffle(returnData); + return returnData; + } + + protected double mean( final int index, final boolean trainingData ) { + double sum = 0.0; + int numNonNull = 0; + for( final VariantDatum datum : data ) { + if( (trainingData == datum.atTrainingSite) && !datum.isNull[index] ) { sum += datum.annotations[index]; numNonNull++; } + } + return sum / ((double) numNonNull); + } + + protected double standardDeviation( final double mean, final int index, final boolean trainingData ) { + double sum = 0.0; + int numNonNull = 0; + for( final VariantDatum datum : data ) { + if( (trainingData == datum.atTrainingSite) && !datum.isNull[index] ) { sum += ((datum.annotations[index] - mean)*(datum.annotations[index] - mean)); numNonNull++; } + } + 
return Math.sqrt( sum / ((double) numNonNull) ); + } + + public void decodeAnnotations( final VariantDatum datum, final VariantContext vc, final boolean jitter ) { + final double[] annotations = new double[annotationKeys.size()]; + final boolean[] isNull = new boolean[annotationKeys.size()]; + int iii = 0; + for( final String key : annotationKeys ) { + isNull[iii] = false; + annotations[iii] = decodeAnnotation( key, vc, jitter ); + if( Double.isNaN(annotations[iii]) ) { isNull[iii] = true; } + iii++; + } + datum.annotations = annotations; + datum.isNull = isNull; + } + + private static double decodeAnnotation( final String annotationKey, final VariantContext vc, final boolean jitter ) { + double value; + + try { + value = vc.getAttributeAsDouble( annotationKey, Double.NaN ); + if( Double.isInfinite(value) ) { value = Double.NaN; } + if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } + if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } + if( jitter && annotationKey.equalsIgnoreCase("InbreedingCoeff") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } + } catch( Exception e ) { + value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model + } + + return value; + } + + public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC ) { + datum.isKnown = false; + datum.atTruthSite = false; + datum.atTrainingSite = false; + datum.atAntiTrainingSite = false; + datum.prior = 2.0; + + for( final TrainingSet trainingSet : 
trainingSets ) { + for( final VariantContext trainVC : tracker.getValues(trainingSet.rodBinding, genomeLoc) ) { + if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { + datum.isKnown = datum.isKnown || trainingSet.isKnown; + datum.atTruthSite = datum.atTruthSite || trainingSet.isTruth; + datum.atTrainingSite = datum.atTrainingSite || trainingSet.isTraining; + datum.prior = Math.max( datum.prior, trainingSet.prior ); + datum.consensusCount += ( trainingSet.isConsensus ? 1 : 0 ); + } + if( trainVC != null ) { + datum.atAntiTrainingSite = datum.atAntiTrainingSite || trainingSet.isAntiTraining; + } + } + } + } + + private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) { + return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && checkVariationClass( evalVC, trainVC ) && + (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples()); + } + + protected static boolean checkVariationClass( final VariantContext evalVC, final VariantContext trainVC ) { + switch( trainVC.getType() ) { + case SNP: + case MNP: + return checkVariationClass( evalVC, VariantRecalibratorArgumentCollection.Mode.SNP ); + case INDEL: + case MIXED: + case SYMBOLIC: + return checkVariationClass( evalVC, VariantRecalibratorArgumentCollection.Mode.INDEL ); + default: + return false; + } + } + + protected static boolean checkVariationClass( final VariantContext evalVC, final VariantRecalibratorArgumentCollection.Mode mode ) { + switch( mode ) { + case SNP: + return evalVC.isSNP() || evalVC.isMNP(); + case INDEL: + return evalVC.isStructuralIndel() || evalVC.isIndel() || evalVC.isMixed() || evalVC.isSymbolic(); + case BOTH: + return true; + default: + throw new ReviewedStingException( "Encountered unknown recal mode: " + mode ); + } + } + + public void writeOutRecalibrationTable( final VariantContextWriter recalWriter ) { + // we need to sort in coordinate order in order to 
produce a valid VCF + Collections.sort( data, new Comparator() { + public int compare(VariantDatum vd1, VariantDatum vd2) { + return vd1.loc.compareTo(vd2.loc); + }} ); + + // create dummy alleles to be used + final List alleles = Arrays.asList(Allele.create("N", true), Allele.create("", false)); + + for( final VariantDatum datum : data ) { + VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles); + builder.attribute(VCFConstants.END_KEY, datum.loc.getStop()); + builder.attribute(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod)); + builder.attribute(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")); + + if ( datum.atTrainingSite ) builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true); + if ( datum.atAntiTrainingSite ) builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true); + + recalWriter.add(builder.make()); + } + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java new file mode 100644 index 000000000..41b27949d --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java @@ -0,0 +1,86 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.io.Serializable; +import java.util.Comparator; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Mar 4, 2011 + */ + +public class VariantDatum { + + public double[] annotations; + public boolean[] isNull; + public boolean isKnown; + public double lod; + public boolean atTruthSite; + public boolean atTrainingSite; + public boolean atAntiTrainingSite; + public boolean isTransition; + public boolean isSNP; + public boolean failingSTDThreshold; + public double originalQual; + public double prior; + public int consensusCount; + public GenomeLoc loc; + public int worstAnnotation; + public MultivariateGaussian assignment; // used in K-means implementation + public boolean isAggregate; // this datum was provided to aid in modeling but isn't part of the input callset + + public static class VariantDatumLODComparator implements Comparator, Serializable { + @Override + public int compare(final VariantDatum datum1, final VariantDatum datum2) { + return Double.compare(datum1.lod, datum2.lod); + } + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java new file mode 100644 index 000000000..c5e2b8183 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -0,0 +1,567 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.PartitionBy; +import org.broadinstitute.sting.gatk.walkers.PartitionType; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.R.RScriptExecutor; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.variant.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.collections.ExpandingArrayList; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.io.Resource; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.*; + +/** + * Create a Gaussian mixture model by looking at the annotations values over a high quality subset of the input call set and then evaluate all input variants. + * + *

+ * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with the ApplyRecalibration walker. + *

+ * + *

+ * The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set. + * You can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call. + * The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship + * between SNP call annotations (QD, MQ, HaplotypeScore, and ReadPosRankSum, for example) and the probability that a SNP is a true genetic + * variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided + * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive + * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the + * probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is + * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. + *

+ * + *

Inputs

+ *

+ * The input raw variants to be recalibrated. + *

+ * Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below. + * + *

Output

+ *

+ * A recalibration table file in VCF format that is used by the ApplyRecalibration walker. + *

+ * A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. + * + *

Example

+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -T VariantRecalibrator \
+ *   -R reference/human_g1k_v37.fasta \
+ *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.subset.b37.vcf \
+ *   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
+ *   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
+ *   -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_135.b37.vcf \
+ *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an InbreedingCoeff \
+ *   -mode SNP \
+ *   -recalFile path/to/output.recal \
+ *   -tranchesFile path/to/output.tranches \
+ *   -rscriptFile path/to/output.plots.R
+ * 
+ * + *

Caveat

+ * + *
    + *
  • The values used in the example above are only meant to show how the command lines are composed. + * They are not meant to be taken as specific recommendations of values to use in your own work, and they may be + * different from the values cited elsewhere in our documentation. For the latest and greatest recommendations on + * how to set parameter values for you own analyses, please read the Best Practices section of the documentation.
  • + * + *
  • In order to create the model reporting plots Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version). + * See http://www.r-project.org for more info on how to download and install R.
  • + *
+ */ + +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) +@PartitionBy(PartitionType.NONE) +public class VariantRecalibrator extends RodWalker, ExpandingArrayList> implements TreeReducible> { + + public static final String VQS_LOD_KEY = "VQSLOD"; // Log odds ratio of being a true variant versus being false under the trained gaussian mixture model + public static final String CULPRIT_KEY = "culprit"; // The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out + public static final String NEGATIVE_LABEL_KEY = "NEGATIVE_TRAIN_SITE"; // this variant was used in the negative training set + public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE"; // this variant was used in the positive training set + private static final String PLOT_TRANCHES_RSCRIPT = "plot_Tranches.R"; + + @ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); + + ///////////////////////////// + // Inputs + ///////////////////////////// + /** + * These calls should be unfiltered and annotated with the error covariates that are intended to be used for modeling. + */ + @Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true) + public List> inputCollections; + final private List> input = new ArrayList<>(); + + /** + * These additional calls should be unfiltered and annotated with the error covariates that are intended to be used for modeling. + */ + @Input(fullName="aggregate", shortName = "aggregate", doc="Additional raw input variants to be used in building the model", required=false) + public List> aggregate; + + /** + * Any set of VCF files to use as lists of training, truth, or known sites. + * Training - The program builds the Gaussian mixture model using input variants that overlap with these training sites. 
+ * Truth - The program uses these truth sites to determine where to set the cutoff in VQSLOD sensitivity. + * Known - The program only uses known sites for reporting purposes (to indicate whether variants are already known or novel). They are not used in any calculations by the algorithm itself. + * Bad - A database of known bad variants can be used to supplement the set of worst ranked variants (compared to the Gaussian mixture model) that the program selects from the data to model "bad" variants. + */ + @Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm (training and truth sets are required to run)", required=true) + public List> resource = Collections.emptyList(); + + ///////////////////////////// + // Outputs + ///////////////////////////// + @Output(fullName="recal_file", shortName="recalFile", doc="The output recal file used by ApplyRecalibration", required=true) + protected VariantContextWriter recalWriter = null; + + @Output(fullName="tranches_file", shortName="tranchesFile", doc="The output tranches file used by ApplyRecalibration", required=true) + protected File TRANCHES_FILE; + + ///////////////////////////// + // Additional Command Line Arguments + ///////////////////////////// + /** + * The expected transition / transversion ratio of true novel variants in your targeted region (whole genome, exome, specific + * genes), which varies greatly by the CpG and GC content of the region. See expected Ti/Tv ratios section of the GATK best + * practices documentation (http://www.broadinstitute.org/gatk/guide/best-practices) for more information. + * Normal values are 2.15 for human whole genome values and 3.2 for human whole exomes. Note + * that this parameter is used for display purposes only and isn't used anywhere in the algorithm! 
+ */ + @Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on the optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false) + protected double TARGET_TITV = 2.15; + + /** + * See the input VCF file's INFO field for a list of all available annotations. + */ + @Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true) + private String[] USE_ANNOTATIONS = null; + + /** + * Add truth sensitivity slices through the call set at the given values. The default values are 100.0, 99.9, 99.0, and 90.0 + * which will result in 4 estimated tranches in the final call set: the full set of calls (100% sensitivity at the accessible + * sites in the truth set), a 99.9% truth sensitivity tranche, along with progressively smaller tranches at 99% and 90%. + */ + @Argument(fullName="TStranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false) + private double[] TS_TRANCHES = new double[] {100.0, 99.9, 99.0, 90.0}; + /** + * For this to work properly, the -ignoreFilter argument should also be applied to the ApplyRecalibration command. 
+ */ + @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified, the variant recalibrator will also use variants marked as filtered by the specified filter name in the input VCF file", required=false) + private String[] IGNORE_INPUT_FILTERS = null; + @Output(fullName="rscript_file", shortName="rscriptFile", doc="The output rscript file generated by the VQSR to aid in visualization of the input data and learned model", required=false, defaultToStdout=false) + private File RSCRIPT_FILE = null; + + @Hidden + @Argument(fullName="replicate", shortName="replicate", doc="Used to debug the random number generation inside the VQSR. Do not use.", required=false) + protected int REPLICATE = 200; + private ArrayList replicate = new ArrayList<>(); + + ///////////////////////////// + // Debug Arguments + ///////////////////////////// + @Advanced + @Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false) + protected Boolean TRUST_ALL_POLYMORPHIC = false; + + ///////////////////////////// + // Private Member Variables + ///////////////////////////// + private VariantDataManager dataManager; + private PrintStream tranchesStream; + private final Set ignoreInputFilterSet = new TreeSet<>(); + private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC ); + + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- + + @Override + public void initialize() { + dataManager = new VariantDataManager( new ArrayList<>(Arrays.asList(USE_ANNOTATIONS)), VRAC ); + + if (RSCRIPT_FILE != null && !RScriptExecutor.RSCRIPT_EXISTS) + Utils.warnUser(logger, String.format( + "Rscript not found in 
environment path. %s will be generated but PDF plots will not.", + RSCRIPT_FILE)); + + if( IGNORE_INPUT_FILTERS != null ) { + ignoreInputFilterSet.addAll( Arrays.asList(IGNORE_INPUT_FILTERS) ); + } + + try { + tranchesStream = new PrintStream(TRANCHES_FILE); + } catch (FileNotFoundException e) { + throw new UserException.CouldNotCreateOutputFile(TRANCHES_FILE, e); + } + + for( RodBinding rod : resource ) { + dataManager.addTrainingSet( new TrainingSet( rod ) ); + } + + if( !dataManager.checkHasTrainingSet() ) { + throw new UserException.CommandLineException( "No training set found! Please provide sets of known polymorphic loci marked with the training=true ROD binding tag. For example, -resource:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" ); + } + if( !dataManager.checkHasTruthSet() ) { + throw new UserException.CommandLineException( "No truth set found! Please provide sets of known polymorphic loci marked with the truth=true ROD binding tag. For example, -resource:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" ); + } + + final Set hInfo = new HashSet<>(); + ApplyRecalibration.addVQSRStandardHeaderLines(hInfo); + recalWriter.writeHeader( new VCFHeader(hInfo) ); + + for( int iii = 0; iii < REPLICATE * 2; iii++ ) { + replicate.add(GenomeAnalysisEngine.getRandomGenerator().nextDouble()); + } + + // collect the actual rod bindings into a list for use later + for ( final RodBindingCollection inputCollection : inputCollections ) + input.addAll(inputCollection.getRodBindings()); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // map + // + //--------------------------------------------------------------------------------------------------------------- + + @Override + public ExpandingArrayList map( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { + final ExpandingArrayList mapList = new 
ExpandingArrayList<>(); + + if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers + return mapList; + } + + mapList.addAll( addOverlappingVariants(input, true, tracker, context) ); + if( aggregate != null ) { + mapList.addAll( addOverlappingVariants(aggregate, false, tracker, context) ); + } + + return mapList; + } + + /** + * Using the RefMetaDataTracker find overlapping variants and pull out the necessary information to create the VariantDatum + * @param rods the rods to search within + * @param isInput is this rod an -input rod? + * @param tracker the RefMetaDataTracker from the RODWalker map call + * @param context the AlignmentContext from the RODWalker map call + * @return a list of VariantDatums, can be empty + */ + private List addOverlappingVariants( final List> rods, final boolean isInput, final RefMetaDataTracker tracker, final AlignmentContext context ) { + if( rods == null ) { throw new IllegalArgumentException("rods cannot be null."); } + if( tracker == null ) { throw new IllegalArgumentException("tracker cannot be null."); } + if( context == null ) { throw new IllegalArgumentException("context cannot be null."); } + + final ExpandingArrayList variants = new ExpandingArrayList<>(); + + for( final VariantContext vc : tracker.getValues(rods, context.getLocation()) ) { + if( vc != null && ( vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters()) ) ) { + if( VariantDataManager.checkVariationClass( vc, VRAC.MODE ) ) { + final VariantDatum datum = new VariantDatum(); + + // Populate the datum with lots of fields from the VariantContext, unfortunately the VC is too big so we just pull in only the things we absolutely need. + dataManager.decodeAnnotations( datum, vc, true ); //BUGBUG: when run with HierarchicalMicroScheduler this is non-deterministic because order of calls depends on load of machine + datum.loc = ( isInput ? 
getToolkit().getGenomeLocParser().createGenomeLoc(vc) : null ); + datum.originalQual = vc.getPhredScaledQual(); + datum.isSNP = vc.isSNP() && vc.isBiallelic(); + datum.isTransition = datum.isSNP && GATKVariantContextUtils.isTransition(vc); + datum.isAggregate = !isInput; + + // Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately + dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC ); + final double priorFactor = QualityUtils.qualToProb( datum.prior ); + datum.prior = Math.log10( priorFactor ) - Math.log10( 1.0 - priorFactor ); + + variants.add( datum ); + } + } + } + + return variants; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // reduce + // + //--------------------------------------------------------------------------------------------------------------- + + @Override + public ExpandingArrayList reduceInit() { + return new ExpandingArrayList<>(); + } + + @Override + public ExpandingArrayList reduce( final ExpandingArrayList mapValue, final ExpandingArrayList reduceSum ) { + reduceSum.addAll( mapValue ); + return reduceSum; + } + + @Override + public ExpandingArrayList treeReduce( final ExpandingArrayList lhs, final ExpandingArrayList rhs ) { + rhs.addAll( lhs ); + return rhs; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // on traversal done + // + //--------------------------------------------------------------------------------------------------------------- + + @Override + public void onTraversalDone( final ExpandingArrayList reduceSum ) { + dataManager.setData( reduceSum ); + dataManager.normalizeData(); // Each data point is now (x - mean) / standard deviation + + // Generate the positive model using the training data and evaluate each variant + final List positiveTrainingData = 
dataManager.getTrainingData(); + final GaussianMixtureModel goodModel = engine.generateModel( positiveTrainingData, VRAC.MAX_GAUSSIANS ); + engine.evaluateData( dataManager.getData(), goodModel, false ); + + // Generate the negative model using the worst performing data and evaluate each variant contrastively + final List negativeTrainingData = dataManager.selectWorstVariants(); + final GaussianMixtureModel badModel = engine.generateModel( negativeTrainingData, Math.min(VRAC.MAX_GAUSSIANS_FOR_NEGATIVE_MODEL, VRAC.MAX_GAUSSIANS)); + dataManager.dropAggregateData(); // Don't need the aggregate data anymore so let's free up the memory + engine.evaluateData( dataManager.getData(), badModel, true ); + + if( badModel.failedToConverge || goodModel.failedToConverge ) { + throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider " + (badModel.failedToConverge ? "raising the number of variants used to train the negative model (via --minNumBadVariants 5000, for example)." : "lowering the maximum number of Gaussians allowed for use in the model (via --maxGaussians 4, for example).") ); + } + + engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel ); + + // Find the VQSLOD cutoff values which correspond to the various tranches of calls requested by the user + final int nCallsAtTruth = TrancheManager.countCallsAtTruth( dataManager.getData(), Double.NEGATIVE_INFINITY ); + final TrancheManager.SelectionMetric metric = new TrancheManager.TruthSensitivityMetric( nCallsAtTruth ); + final List tranches = TrancheManager.findTranches( dataManager.getData(), TS_TRANCHES, metric, VRAC.MODE ); + tranchesStream.print(Tranche.tranchesString( tranches )); + + logger.info( "Writing out recalibration table..." 
); + dataManager.writeOutRecalibrationTable( recalWriter ); + if( RSCRIPT_FILE != null ) { + logger.info( "Writing out visualization Rscript file..."); + createVisualizationScript( dataManager.getRandomDataForPlotting( 1000, positiveTrainingData, negativeTrainingData, dataManager.getEvaluationData() ), goodModel, badModel, 0.0, dataManager.getAnnotationKeys().toArray(new String[USE_ANNOTATIONS.length]) ); + } + + if(VRAC.MODE == VariantRecalibratorArgumentCollection.Mode.INDEL) { + // Print out an info message to make it clear why the tranches plot is not generated + logger.info("Tranches plot will not be generated since we are running in INDEL mode"); + } else { + // Execute the RScript command to plot the table of truth values + RScriptExecutor executor = new RScriptExecutor(); + executor.addScript(new Resource(PLOT_TRANCHES_RSCRIPT, VariantRecalibrator.class)); + executor.addArgs(TRANCHES_FILE.getAbsoluteFile(), TARGET_TITV); + // Print out the command line to make it clear to the user what is being executed and how one might modify it + logger.info("Executing: " + executor.getApproximateCommandLine()); + executor.exec(); + } + } + + private void createVisualizationScript( final List randomData, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel, final double lodCutoff, final String[] annotationKeys ) { + PrintStream stream; + try { + stream = new PrintStream(RSCRIPT_FILE); + } catch( FileNotFoundException e ) { + throw new UserException.CouldNotCreateOutputFile(RSCRIPT_FILE, e); + } + + // We make extensive use of the ggplot2 R library: http://had.co.nz/ggplot2/ + stream.println("library(ggplot2)"); + // For compactPDF in R 2.13+ + stream.println("library(tools)"); + // For graphical functions R 2.14.2+ + stream.println("library(grid)"); + + createArrangeFunction( stream ); + + stream.println("outputPDF <- \"" + RSCRIPT_FILE + ".pdf\""); + stream.println("pdf(outputPDF)"); // Unfortunately this is a huge pdf file, BUGBUG: need to work on 
reducing the file size + + for(int iii = 0; iii < annotationKeys.length; iii++) { + for( int jjj = iii + 1; jjj < annotationKeys.length; jjj++) { + logger.info( "Building " + annotationKeys[iii] + " x " + annotationKeys[jjj] + " plot..."); + + final List fakeData = new ExpandingArrayList<>(); + double minAnn1 = 100.0, maxAnn1 = -100.0, minAnn2 = 100.0, maxAnn2 = -100.0; + for( final VariantDatum datum : randomData ) { + minAnn1 = Math.min(minAnn1, datum.annotations[iii]); + maxAnn1 = Math.max(maxAnn1, datum.annotations[iii]); + minAnn2 = Math.min(minAnn2, datum.annotations[jjj]); + maxAnn2 = Math.max(maxAnn2, datum.annotations[jjj]); + } + // Create a fake set of data which spans the full extent of these two annotation dimensions in order to calculate the model PDF projected to 2D + final double NUM_STEPS = 60.0; + for(double ann1 = minAnn1; ann1 <= maxAnn1; ann1+= (maxAnn1 - minAnn1) / NUM_STEPS) { + for(double ann2 = minAnn2; ann2 <= maxAnn2; ann2+= (maxAnn2 - minAnn2) / NUM_STEPS) { + final VariantDatum datum = new VariantDatum(); + datum.prior = 0.0; + datum.annotations = new double[randomData.get(0).annotations.length]; + datum.isNull = new boolean[randomData.get(0).annotations.length]; + for(int ann=0; ann< datum.annotations.length; ann++) { + datum.annotations[ann] = 0.0; + datum.isNull[ann] = true; + } + datum.annotations[iii] = ann1; + datum.annotations[jjj] = ann2; + datum.isNull[iii] = false; + datum.isNull[jjj] = false; + fakeData.add(datum); + } + } + + engine.evaluateData( fakeData, goodModel, false ); + engine.evaluateData( fakeData, badModel, true ); + + stream.print("surface <- c("); + for( final VariantDatum datum : fakeData ) { + stream.print(String.format("%.4f, %.4f, %.4f, ", + dataManager.denormalizeDatum(datum.annotations[iii], iii), + dataManager.denormalizeDatum(datum.annotations[jjj], jjj), + Math.min(4.0, Math.max(-4.0, datum.lod)))); + } + stream.println("NA,NA,NA)"); + stream.println("s <- matrix(surface,ncol=3,byrow=T)"); + + 
stream.print("data <- c("); + for( final VariantDatum datum : randomData ) { + stream.print(String.format("%.4f, %.4f, %.4f, %d, %d,", + dataManager.denormalizeDatum(datum.annotations[iii], iii), + dataManager.denormalizeDatum(datum.annotations[jjj], jjj), + (datum.lod < lodCutoff ? -1.0 : 1.0), + (datum.atAntiTrainingSite ? -1 : (datum.atTrainingSite ? 1 : 0)), (datum.isKnown ? 1 : -1))); + } + stream.println("NA,NA,NA,NA,1)"); + stream.println("d <- matrix(data,ncol=5,byrow=T)"); + + final String surfaceFrame = "sf." + annotationKeys[iii] + "." + annotationKeys[jjj]; + final String dataFrame = "df." + annotationKeys[iii] + "." + annotationKeys[jjj]; + + stream.println(surfaceFrame + " <- data.frame(x=s[,1], y=s[,2], lod=s[,3])"); + stream.println(dataFrame + " <- data.frame(x=d[,1], y=d[,2], retained=d[,3], training=d[,4], novelty=d[,5])"); + stream.println("dummyData <- " + dataFrame + "[1,]"); + stream.println("dummyData$x <- NaN"); + stream.println("dummyData$y <- NaN"); + stream.println("p <- ggplot(data=" + surfaceFrame + ", aes(x=x, y=y)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); + stream.println("p1 = p + opts(title=\"model PDF\") + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + geom_tile(aes(fill = lod)) + scale_fill_gradient(high=\"green\", low=\"red\")"); + stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=retained, alpha=I(1/7),legend=FALSE) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); + stream.println("q <- geom_point(aes(x=x,y=y,color=retained),data=dummyData, alpha=1.0, na.rm=TRUE)"); + stream.println("p2 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(name=\"outcome\", high=\"black\", low=\"red\",breaks=c(-1,1),labels=c(\"filtered\",\"retained\"))"); + 
stream.println("p <- qplot(x,y,data="+ dataFrame + "["+dataFrame+"$training != 0,], color=training, alpha=I(1/7)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); + stream.println("q <- geom_point(aes(x=x,y=y,color=training),data=dummyData, alpha=1.0, na.rm=TRUE)"); + stream.println("p3 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(high=\"green\", low=\"purple\",breaks=c(-1,1), labels=c(\"neg\", \"pos\"))"); + stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=novelty, alpha=I(1/7)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); + stream.println("q <- geom_point(aes(x=x,y=y,color=novelty),data=dummyData, alpha=1.0, na.rm=TRUE)"); + stream.println("p4 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(name=\"novelty\", high=\"blue\", low=\"red\",breaks=c(-1,1), labels=c(\"novel\",\"known\"))"); + stream.println("arrange(p1, p2, p3, p4, ncol=2)"); + } + } + stream.println("dev.off()"); + + stream.println("if (exists(\"compactPDF\")) {"); + stream.println("compactPDF(outputPDF)"); + stream.println("}"); + + stream.close(); + + // Execute Rscript command to generate the clustering plots + RScriptExecutor executor = new RScriptExecutor(); + executor.addScript(RSCRIPT_FILE); + logger.info("Executing: " + executor.getApproximateCommandLine()); + executor.exec(); + } + + // The Arrange function is how we place the 4 model plots on one page + // from http://gettinggeneticsdone.blogspot.com/2010/03/arrange-multiple-ggplot2-plots-in-same.html + private void createArrangeFunction( final PrintStream stream ) { + stream.println("vp.layout <- function(x, y) viewport(layout.pos.row=x, layout.pos.col=y)"); + stream.println("arrange <- function(..., nrow=NULL, ncol=NULL, 
as.table=FALSE) {"); + stream.println("dots <- list(...)"); + stream.println("n <- length(dots)"); + stream.println("if(is.null(nrow) & is.null(ncol)) { nrow = floor(n/2) ; ncol = ceiling(n/nrow)}"); + stream.println("if(is.null(nrow)) { nrow = ceiling(n/ncol)}"); + stream.println("if(is.null(ncol)) { ncol = ceiling(n/nrow)}"); + stream.println("grid.newpage()"); + stream.println("pushViewport(viewport(layout=grid.layout(nrow,ncol) ) )"); + stream.println("ii.p <- 1"); + stream.println("for(ii.row in seq(1, nrow)){"); + stream.println("ii.table.row <- ii.row "); + stream.println("if(as.table) {ii.table.row <- nrow - ii.table.row + 1}"); + stream.println("for(ii.col in seq(1, ncol)){"); + stream.println("ii.table <- ii.p"); + stream.println("if(ii.p > n) break"); + stream.println("print(dots[[ii.table]], vp=vp.layout(ii.table.row, ii.col))"); + stream.println("ii.p <- ii.p + 1"); + stream.println("}"); + stream.println("}"); + stream.println("}"); + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java new file mode 100644 index 000000000..81067e695 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java @@ -0,0 +1,136 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import org.broadinstitute.sting.commandline.Advanced; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Mar 4, 2011 + */ + +public class VariantRecalibratorArgumentCollection { + + public enum Mode { + SNP, + INDEL, + BOTH + } + + static Mode parseString(final String input) { + if( input.equals("SNP") ) { return Mode.SNP; } + if( input.equals("INDEL") ) { return Mode.INDEL; } + if( input.equals("BOTH") ) { return Mode.BOTH; } + throw new ReviewedStingException("VariantRecalibrator mode string is unrecognized, input = " + input); + } + + @Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels (emitting SNPs untouched in the output VCF); and 3.) 
BOTH for recalibrating both SNPs and indels simultaneously (for testing purposes only, not recommended for general use).", required = false) + public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP; + + @Advanced + @Argument(fullName="maxGaussians", shortName="mG", doc="The maximum number of Gaussians for the positive model to try during variational Bayes algorithm.", required=false) + public int MAX_GAUSSIANS = 8; + + @Advanced + @Argument(fullName="maxNegativeGaussians", shortName="mNG", doc="The maximum number of Gaussians for the negative model to try during variational Bayes algorithm. The actual maximum used is the min of the mG and mNG arguments. Note that this number should be small (like 4) to achieve the best results", required=false) + public int MAX_GAUSSIANS_FOR_NEGATIVE_MODEL = 2; + + @Advanced + @Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of VBEM iterations to be performed in variational Bayes algorithm. 
Procedure will normally end when convergence is detected.", required=false) + public int MAX_ITERATIONS = 150; + + @Advanced + @Argument(fullName="numKMeans", shortName="nKM", doc="The number of k-means iterations to perform in order to initialize the means of the Gaussians in the Gaussian mixture model.", required=false) + public int NUM_KMEANS_ITERATIONS = 100; + + @Advanced + @Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for building the Gaussian mixture model.", required=false) + public double STD_THRESHOLD = 10.0; + + @Advanced + @Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in the variational Bayes algorithm.", required=false) + public double SHRINKAGE = 1.0; + + @Advanced + @Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in the variational Bayes algorithm.", required=false) + public double DIRICHLET_PARAMETER = 0.001; + + @Advanced + @Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in the variational Bayes algorithm.", required=false) + public double PRIOR_COUNTS = 20.0; + + @Advanced + @Argument(fullName="maxNumTrainingData", shortName="maxNumTrainingData", doc="Maximum number of training data to be used in building the Gaussian mixture model. 
Training sets larger than this will be randomly downsampled.", required=false) + protected int MAX_NUM_TRAINING_DATA = 2500000; + + @Advanced + @Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum number of worst scoring variants to use when building the Gaussian mixture model of bad variants.", required=false) + public int MIN_NUM_BAD_VARIANTS = 1000; + + @Advanced + @Argument(fullName="badLodCutoff", shortName="badLodCutoff", doc="The LOD score below which variants are used when building the Gaussian mixture model of bad variants.", required=false) + public double BAD_LOD_CUTOFF = -5.0; + + ///////////////////////////// + // Deprecated Arguments + // Keeping them here is meant to provide users with error messages that are more informative than "arg not defined" when they use an argument that has been put out of service + ///////////////////////////// + + @Hidden + @Deprecated + @Argument(fullName="percentBadVariants", shortName="percentBad", doc="This argument is no longer used in GATK versions 2.7 and newer. Please see the online documentation for the latest usage recommendations.", required=false) + public double PERCENT_BAD_VARIANTS = 0.03; + + @Hidden + @Deprecated + @Argument(fullName="numBadVariants", shortName="numBad", doc="This argument is no longer used in GATK versions 2.8 and newer. 
Please see the online documentation for the latest usage recommendations.", required=false) + public int NUM_BAD_VARIANTS = 1000; +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java new file mode 100644 index 000000000..dae3bffa5 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java @@ -0,0 +1,172 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; + +import java.util.List; + +/** + * Created by IntelliJ IDEA. 
+ * User: rpoplin + * Date: Mar 4, 2011 + */ + +public class VariantRecalibratorEngine { + + ///////////////////////////// + // Private Member Variables + ///////////////////////////// + + protected final static Logger logger = Logger.getLogger(VariantRecalibratorEngine.class); + public final static double MIN_ACCEPTABLE_LOD_SCORE = -20000.0; + + // the unified argument collection + final private VariantRecalibratorArgumentCollection VRAC; + + private final static double MIN_PROB_CONVERGENCE = 2E-3; + + ///////////////////////////// + // Public Methods to interface with the Engine + ///////////////////////////// + + public VariantRecalibratorEngine( final VariantRecalibratorArgumentCollection VRAC ) { + this.VRAC = VRAC; + } + + public GaussianMixtureModel generateModel( final List data, final int maxGaussians ) { + if( data == null || data.isEmpty() ) { throw new IllegalArgumentException("No data found."); } + if( maxGaussians <= 0 ) { throw new IllegalArgumentException("maxGaussians must be a positive integer but found: " + maxGaussians); } + + final GaussianMixtureModel model = new GaussianMixtureModel( maxGaussians, data.get(0).annotations.length, VRAC.SHRINKAGE, VRAC.DIRICHLET_PARAMETER, VRAC.PRIOR_COUNTS ); + variationalBayesExpectationMaximization( model, data ); + return model; + } + + public void evaluateData( final List data, final GaussianMixtureModel model, final boolean evaluateContrastively ) { + if( !model.isModelReadyForEvaluation ) { + try { + model.precomputeDenominatorForEvaluation(); + } catch( Exception e ) { + model.failedToConverge = true; + return; + } + } + + logger.info("Evaluating full set of " + data.size() + " variants..."); + for( final VariantDatum datum : data ) { + final double thisLod = evaluateDatum( datum, model ); + if( Double.isNaN(thisLod) ) { + model.failedToConverge = true; + return; + } + + datum.lod = ( evaluateContrastively ? + ( Double.isInfinite(datum.lod) ? 
// positive model said negative infinity + ( MIN_ACCEPTABLE_LOD_SCORE + GenomeAnalysisEngine.getRandomGenerator().nextDouble() * MIN_ACCEPTABLE_LOD_SCORE ) // Negative infinity lod values are possible when covariates are extremely far away from their tight Gaussians + : datum.prior + datum.lod - thisLod) // contrastive evaluation: (prior + positive model - negative model) + : thisLod ); // positive model only so set the lod and return + } + } + + public void calculateWorstPerformingAnnotation( final List data, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel ) { + for( final VariantDatum datum : data ) { + int worstAnnotation = -1; + double minProb = Double.MAX_VALUE; + for( int iii = 0; iii < datum.annotations.length; iii++ ) { + final Double goodProbLog10 = goodModel.evaluateDatumInOneDimension(datum, iii); + final Double badProbLog10 = badModel.evaluateDatumInOneDimension(datum, iii); + if( goodProbLog10 != null && badProbLog10 != null ) { + final double prob = goodProbLog10 - badProbLog10; + if(prob < minProb) { minProb = prob; worstAnnotation = iii; } + } + } + datum.worstAnnotation = worstAnnotation; + } + } + + + ///////////////////////////// + // Private Methods used for generating a GaussianMixtureModel + ///////////////////////////// + + private void variationalBayesExpectationMaximization( final GaussianMixtureModel model, final List data ) { + + model.initializeRandomModel( data, VRAC.NUM_KMEANS_ITERATIONS ); + + // The VBEM loop + model.normalizePMixtureLog10(); + model.expectationStep( data ); + double currentChangeInMixtureCoefficients; + int iteration = 0; + logger.info("Finished iteration " + iteration + "."); + while( iteration < VRAC.MAX_ITERATIONS ) { + iteration++; + model.maximizationStep( data ); + currentChangeInMixtureCoefficients = model.normalizePMixtureLog10(); + model.expectationStep( data ); + if( iteration % 5 == 0 ) { // cut down on the number of output lines so that users can read the warning messages + 
logger.info("Finished iteration " + iteration + ". \tCurrent change in mixture coefficients = " + String.format("%.5f", currentChangeInMixtureCoefficients)); + } + if( iteration > 2 && currentChangeInMixtureCoefficients < MIN_PROB_CONVERGENCE ) { + logger.info("Convergence after " + iteration + " iterations!"); + break; + } + } + + model.evaluateFinalModelParameters( data ); + } + + ///////////////////////////// + // Private Methods used for evaluating data given a GaussianMixtureModel + ///////////////////////////// + + private double evaluateDatum( final VariantDatum datum, final GaussianMixtureModel model ) { + return model.evaluateDatum( datum ); + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriors.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriors.java new file mode 100644 index 000000000..3d1a9da57 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriors.java @@ -0,0 +1,264 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.variant.vcf.*; +import java.util.*; + +/** + * Calculates genotype posterior likelihoods given panel data + * + *

+ * Given a VCF with genotype likelihoods from the HaplotypeCaller, UnifiedGenotyper, or another source which provides + * -unbiased- GLs, calculate the posterior genotype state and likelihood given allele frequency information from + * both the samples themselves and input VCFs describing allele frequencies in related populations. + * + * VCFs to use for informing the genotype likelihoods (e.g. a population-specific VCF from 1000 genomes) should have + * at least one of: + * - AC field and AN field + * - MLEAC field and AN field + * - genotypes + * + * The AF field will not be used in this calculation as it does not provide a way to estimate the confidence interval + * or uncertainty around the allele frequency, while AN provides this necessary information. This uncertainty is + * modeled by a Dirichlet distribution: that is, the frequency is known up to a Dirichlet distribution with + * parameters AC1+q,AC2+q,...,(AN-AC1-AC2-...)+q, where "q" is the global frequency prior (typically q << 1). The + * genotype priors applied then follow a Dirichlet-Multinomial distribution, where 2 alleles per sample are drawn + * independently. This assumption of independent draws is the assumption of Hardy-Weinberg Equilibrium. Thus, HWE is + * imposed on the likelihoods as a result of CalculateGenotypePosteriors. + * + *

Input

+ *

+ * A VCF with genotype likelihoods, and optionally genotypes, AC/AN fields, or MLEAC/AN fields + *

+ * + *

+ * A collection of VCFs to use for informing allele frequency priors. Each VCF must have one of + * - AC field and AN field + * - MLEAC field and AN field + * - genotypes + *

+ * + *

Output

+ *

+ * A new VCF with: + * 1) Genotype posteriors added to the genotype fields ("GP") + * 2) Genotypes and GQ assigned according to these posteriors + * 3) Per-site genotype priors added to the INFO field ("PG") + *

+ * + *

Examples

+ *
+ * Inform the genotype assignment of NA12878 using the 1000G Euro panel
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CalculateGenotypePosteriors \
+ *   -V NA12878.wgs.HC.vcf \
+ *   -VV 1000G_EUR.genotypes.combined.vcf \
+ *   -o NA12878.wgs.HC.posteriors.vcf
+ *
+ * Refine the genotypes of a large panel based on the discovered allele frequency
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CalculateGenotypePosteriors \
+ *   -V input.vcf \
+ *   -o output.withPosteriors.vcf
+ *
+ * Apply frequency and HWE-based priors to the genotypes of a family without including the family allele counts
+ * in the allele frequency estimates
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CalculateGenotypePosteriors \
+ *   -V input.vcf \
+ *   -o output.withPosteriors.vcf \
+ *   --ignoreInputSamples
+ *
+ * Calculate the posterior genotypes of a callset, and impose that a variant *not seen* in the external panel
+ * is tantamount to being AC=0, AN=100 within that panel
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CalculateGenotypePosteriors \
+ *   -VV external.panel.vcf \
+ *   -V input.vcf \
+ *   -o output.withPosteriors.vcf \
+ *   --numRefSamplesIfNoCall 100
+ *
+ * 
+ * + */ +public class CalculateGenotypePosteriors extends RodWalker { + + /** + * The input VCF (posteriors will be calculated for these samples, and written to the output) + */ + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + /** + * Supporting external panels. Allele counts from these panels (taken from AC,AN or MLEAC,AN or raw genotypes) will + * be used to inform the frequency distribution underying the genotype priors. + */ + @Input(fullName="supporting", shortName = "VV", doc="Other callsets to use in generating genotype posteriors", required=false) + public List> supportVariants = new ArrayList>(); + + /** + * The global prior of a variant site -- i.e. the expected allele frequency distribution knowing only that N alleles + * exist, and having observed none of them. This is the "typical" 1/x trend, modeled here as not varying + * across alleles. The calculation for this parameter is (Effective population size) * (steady state mutation rate) + * + */ + @Argument(fullName="globalPrior",shortName="G",doc="The global Dirichlet prior parameters for the allele frequency",required=false) + public double globalPrior = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY; + + /** + * When a variant is not seen in a panel, whether to infer (and with what effective strength) that only reference + * alleles were ascertained at that site. E.g. "If not seen in 1000Genomes, treat it as AC=0, AN=2000". 
This is + * applied across all external panels, so if numRefIsMissing = 10, and the variant is absent in two panels, this + * confers evidence of AC=0,AN=20 + */ + @Argument(fullName="numRefSamplesIfNoCall",shortName="nrs",doc="The number of homozygous reference to infer were " + + "seen at a position where an \"other callset\" contains no site or genotype information",required=false) + public int numRefIfMissing = 1; + + /** + * Rather than looking for the MLEAC field first, and then falling back to AC; first look for the AC field and then + * fall back to MLEAC or raw genotypes + */ + @Argument(fullName="defaultToAC",shortName="useAC",doc="Use the AC field as opposed to MLEAC. Does nothing if VCF lacks MLEAC field",required=false) + public boolean defaultToAC = false; + + /** + * Do not use the [MLE] allele count from the input samples (the ones for which you're calculating posteriors) + * in the site frequency distribution; only use the AC and AN calculated from external sources. + */ + @Argument(fullName="ignoreInputSamples",shortName="ext",doc="Use external information only; do not inform genotype priors by "+ + "the discovered allele frequency in the callset whose posteriors are being calculated. Useful for callsets containing "+ + "related individuals.",required=false) + public boolean ignoreInputSamples = false; + + @Output(doc="File to which variants should be written") + protected VariantContextWriter vcfWriter = null; + + private final boolean NO_EM = false; + + public void initialize() { + // Get list of samples to include in the output + final List rodNames = Arrays.asList(variantCollection.variants.getName()); + + final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); + + if ( vcfRods.size() > 1 ) + throw new IllegalStateException("Somehow more than one variant was bound?"); + + final VCFHeader header = new ArrayList<>(vcfRods.values()).get(0); // pure laziness + + if ( ! 
header.hasGenotypingData() ) { + throw new UserException("VCF has no genotypes"); + } + + if ( header.hasInfoLine(VCFConstants.MLE_ALLELE_COUNT_KEY) ) { + final VCFInfoHeaderLine mleLine = header.getInfoHeaderLine(VCFConstants.MLE_ALLELE_COUNT_KEY); + if ( mleLine.getCountType() != VCFHeaderLineCount.A ) { + throw new UserException("VCF does not have a properly formatted MLEAC field: the count type should be \"A\""); + } + + if ( mleLine.getType() != VCFHeaderLineType.Integer ) { + throw new UserException("VCF does not have a properly formatted MLEAC field: the field type should be \"Integer\""); + } + } + + final TreeSet vcfSamples = new TreeSet<>(SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); + + // Initialize VCF header + final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); + headerLines.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_POSTERIORS_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Posterior Genotype Likelihoods")); + headerLines.add(new VCFInfoHeaderLine("PG", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Genotype Likelihood Prior")); + headerLines.add(new VCFHeaderLine("source", "CalculateGenotypePosteriors")); + + vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); + } + + public Integer reduceInit() { return 0; } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null || context == null || ref == null ) { + return 0; + } + + final Collection vcs = tracker.getValues(variantCollection.variants, ref.getLocus()); + + final Collection otherVCs = tracker.getValues(supportVariants, context.getLocation()); + + final int missing = supportVariants.size() - otherVCs.size(); + + for ( VariantContext vc : vcs ) { + vcfWriter.add(PosteriorLikelihoodsUtils.calculatePosteriorGLs(vc, otherVCs, missing * numRefIfMissing, globalPrior, !ignoreInputSamples, NO_EM, defaultToAC)); + } + + return 1; + } + + 
public Integer reduce(Integer l, Integer r) { return r + l; } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFs.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFs.java new file mode 100644 index 000000000..0f577cb23 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFs.java @@ -0,0 +1,326 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.*; + +import java.util.*; + +/** + * Combines any number of gVCF files that were produced by the Haplotype Caller into a single joint gVCF file. + * + *

+ * CombineGVCFs is meant to be used for hierarchical merging of gVCFs that will eventually be input into GenotypeGVCFs. + * One would use this tool when needing to genotype too large a number of individual gVCFs; instead of passing them + * all in to GenotypeGVCFs, one would first use CombineGVCFs on smaller batches of samples and then pass these combined + * gVCFs to GenotypeGVCFs. + * + * Note that this tool cannot work with just any gVCF files - they must have been produced with the Haplotype Caller + * as part of the "single sample discovery" pipeline using the '-ERC GVCF' mode, which uses a sophisticated reference + * model to produce accurate genotype likelihoods for every position in the target. + * + *

Input

+ *

+ * One or more Haplotype Caller gVCFs to combine. + *

+ * + *

Output

+ *

+ * A combined VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CombineGVCFs \
+ *   --variant gvcf1.vcf \
+ *   --variant gvcf2.vcf \
+ *   -o mergeGvcf.vcf
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=0,stop=1)) +public class CombineGVCFs extends RodWalker { + + protected final class PositionalState { + final List VCs; + final byte[] refBases; + final GenomeLoc loc; + public PositionalState(final List VCs, final byte[] refBases, final GenomeLoc loc) { + this.VCs = VCs; + this.refBases = refBases; + this.loc = loc; + } + } + + protected final class OverallState { + final LinkedList VCs = new LinkedList<>(); + GenomeLoc prevPos = null; + byte refAfterPrevPos; + + public OverallState() {} + } + + /** + * The gVCF files to merge together + */ + @Input(fullName="variant", shortName = "V", doc="One or more input gVCF files", required=true) + public List> variantCollections; + final private List> variants = new ArrayList<>(); + + @Output(doc="File to which the combined gVCF should be written") + protected VariantContextWriter vcfWriter = null; + + private GenomeLocParser genomeLocParser; + + public void initialize() { + // take care of the VCF headers + final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); + final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); + + final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + final VCFHeader vcfHeader = new VCFHeader(headerLines, samples); + vcfWriter.writeHeader(vcfHeader); + + // collect the actual rod bindings into a list for use later + for ( final RodBindingCollection variantCollection : variantCollections ) + variants.addAll(variantCollection.getRodBindings()); + + genomeLocParser = getToolkit().getGenomeLocParser(); + } + + public PositionalState map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + if ( tracker == null ) // RodWalkers can make funky map calls + return null; + + final GenomeLoc loc = ref.getLocus(); + return 
new PositionalState(tracker.getValues(variants, loc), ref.getBases(), loc); + } + + public OverallState reduceInit() { + return new OverallState(); + } + + public OverallState reduce(final PositionalState startingStates, final OverallState previousState) { + if ( startingStates == null ) + return previousState; + + final int currentPos = startingStates.loc.getStart(); + + if ( !startingStates.VCs.isEmpty() ) { + if ( ! okayToSkipThisSite(currentPos, previousState.prevPos) ) + endPreviousStates(previousState, currentPos - 1, startingStates.refBases[0]); + previousState.VCs.addAll(startingStates.VCs); + } + + if ( containsEndingContext(previousState.VCs, currentPos) ) { + endPreviousStates(previousState, currentPos, startingStates.refBases.length > 1 ? startingStates.refBases[1] : (byte)'N'); + } + + return previousState; + } + + /** + * Is it okay to skip the given position? + * + * @param thisPos this position + * @param lastPosRun the last position for which we created a VariantContext + * @return true if it is okay to skip this position, false otherwise + */ + private boolean okayToSkipThisSite(final int thisPos, final GenomeLoc lastPosRun) { + return lastPosRun != null && thisPos == lastPosRun.getStart() + 1; + } + + /** + * Does the given list of VariantContexts contain any whose context ends at the given position? + * + * @param VCs list of VariantContexts + * @param pos the position to check against + * @return true if there are one or more VCs that end at pos, false otherwise + */ + private boolean containsEndingContext(final List VCs, final int pos) { + if ( VCs == null ) throw new IllegalArgumentException("The list of VariantContexts cannot be null"); + + for ( final VariantContext vc : VCs ) { + if ( isEndingContext(vc, pos) ) + return true; + } + return false; + } + + /** + * Does the given variant context end (in terms of reference blocks, not necessarily formally) at the given position. 
+ * Note that for the purposes of this method/tool, deletions are considered to be single base events (as opposed to + * reference blocks), hence the check for the number of alleles (because we know there will always be a allele). + * + * @param vc the variant context + * @param pos the position to query against + * @return true if this variant context "ends" at this position, false otherwise + */ + private boolean isEndingContext(final VariantContext vc, final int pos) { + return vc.getNAlleles() > 2 || vc.getEnd() == pos; + } + + /** + * Disrupt the VariantContexts so that they all stop at the given pos, write them out, and put the remainder back in the list. + * + * @param state the state with list of VariantContexts + * @param pos the target ending position + * @param refBase the reference base to use at the position AFTER pos + */ + private void endPreviousStates(final OverallState state, final int pos, final byte refBase) { + + final List stoppedVCs = new ArrayList<>(state.VCs.size()); + + for ( int i = state.VCs.size() - 1; i >= 0; i-- ) { + final VariantContext vc = state.VCs.get(i); + if ( vc.getStart() <= pos ) { + + stoppedVCs.add(vc); + + // if it was ending anyways, then remove it from the future state + if ( isEndingContext(vc, pos) ) + state.VCs.remove(i); + } + } + + if ( !stoppedVCs.isEmpty() ) { + final GenomeLoc gLoc = genomeLocParser.createGenomeLoc(stoppedVCs.get(0).getChr(), pos); + + // we need the specialized merge if the site contains anything other than ref blocks + final VariantContext mergedVC; + if ( containsTrueAltAllele(stoppedVCs) ) + mergedVC = GATKVariantContextUtils.referenceConfidenceMerge(stoppedVCs, gLoc, refBase, false); + else + mergedVC = referenceBlockMerge(stoppedVCs, state, pos); + + vcfWriter.add(mergedVC); + state.prevPos = gLoc; + state.refAfterPrevPos = refBase; + } + } + + /** + * Combine a list of reference block VariantContexts. 
+ * We can't use GATKVariantContextUtils.simpleMerge() because it is just too slow for this sort of thing. + * + * @param VCs the variant contexts to merge + * @param state the state object + * @param end the end of this block (inclusive) + * @return a new merged VariantContext + */ + private VariantContext referenceBlockMerge(final List VCs, final OverallState state, final int end) { + + final VariantContext first = VCs.get(0); + + // ref allele and start + final Allele refAllele; + final int start; + if ( state.prevPos == null || !state.prevPos.getContig().equals(first.getChr()) || first.getStart() >= state.prevPos.getStart() + 1) { + start = first.getStart(); + refAllele = first.getReference(); + } else { + start = state.prevPos.getStart() + 1; + refAllele = Allele.create(state.refAfterPrevPos, true); + } + + // attributes + final Map attrs = new HashMap<>(1); + attrs.put(VCFConstants.END_KEY, Integer.toString(end)); + + // genotypes + final GenotypesContext genotypes = GenotypesContext.create(); + for ( final VariantContext vc : VCs ) { + for ( final Genotype g : vc.getGenotypes() ) + genotypes.add(new GenotypeBuilder(g).alleles(Arrays.asList(refAllele, refAllele)).make()); + } + + return new VariantContextBuilder("", first.getChr(), start, end, Arrays.asList(refAllele, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE)).attributes(attrs).genotypes(genotypes).make(); + } + + /** + * Does the given list of VariantContexts contain any with an alternate allele other than ? 
+ * + * @param VCs list of VariantContexts + * @return true if there are one or more VCs that contain a true alternate allele, false otherwise + */ + private boolean containsTrueAltAllele(final List VCs) { + if ( VCs == null ) throw new IllegalArgumentException("The list of VariantContexts cannot be null"); + + for ( final VariantContext vc : VCs ) { + if ( vc.getNAlleles() > 2 ) + return true; + } + return false; + } + + @Override + public void onTraversalDone(final OverallState state) { + // there shouldn't be any state left unless the user cut in the middle of a gVCF block + if ( !state.VCs.isEmpty() ) + logger.warn("You have asked for an interval that cuts in the middle of one or more gVCF blocks. Please note that this will cause you to lose records that don't end within your interval."); + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFs.java new file mode 100644 index 000000000..a6d151df8 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFs.java @@ -0,0 +1,276 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import 
org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.*; + +import java.util.*; + +/** + * Genotypes any number of gVCF files that were produced by the Haplotype Caller into a single joint VCF file. + * + *

+ * GenotypeGVCFs merges gVCF records that were produced as part of the "single sample discovery" pipeline using + * the '-ERC GVCF' mode of the Haplotype Caller. This tool performs the multi-sample joint aggregation + * step and merges the records together in a sophisticated manner. + * + * At all positions of the target, this tool will combine all spanning records, produce correct genotype likelihoods, + * re-genotype the newly merged record, and then re-annotate it. + * + * Note that this tool cannot work with just any gVCF files - they must have been produced with the Haplotype Caller, + * which uses a sophisticated reference model to produce accurate genotype likelihoods for every position in the target. + * + *

Input

+ *

+ * One or more Haplotype Caller gVCFs to genotype. + *

+ * + *

Output

+ *

+ * A combined, genotyped VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T GenotypeGVCFs \
+ *   --variant gvcf1.vcf \
+ *   --variant gvcf2.vcf \
+ *   -o output.vcf
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=-10,stop=10)) +public class GenotypeGVCFs extends RodWalker implements AnnotatorCompatible, TreeReducible { + + /** + * The gVCF files to merge together + */ + @Input(fullName="variant", shortName = "V", doc="One or more input gVCF files", required=true) + public List> variantCollections; + final private List> variants = new ArrayList<>(); + + @Output(doc="File to which variants should be written") + protected VariantContextWriter vcfWriter = null; + + // TODO -- currently this option doesn't actually work; must fix + @Argument(fullName="includeNonVariants", shortName="inv", doc="Include loci found to be non-variant after the combining procedure", required=false) + public boolean INCLUDE_NON_VARIANTS = false; + + /** + * Which annotations to recompute for the combined output VCF file. + */ + @Advanced + @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to recompute", required=false) + protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"InbreedingCoeff", "FisherStrand", "QualByDepth", "ChromosomeCounts"})); + + /** + * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. + * dbSNP is not used in any way for the calculations themselves. 
+ */ + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } + + // the genotyping engine + private UnifiedGenotyperEngine genotypingEngine; + // the annotation engine + private VariantAnnotatorEngine annotationEngine; + + public List> getCompRodBindings() { return Collections.emptyList(); } + public RodBinding getSnpEffRodBinding() { return null; } + public List> getResourceRodBindings() { return Collections.emptyList(); } + public boolean alwaysAppendDbsnpId() { return false; } + + + public void initialize() { + // create the annotation engine + annotationEngine = new VariantAnnotatorEngine(Arrays.asList("none"), annotationsToUse, Collections.emptyList(), this, getToolkit()); + + // take care of the VCF headers + final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); + final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); + headerLines.addAll(annotationEngine.getVCFAnnotationDescriptions()); + VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.MLE_ALLELE_COUNT_KEY, VCFConstants.MLE_ALLELE_FREQUENCY_KEY); + if ( dbsnp != null && dbsnp.dbsnp.isBound() ) + VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY); + + final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + final VCFHeader vcfHeader = new VCFHeader(headerLines, samples); + vcfWriter.writeHeader(vcfHeader); + + // create the genotyping engine + genotypingEngine = new UnifiedGenotyperEngine(getToolkit(), new UnifiedArgumentCollection(), logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); + + // collect the actual rod bindings into a list for use later + for ( final RodBindingCollection variantCollection : variantCollections ) + variants.addAll(variantCollection.getRodBindings()); + } + + public VariantContext map(final 
RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + if ( tracker == null ) // RodWalkers can make funky map calls + return null; + + final GenomeLoc loc = ref.getLocus(); + final VariantContext combinedVC = GATKVariantContextUtils.referenceConfidenceMerge(tracker.getPrioritizedValue(variants, loc), loc, INCLUDE_NON_VARIANTS ? ref.getBase() : null, true); + if ( combinedVC == null ) + return null; + + return regenotypeVC(tracker, ref, combinedVC); + } + + /** + * Re-genotype (and re-annotate) a combined genomic VC + * + * @param tracker the ref tracker + * @param ref the ref context + * @param originalVC the combined genomic VC + * @return a new VariantContext or null if the site turned monomorphic and we don't want such sites + */ + protected VariantContext regenotypeVC(final RefMetaDataTracker tracker, final ReferenceContext ref, final VariantContext originalVC) { + if ( originalVC == null ) throw new IllegalArgumentException("originalVC cannot be null"); + + VariantContext result = originalVC; + + // only re-genotype polymorphic sites + if ( result.isVariant() ) { + VariantContext regenotypedVC = genotypingEngine.calculateGenotypes(result); + if ( regenotypedVC == null ) + return null; + + regenotypedVC = GATKVariantContextUtils.reverseTrimAlleles(regenotypedVC); + + // we want to carry forward the attributes from the original VC but make sure to add the MLE-based annotations + final Map attrs = new HashMap<>(originalVC.getAttributes()); + attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY)); + attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY)); + + result = new VariantContextBuilder(regenotypedVC).attributes(attrs).make(); + } + + // if it turned monomorphic and we don't want such sites, quit + if ( !INCLUDE_NON_VARIANTS && result.isMonomorphicInSamples() ) + return null; + + // re-annotate it + result = 
annotationEngine.annotateContext(tracker, ref, null, result); + + // fix some of the annotations + return new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result.getGenotypes())).make(); + } + + /** + * Cleans up genotype-level annotations that need to be updated. + * 1. move MIN_DP to DP if present + * 2. remove SB is present + * + * @param newGs the new Genotypes to fix + * @return a new set of Genotypes + */ + private List cleanupGenotypeAnnotations(final GenotypesContext newGs) { + final List recoveredGs = new ArrayList<>(newGs.size()); + for ( final Genotype newG : newGs ) { + final Map attrs = new HashMap<>(newG.getExtendedAttributes()); + + final GenotypeBuilder builder = new GenotypeBuilder(newG); + + // move the MIN_DP to DP + if ( newG.hasExtendedAttribute("MIN_DP") ) { + builder.DP(newG.getAttributeAsInt("MIN_DP", 0)); + attrs.remove("MIN_DP"); + } + + // remove SB + attrs.remove("SB"); + + recoveredGs.add(builder.noAttributes().attributes(attrs).make()); + } + return recoveredGs; + } + + public VariantContextWriter reduceInit() { + return vcfWriter; + } + + public VariantContextWriter reduce(final VariantContext vc, final VariantContextWriter writer) { + if ( vc != null ) + writer.add(vc); + return writer; + } + + @Override + public VariantContextWriter treeReduce(final VariantContextWriter lhs, final VariantContextWriter rhs) { + return lhs; + } + + @Override + public void onTraversalDone(final VariantContextWriter writer) {} +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtils.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtils.java new file mode 100644 index 000000000..d9b0c575e --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtils.java @@ -0,0 +1,308 @@ +/* +* By downloading the PROGRAM you agree to 
the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFConstants; + +import java.util.*; + +public class PosteriorLikelihoodsUtils { + + public static VariantContext calculatePosteriorGLs(final VariantContext vc1, + final Collection resources, + final int numRefSamplesFromMissingResources, + final double globalFrequencyPriorDirichlet, + final boolean useInputSamples, + final boolean useEM, + final boolean useAC) { + if ( useEM ) + throw new IllegalArgumentException("EM loop for posterior GLs not yet implemented"); + + final Map totalAlleleCounts = new HashMap<>(); + + //store the allele counts for each allele in the variant priors + for ( final VariantContext resource : resources ) { + addAlleleCounts(totalAlleleCounts,resource,useAC); + } + + //add the allele counts from the input samples (if applicable) + if ( useInputSamples ) { + addAlleleCounts(totalAlleleCounts,vc1,useAC); + } + + //add zero allele counts for any reference alleles not seen 
in priors (if applicable) + totalAlleleCounts.put(vc1.getReference(),totalAlleleCounts.get(vc1.getReference())+numRefSamplesFromMissingResources); + + // now extract the counts of the alleles present within vc1, and in order + final double[] alleleCounts = new double[vc1.getNAlleles()]; + int alleleIndex = 0; + for ( final Allele allele : vc1.getAlleles() ) { + + alleleCounts[alleleIndex++] = globalFrequencyPriorDirichlet + ( totalAlleleCounts.containsKey(allele) ? + totalAlleleCounts.get(allele) : 0 ); + } + + //parse the likelihoods for each sample's genotype + final List likelihoods = new ArrayList<>(vc1.getNSamples()); + for ( final Genotype genotype : vc1.getGenotypes() ) { + likelihoods.add(genotype.hasLikelihoods() ? genotype.getLikelihoods().getAsVector() : null ); + } + + final List posteriors = calculatePosteriorGLs(likelihoods,alleleCounts,vc1.getMaxPloidy(2)); + + final GenotypesContext newContext = GenotypesContext.create(); + for ( int genoIdx = 0; genoIdx < vc1.getNSamples(); genoIdx ++ ) { + final GenotypeBuilder builder = new GenotypeBuilder(vc1.getGenotype(genoIdx)); + if ( posteriors.get(genoIdx) != null ) { + GATKVariantContextUtils.updateGenotypeAfterSubsetting(vc1.getAlleles(), builder, + GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, posteriors.get(genoIdx), vc1.getAlleles()); + builder.attribute(VCFConstants.GENOTYPE_POSTERIORS_KEY, + Utils.listFromPrimitives(GenotypeLikelihoods.fromLog10Likelihoods(posteriors.get(genoIdx)).getAsPLs())); + } + newContext.add(builder.make()); + } + + final List priors = Utils.listFromPrimitives( + GenotypeLikelihoods.fromLog10Likelihoods(getDirichletPrior(alleleCounts, vc1.getMaxPloidy(2))).getAsPLs()); + + final VariantContextBuilder builder = new VariantContextBuilder(vc1).genotypes(newContext).attribute("PG", priors); + // add in the AC, AF, and AN attributes + VariantContextUtils.calculateChromosomeCounts(builder, true); + return builder.make(); + } + + /** + * Given genotype 
likelihoods and known allele counts, calculate the posterior likelihoods + * over the genotype states + * @param genotypeLikelihoods - the genotype likelihoods for the individual + * @param knownAlleleCountsByAllele - the known allele counts in the population. For AC=2 AN=12 site, this is {10,2} + * @param ploidy - the ploidy to assume + * @return - the posterior genotype likelihoods + */ + protected static List calculatePosteriorGLs(final List genotypeLikelihoods, + final double[] knownAlleleCountsByAllele, + final int ploidy) { + if ( ploidy != 2 ) { + throw new IllegalStateException("Genotype posteriors not yet implemented for ploidy != 2"); + } + + final double[] genotypePriorByAllele = getDirichletPrior(knownAlleleCountsByAllele,ploidy); + final List posteriors = new ArrayList<>(genotypeLikelihoods.size()); + for ( final double[] likelihoods : genotypeLikelihoods ) { + double[] posteriorProbabilities = null; + + if ( likelihoods != null ) { + if ( likelihoods.length != genotypePriorByAllele.length ) { + throw new IllegalStateException(String.format("Likelihoods not of correct size: expected %d, observed %d", + knownAlleleCountsByAllele.length*(knownAlleleCountsByAllele.length+1)/2,likelihoods.length)); + } + + posteriorProbabilities = new double[genotypePriorByAllele.length]; + for ( int genoIdx = 0; genoIdx < likelihoods.length; genoIdx ++ ) { + posteriorProbabilities[genoIdx] = likelihoods[genoIdx] + genotypePriorByAllele[genoIdx]; + } + + posteriorProbabilities = MathUtils.normalizeFromLog10(posteriorProbabilities, true); + + } + + posteriors.add(posteriorProbabilities); + } + + return posteriors; + } + + // convenience function for a single genotypelikelihoods array. Just wraps. 
+ protected static double[] calculatePosteriorGLs(final double[] genotypeLikelihoods, + final double[] knownAlleleCountsByAllele, + final int ploidy) { + return calculatePosteriorGLs(Arrays.asList(genotypeLikelihoods),knownAlleleCountsByAllele,ploidy).get(0); + } + + + /** + * Given known allele counts (whether external, from the sample, or both), calculate the prior distribution + * over genotype states. This assumes + * 1) Random sampling of alleles (known counts are unbiased, and frequency estimate is Dirichlet) + * 2) Genotype states are independent (Hardy-Weinberg) + * These assumptions give rise to a Dirichlet-Multinomial distribution of genotype states as a prior + * (the "number of trials" for the multinomial is simply the ploidy) + * @param knownCountsByAllele - the known counts per allele. For an AC=2, AN=12 site this is {10,2} + * @param ploidy - the number of chromosomes in the sample. For now restricted to 2. + * @return - the Dirichlet-Multinomial distribution over genotype states + */ + protected static double[] getDirichletPrior(final double[] knownCountsByAllele, final int ploidy) { + if ( ploidy != 2 ) { + throw new IllegalStateException("Genotype priors not yet implemented for ploidy != 2"); + } + + // multi-allelic format is + // AA AB BB AC BC CC AD BD CD DD ... 
+ final double sumOfKnownCounts = MathUtils.sum(knownCountsByAllele); + final double[] priors = new double[knownCountsByAllele.length*(knownCountsByAllele.length+1)/2]; + int priorIndex = 0; + for ( int allele2 = 0; allele2 < knownCountsByAllele.length; allele2++ ) { + for ( int allele1 = 0; allele1 <= allele2; allele1++) { + final int[] counts = new int[knownCountsByAllele.length]; + counts[allele1] += 1; + counts[allele2] += 1; + priors[priorIndex++] = MathUtils.dirichletMultinomial(knownCountsByAllele,sumOfKnownCounts,counts,ploidy); + } + } + + return priors; + } + + /** + * Parse counts for each allele + * @param counts - Map to store and return data + * @param context - line to be parsed from the input VCF file + * @param useAC - use allele count annotation value from VariantContext (vs. MLEAC) + */ + private static void addAlleleCounts(final Map counts, final VariantContext context, final boolean useAC) { + final int[] ac; + //use MLEAC value... + if ( context.hasAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY) && ! 
useAC ) { + ac = getAlleleCounts(VCFConstants.MLE_ALLELE_COUNT_KEY, context); + } + //...unless specified by the user in useAC or unless MLEAC is absent + else if ( context.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { + ac = getAlleleCounts(VCFConstants.ALLELE_COUNT_KEY, context); + } + //if VariantContext annotation doesn't contain AC or MLEAC then get the data from direct evaluation + else { + ac = new int[context.getAlternateAlleles().size()]; + int idx = 0; + for ( final Allele allele : context.getAlternateAlleles() ) { + ac[idx++] = context.getCalledChrCount(allele); + } + } + + //since the allele count for the reference allele is not given in the VCF format, + //calculate it from the allele number minus the total counts for alternate alleles + for ( final Allele allele : context.getAlleles() ) { + final int count; + if ( allele.isReference() ) { + if ( context.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) { + count = Math.max(context.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY,-1) - (int) MathUtils.sum(ac),0); //occasionally an MLEAC value will sneak in that's greater than the AN + } else { + count = Math.max(context.getCalledChrCount() - (int) MathUtils.sum(ac),0); + } + } else { + count = ac[context.getAlternateAlleles().indexOf(allele)]; + } + //if this allele isn't in the map yet, add it + if ( ! 
counts.containsKey(allele) ) { + counts.put(allele,0); + } + //add the count for the current allele to the existing value in the map + counts.put(allele,count + counts.get(allele)); + } + } + + /** + * Retrieve allele count data from VariantContext using VCFkey, checks for correct number of values in VCF + * @param VCFkey VariantContext annotation tag of interest (should be AC or MLEAC) + * @param context VariantContext from which to extract the data + * @return int[] with allele count data + */ + private static int[] getAlleleCounts(final String VCFkey, final VariantContext context) { + final Object alleleCountsFromVCF = context.getAttribute(VCFkey); + if ( alleleCountsFromVCF instanceof List ) { + if ( ((List) alleleCountsFromVCF).size() != context.getAlternateAlleles().size() ) + throw new UserException(String.format("Variant does not contain the same number of MLE allele counts as alternate alleles for record at %s:%d", context.getChr(), context.getStart())); + } + else if ( alleleCountsFromVCF instanceof String || alleleCountsFromVCF instanceof Integer) {//here length is 1 + if (context.getAlternateAlleles().size() != 1) + throw new UserException(String.format("Variant does not contain the same number of MLE allele counts as alternate alleles for record at %s:%d", context.getChr(), context.getStart())); + } + return extractInts(alleleCountsFromVCF); + } + + /** + * Check the formatting on the Object returned by a call to VariantContext::getAttribute() and parse appropriately + * @param integerListContainingVCField - Object returned by a call to VariantContext::getAttribute() + * @return - array of ints + */ + public static int[] extractInts(final Object integerListContainingVCField) { + List mleList = null; + if ( integerListContainingVCField instanceof List ) { + if ( ((List) integerListContainingVCField).get(0) instanceof String ) { + mleList = new ArrayList<>(((List) integerListContainingVCField).size()); + for ( Object s : 
((List)integerListContainingVCField)) { + mleList.add(Integer.parseInt((String) s)); + } + } else { + mleList = (List) integerListContainingVCField; + } + } else if ( integerListContainingVCField instanceof Integer ) { + mleList = Arrays.asList((Integer) integerListContainingVCField); + } else if ( integerListContainingVCField instanceof String ) { + mleList = Arrays.asList(Integer.parseInt((String)integerListContainingVCField)); + } + if ( mleList == null ) + throw new IllegalArgumentException(String.format("VCF does not have properly formatted "+ + VCFConstants.MLE_ALLELE_COUNT_KEY+" or "+VCFConstants.ALLELE_COUNT_KEY)); + + final int[] mle = new int[mleList.size()]; + + if ( ! ( mleList.get(0) instanceof Integer ) ) { + throw new IllegalStateException("BUG: The AC values should be an Integer, but was "+mleList.get(0).getClass().getCanonicalName()); + } + + for ( int idx = 0; idx < mle.length; idx++) { + mle[idx] = mleList.get(idx); + } + + return mle; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariants.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariants.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariants.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariants.java diff --git a/protected/java/src/org/broadinstitute/sting/package-info.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/package-info.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/package-info.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/package-info.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/SequenceComplexity.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/SequenceComplexity.java similarity index 
100% rename from protected/java/src/org/broadinstitute/sting/utils/SequenceComplexity.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/SequenceComplexity.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/collections/CountSet.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/collections/CountSet.java new file mode 100644 index 000000000..5c7dbd505 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/collections/CountSet.java @@ -0,0 +1,516 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.utils.collections; + +import com.google.java.contract.Requires; + +import java.lang.reflect.Array; +import java.util.*; + +/** + * Efficient implementation for a small set of integer primitive values. + *

+ * It includes a increment operation incAll which is convenient when analyzing the read-threading graphs. Nevertheless + * it can be also be used in general purpose. + *

+ *

+ * It does not provide a O(1) look-up of its elements though. These are kept in a sorted array so look up is implemented + * using a binary search O(log n). Therefore it might not be optimal for problems that require large integer sets. + *

+ *

+ * Also note that addition can be costly for large sets unless done in order: O(n). + *

+ * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class CountSet implements Cloneable, Set { + + /** + * The size of the set. + */ + private int size; + + /** + * Holds the element of the set within the subrange [0 .. size - 1] in ascending order. + */ + private int[] elements; + + /** + * Creates a copy of an existing int-set. + * @param template the intset to copy values from. + */ + public CountSet(final CountSet template) { + elements = template.elements.clone(); + size = template.size; + } + + /** + * Creates a new set indicating the expected maximum number of elements it will contain. + * @param initialCapacity the desired initial capacity of the set. + * @throws IllegalArgumentException if initialCapacity is negative. + */ + public CountSet(int initialCapacity) { + if (initialCapacity < 0) + throw new IllegalArgumentException(); + elements = new int[initialCapacity]; + size = 0; + } + + /** + * Set the set contents to a single integer value. + * @param value the integer value to set the set to. + */ + public void setTo(int value) { + ensureCapacity(1); + size = 1; + elements[0] = value; + } + + /** + * Set the content of this set to a collection of integers. + * @param values the new values to be included in the set. + * @throws NullPointerException if value is null. + */ + public void setTo(int ... values) { + ensureCapacity(values.length); + size = values.length; + System.arraycopy(values, 0, elements, 0, size); + Arrays.sort(elements,0,size); + } + + /** + * Increase (or decrease) all elements in the set by a number. + * @param delta the number of add (or substract if negative) to all elements. + * + * @return true if the set changed as a result of this invocation, false otherwise. + */ + public boolean incAll(final int delta) { + if (size == 0 || delta == 0) + return false; + for (int i = 0; i < size; i++) + elements[i] += delta; + return true; + } + + /** + * Returns the smallest integer value in the set. 
+ * + * @throws NoSuchElementException if the set is empty (thus there is no minimum). + * @return the smallest integer value in the set. + */ + public int min() { + if (size == 0) + throw new NoSuchElementException("cannot have a min from an empty set"); + return elements[0]; + } + + /** + * Returns the largest integer value in the set. + * + * @throws NoSuchElementException if the set is empty (thus there is no maximum). + * @return the largest integer value in the set. + */ + public int max() { + if (size == 0) + throw new NoSuchElementException("cannot have a max from an empty set"); + return elements[size - 1]; + } + + /** + * Adds a range of integer values to the collection. + * + * This method avoid the need to explicity indicate all values in that range. Notice that the range is fully inclusive. + * You can indicate a decrease range (fromValue > toValue). + * + * @param fromValue the first value to add in the set (inclusive). + * @param toValue the last value to add to the set (inclusive). + * @return true if the set changed as a result of this invocation, false otherwise. + */ + public boolean addRange(final int fromValue, final int toValue) { + final int lowEnd; + final int highEnd; + + if (fromValue <= toValue) { + lowEnd = fromValue; highEnd = toValue; + } else { + highEnd = fromValue; lowEnd = toValue; + } + + //TODO to be optimized to add missing sub-ranges in one go: + boolean result = false; + for (int i = lowEnd; i <= highEnd; i++) + result = add(i) | result; + return result; + } + + /** + * Add an integer value to the set. + * @param value to add to the set. + * @return true if the set changed as a result of this invocation, false otherwise. 
+ */ + public boolean add(final int value) { + int pos = Arrays.binarySearch(elements,0,size,value); + if (pos >= 0) return false; + int insertPos = - pos - 1; + ensureCapacity(size + 1); + System.arraycopy(elements, insertPos, elements, insertPos + 1, size - insertPos); + elements[insertPos] = value; + size++; + return true; + } + + /** + * Add a arbitrary number of integers to the set. + * + * @param values integer to add to the set. + * @return true if the set changed as a result of this invocation, false otherwise. + */ + public boolean addAll(final int ... values) { + ensureCapacity(size + values.length); + boolean result = false; + for (final int v : values) + result = add(v) | result; + return result; + } + + @Override + public boolean addAll(final Collection numbers) { + ensureCapacity(size + numbers.size()); + boolean result = false; + for (final Number n : numbers) + result = add(n.intValue()) | result; + return result; + } + + /** + * Add all values within a range in an integer array. + * + * @param source array where the values to add are found. + * @param fromIndex first position from source to add (inclusive). + * @param toIndex index after the last position in source to add (thus exclusive). + * @throws NullPointerException if source is null. + * @throws NegativeArraySizeException if fromIndex or toIndex are negative. + * @throws ArrayIndexOutOfBoundsException if fromIndex or toIndex are beyond bounds + * allowed [0 .. source.length]. + * @return true if the set changed as a result of this invocation, false otherwise. + */ + public boolean addAll(final int[] source, final int fromIndex, final int toIndex) { + ensureCapacity(size + source.length); + boolean result = false; + for (int i = fromIndex; i < toIndex; i++) + result = add(source[i]) | result; + return result; + } + + + /** + * Add all elements present in a int-set. + * + * @param other the other inset. + * + * @throws NullPointerException if other is null. 
+ * @return true if this set changed due to this operation, false otherwise. + */ + public boolean addAll(final CountSet other) { + return addAll(other.elements,0,other.size); + } + + /** + * Checks whether a integer value is included in the set. + * @param value the value to check. + * @return true if value is inside the set, false otherwise. + */ + public boolean contains(final int value) { + return Arrays.binarySearch(elements, 0, size, value) >= 0; + } + + /** + * Make sure that this int-set has capacity to handle a number of elements. + *

+ * If the set has already that or greater capacity nothing would be changed. + * + * @param capacity the requested capacity. + */ + private void ensureCapacity(final int capacity) { + if (elements.length >= capacity) return; + int newLength = Math.max(elements.length << 1, capacity); + elements = Arrays.copyOf(elements,newLength); + } + + + @Override + public int size() { + return size; + } + + @Override + public boolean isEmpty() { + return size() == 0; + } + + @Override + public boolean contains(final Object o) { + if (o instanceof Integer) { + final int i = (Integer)o; + return contains(i); + } else + return false; //To change body of implemented methods use File | Settings | File Templates. + } + + + @Override + public Iterator iterator() { + return new MyIterator(); + } + + @Override + public Object[] toArray() { + final Integer[] result = new Integer[size]; + for (int i = 0; i < size; i++) + result[i] = elements[i]; + return result; + } + + @Override + @SuppressWarnings("unchecked") + public T[] toArray(final T[] a) { + if (a == null) + throw new NullPointerException(); + + @SuppressWarnings("unchecked") + final Class componentClass = (Class) a.getClass().getComponentType(); + if (!componentClass.isAssignableFrom(Integer.class)) + throw new ArrayStoreException(); + + @SuppressWarnings("unchecked") + final T[] dest = (a.length < size) ? (T[]) Array.newInstance(componentClass, size) : a; + + for (int i = 0; i < size; i++) + dest[i] = (T) (Integer) elements[i]; + return dest; + } + + /** + * Copies the content of the set into an integer array. The result can be freely modified by the invoker. + * @return never null but a zero-length array if the set is empty. + */ + public int[] toIntArray() { + return Arrays.copyOfRange(elements,0,size); + } + + /** + * Copy the content of the set into an array. + * @param dest the destination array. + * @param offset where to store the first element of the set. + * @throws NullPointerException if dest is null. 
+ * @throws ArrayIndexOutOfBoundsException if offset is out of range of there is not enough + * space after offset in the destination array to hold all values in the set. + */ + public void copyTo(final int[] dest, int offset) { + if (dest == null) + throw new NullPointerException(); + if (dest.length < (size + offset)) + throw new ArrayIndexOutOfBoundsException("destination is to short"); + System.arraycopy(elements,0,dest,offset,size); + } + + /** + * Copy the content of the set into an array. + * @param dest the destination array. + * @throws NullPointerException if dest is null. + * @throws ArrayIndexOutOfBoundsException if there is not enough + * space after offset in the destination array to hold all values in the set. + */ + public void copyTo(final int[] dest) { + copyTo(dest,0); + } + + + @Override + public boolean add(final Integer integer) { + return add((int) integer); + } + + @Override + public boolean remove(final Object o) { + return o instanceof Integer && remove((int)o); + } + + /** + * Removes a single integer value for the set. + * @param i the value to remove. + * @return true if the set has changed as a result of this invocation, false otherwise. 
+ */ + public boolean remove(final int i) { + final int pos = Arrays.binarySearch(elements,0,size,i); + if (pos < 0) + return false; + else { + removeIndex(pos); + return true; + } + } + + @Override + public boolean containsAll(final Collection c) { + for (final Object o : c) + if (!contains(o)) + return false; + return true; + } + + + @Override + public boolean retainAll(final Collection c) { + if (size == 0) + return false; + @SuppressWarnings("all") + final CountSet retainIndices = new CountSet(c.size() + 2); + retainIndices.add(-1); + retainIndices.add(size); + for (final Object o : c) { + if (!(o instanceof Integer)) + continue; + final int pos = Arrays.binarySearch(elements,0,size,(int) o); + if (pos < 0) + continue; + retainIndices.add(pos); + } + if (retainIndices.size == 2) { + size = 0; + return true; + } else if (retainIndices.size == size + 2) { + return false; + } else { + for (int idx = retainIndices.size - 1; idx > 0; idx--) { + final int toIdx = retainIndices.elements[idx]; + final int fromIdx = retainIndices.elements[idx - 1] + 1; + removeIndices(toIdx,fromIdx); + } + return true; + } + } + + /** + * Removes the values found in a range of indexes in {@link #elements}. + * @param fromIdx first index to remove (inclusive). + * @param toIdx right after last index to remove (exclusive). 
+ */ + @Requires("fromIdx >= toIdx & fromIdx >= 0 & toIdx <= size") + private void removeIndices(final int fromIdx, final int toIdx) { + System.arraycopy(elements,toIdx,elements,fromIdx,size - toIdx); + size -= toIdx - fromIdx; + } + + @Override + public boolean removeAll(final Collection c) { + boolean result = false; + for (final Object o : c) + result = remove(o) | result; + return result; + } + + @Requires("idx >= 0 && idx < size") + private void removeIndex(int idx) { + System.arraycopy(elements,idx+1,elements,idx,size - idx - 1); + } + + @Override + public void clear() { + size = 0; + } + + /** + * Returns a copy of this set which can be changed without modifying the original one. + * @return never {@code null}. + */ + @SuppressWarnings("all") + public CountSet clone() { + return new CountSet(this); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(2 + size() * 10); + sb.append('{'); + for (int i = 0; i < size; i++) + sb.append(elements[i]).append(','); + sb.replace(sb.length()-1,sb.length(),"}"); + return sb.toString(); + + } + + + /** + * Custom iterator class for {@link CountSet IntSets} + */ + private class MyIterator implements Iterator { + /** What position I am in. 
*/ + private int next = 0; + + @Override + public boolean hasNext() { + return next < size; + } + + @Override + public Integer next() { + if (next >= size) + throw new NoSuchElementException(); + return elements[next]; + } + + @Override + public void remove() { + if (next == 0) + throw new IllegalStateException(); + if (next >= size) + throw new NoSuchElementException(); + removeIndex(next - 1); + } + } + + +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java new file mode 100644 index 000000000..71d61c920 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java @@ -0,0 +1,318 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. 
+* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. 
The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.gvcf; + +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypeBuilder; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.*; + +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; + +/** + * Genome-wide VCF writer + * + * User: depristo + * Date: 6/24/13 + * Time: 2:51 PM + */ +public class GVCFWriter implements VariantContextWriter { + // + // static VCF field names + // + protected final static String BLOCK_SIZE_INFO_FIELD = "BLOCK_SIZE"; + protected final static String MIN_DP_FORMAT_FIELD = "MIN_DP"; + protected final static String MIN_GQ_FORMAT_FIELD = "MIN_GQ"; + + // + // Final fields initialized in constructor + // + /** Where we'll ultimately write our VCF records */ + final private VariantContextWriter underlyingWriter; + + final private List GQPartitions; + + /** fields updated on the fly during GVCFWriter operation */ + int nextAvailableStart = -1; + String contigOfNextAvailableStart = null; + private String sampleName = null; + private HomRefBlock currentBlock = null; + + /** + * Is the proposed GQ partitions well-formed? 
+ * + * @param GQPartitions proposed GQ partitions + * @return a non-null string if something is wrong (string explains issue) + */ + protected static List parsePartitions(final List GQPartitions) { + if ( GQPartitions == null ) throw new IllegalArgumentException("GQpartitions cannot be null"); + if ( GQPartitions.isEmpty() ) throw new IllegalArgumentException("GQpartitions cannot be empty"); + + final List result = new LinkedList<>(); + int lastThreshold = 0; + for ( final Integer value : GQPartitions ) { + if ( value == null ) throw new IllegalArgumentException("GQPartitions contains a null integer"); + if ( value < lastThreshold ) throw new IllegalArgumentException("GQPartitions is out of order. Last is " + lastThreshold + " but next is " + value); + if ( value == lastThreshold ) throw new IllegalArgumentException("GQPartitions is equal elements: Last is " + lastThreshold + " but next is " + value); + result.add(new HomRefBlock(lastThreshold, value)); + lastThreshold = value; + } + result.add(new HomRefBlock(lastThreshold, Integer.MAX_VALUE)); + + return result; + } + + /** + * Create a new GVCF writer + * + * Should be a non-empty list of boundaries. 
For example, suppose this variable is + * + * [A, B, C] + * + * We would partition our hom-ref sites into the following bands: + * + * X < A + * A <= X < B + * B <= X < C + * X >= C + * + * @param underlyingWriter the ultimate destination of the GVCF records + * @param GQPartitions a well-formed list of GQ partitions + */ + public GVCFWriter(final VariantContextWriter underlyingWriter, final List GQPartitions) { + if ( underlyingWriter == null ) throw new IllegalArgumentException("underlyingWriter cannot be null"); + this.underlyingWriter = underlyingWriter; + this.GQPartitions = parsePartitions(GQPartitions); + } + + /** + * Write the VCF header + * + * Adds standard GVCF fields to the header + * + * @param header a non-null header + */ + @Override + public void writeHeader(VCFHeader header) { + if ( header == null ) throw new IllegalArgumentException("header cannot be null"); + header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); + header.addMetaDataLine(new VCFFormatHeaderLine(MIN_DP_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block")); + + // These annotations are no longer standard + //header.addMetaDataLine(new VCFInfoHeaderLine(BLOCK_SIZE_INFO_FIELD, 1, VCFHeaderLineType.Integer, "Size of the homozygous reference GVCF block")); + //header.addMetaDataLine(new VCFFormatHeaderLine(MIN_GQ_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum GQ observed within the GVCF block")); + + for ( final HomRefBlock partition : GQPartitions ) { + header.addMetaDataLine(partition.toVCFHeaderLine()); + } + + underlyingWriter.writeHeader(header); + } + + /** + * Close this GVCF writer. 
Finalizes any pending hom-ref blocks and emits those to the underlyingWriter as well + */ + @Override + public void close() { + close(true); + } + + /** + * Horrible work around because there's no clean way to get our VCFWriter closed by the GATK + * + * If closeUnderlyingWriter is true, then we'll close the underlying writer, otherwise we'll leave it open + * so the GATK closes it later + * + * @param closeUnderlyingWriter should we leave the underlying writer open or closed? + */ + public void close(final boolean closeUnderlyingWriter) { + emitCurrentBlock(); + if ( closeUnderlyingWriter ) underlyingWriter.close(); + } + + /** + * Add hom-ref site from vc to this gVCF hom-ref state tracking, emitting any pending states if appropriate + * + * @param vc a non-null VariantContext + * @param g a non-null genotype from VariantContext + * @return a VariantContext to be emitted, or null if non is appropriate + */ + protected VariantContext addHomRefSite(final VariantContext vc, final Genotype g) { + if ( nextAvailableStart != -1 ) { + // don't create blocks while the hom-ref site falls before nextAvailableStart (for deletions) + if ( vc.getStart() <= nextAvailableStart && vc.getChr().equals(contigOfNextAvailableStart) ) { + return null; + } + // otherwise, reset to non-relevant + nextAvailableStart = -1; + contigOfNextAvailableStart = null; + } + + if ( currentBlock == null ) { + currentBlock = createNewBlock(vc, g); + return null; + } else if ( currentBlock.withinBounds(g.getGQ()) ) { + currentBlock.add(vc.getStart(), g); + return null; + } else { + final VariantContext result = blockToVCF(currentBlock); + currentBlock = createNewBlock(vc, g); + return result; + } + } + + /** + * Flush the current hom-ref block, if necessary, to the underlying writer, and reset the currentBlock to null + */ + private void emitCurrentBlock() { + if ( currentBlock != null ) { + // there's actually some work to do + underlyingWriter.add(blockToVCF(currentBlock)); + currentBlock = null; + 
} + } + + /** + * Convert a HomRefBlock into a VariantContext + * + * @param block the block to convert + * @return a VariantContext representing the gVCF encoding for this block + */ + private VariantContext blockToVCF(final HomRefBlock block) { + if ( block == null ) throw new IllegalArgumentException("block cannot be null"); + + final VariantContextBuilder vcb = new VariantContextBuilder(block.getStartingVC()); + vcb.attributes(new HashMap(2)); // clear the attributes + vcb.stop(block.getStop()); + vcb.attribute(VCFConstants.END_KEY, block.getStop()); + + // This annotation is no longer standard + //vcb.attribute(BLOCK_SIZE_INFO_FIELD, block.getSize()); + + // create the single Genotype with GQ and DP annotations + final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Collections.nCopies(2, block.getRef())); + gb.noAD().noPL().noAttributes(); // clear all attributes + gb.GQ(block.getMedianGQ()); + gb.DP(block.getMedianDP()); + gb.attribute(MIN_DP_FORMAT_FIELD, block.getMinDP()); + gb.PL(block.getMinPLs()); + + // This annotation is no longer standard + //gb.attribute(MIN_GQ_FORMAT_FIELD, block.getMinGQ()); + + return vcb.genotypes(gb.make()).make(); + } + + /** + * Helper function to create a new HomRefBlock from a variant context and current genotype + * + * @param vc the VariantContext at the site where want to start the band + * @param g the genotype of the sample from vc that should be used to initialize the block + * @return a newly allocated and initialized block containing g already + */ + private HomRefBlock createNewBlock(final VariantContext vc, final Genotype g) { + // figure out the GQ limits to use based on the GQ of g + HomRefBlock partition = null; + for ( final HomRefBlock maybePartition : GQPartitions ) { + if ( maybePartition.withinBounds(g.getGQ()) ) { + partition = maybePartition; + break; + } + } + if ( partition == null ) throw new IllegalStateException("GQ " + g + " from " + vc + " didn't fit into any partition " + partition); + + // 
create the block, add g to it, and return it for use + final HomRefBlock block = new HomRefBlock(vc, partition.getGQLowerBound(), partition.getGQUpperBound()); + block.add(vc.getStart(), g); + return block; + } + + /** + * Add a VariantContext to this writer for emission + * + * Requires that the VC have exactly one genotype + * + * @param vc a non-null VariantContext + */ + @Override + public void add(VariantContext vc) { + if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); + + if ( sampleName == null ) + sampleName = vc.getGenotype(0).getSampleName(); + + if ( ! vc.hasGenotypes() ) { + throw new IllegalArgumentException("GVCF assumes that the VariantContext has genotypes"); + } else if ( vc.getGenotypes().size() != 1 ) { + throw new IllegalArgumentException("GVCF assumes that the VariantContext has exactly one genotype but saw " + vc.getGenotypes().size()); + } else { + if ( currentBlock != null && ! currentBlock.isContiguous(vc) ) { + // we've made a non-contiguous step (across interval, onto another chr), so finalize + emitCurrentBlock(); + } + + final Genotype g = vc.getGenotype(0); + if ( g.isHomRef() && vc.hasAlternateAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) && vc.isBiallelic() ) { + // create bands + final VariantContext maybeCompletedBand = addHomRefSite(vc, g); + if ( maybeCompletedBand != null ) underlyingWriter.add(maybeCompletedBand); + } else { + // g is variant, so flush the bands and emit vc + emitCurrentBlock(); + nextAvailableStart = vc.getEnd(); + contigOfNextAvailableStart = vc.getChr(); + underlyingWriter.add(vc); + } + } + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java new file mode 100644 index 000000000..9d14fca26 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java @@ -0,0 +1,185 @@ +/* +* 
By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.gvcf; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFHeaderLine; + +import java.util.ArrayList; +import java.util.List; + +/** + * Helper class for calculating a GQ band in the GVCF writer + * + * A band contains GQ and DP values for a contiguous stretch of hom-ref genotypes, + * and provides summary information about the entire block of genotypes. 
+ * + * Genotypes within the HomRefBlock are restricted to hom-ref genotypes within a band of GQ scores + * + * User: depristo + * Date: 6/25/13 + * Time: 9:41 AM + */ +final class HomRefBlock { + private final VariantContext startingVC; + private int stop; + private final int minGQ, maxGQ; + private int[] minPLs = null; // element-wise minimum of 3-entry PL vectors seen so far; stays null until the first 3-entry PL arrives + final private List GQs = new ArrayList<>(100); // NOTE(review): type parameters appear stripped in this patch text — presumably List<Integer>; verify against the original file + final private List DPs = new ArrayList<>(100); + private final Allele ref; + + /** + * Create a new HomRefBlock + * + * @param startingVC the VariantContext that starts this band (for starting position information) + * @param minGQ the minGQ (inclusive) to use in this band + * @param maxGQ the maxGQ (exclusive) to use in this band + */ + public HomRefBlock(final VariantContext startingVC, int minGQ, int maxGQ) { + if ( startingVC == null ) throw new IllegalArgumentException("startingVC cannot be null"); + if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ); + + this.startingVC = startingVC; + this.stop = getStart() - 1; + this.ref = startingVC.getReference(); + this.minGQ = minGQ; + this.maxGQ = maxGQ; + } + + /** + * Create a new HomRefBlock only for doing bounds checking + * + * @param minGQ the minGQ (inclusive) to use in this band + * @param maxGQ the maxGQ (exclusive) to use in this band + */ + public HomRefBlock(int minGQ, int maxGQ) { + if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ); + + this.startingVC = null; + this.stop = -1; + this.ref = null; + this.minGQ = minGQ; + this.maxGQ = maxGQ; + } + + /** + * Add information from this Genotype to this band + * @param g a non-null Genotype with GQ and DP attributes + */ + public void add(final int pos, final Genotype g) { + if ( g == null ) throw new IllegalArgumentException("g cannot be null"); + if ( ! g.hasGQ() ) throw new IllegalArgumentException("g must have GQ field"); + if ( ! 
g.hasPL() ) throw new IllegalArgumentException("g must have PL field"); + if ( pos != stop + 1 ) throw new IllegalArgumentException("adding genotype at pos " + pos + " isn't contiguous with previous stop " + stop); + + if( minPLs == null ) { // if the minPLs vector has not been set yet, create it here by copying the provided genotype's PLs + final int[] PL = g.getPL(); + if( PL.length == 3 ) { + minPLs = PL.clone(); + } + } else { // otherwise take the min with the provided genotype's PLs + final int[] PL = g.getPL(); + if( PL.length == 3 ) { + minPLs[0] = Math.min(minPLs[0], PL[0]); + minPLs[1] = Math.min(minPLs[1], PL[1]); + minPLs[2] = Math.min(minPLs[2], PL[2]); + } + } + stop = pos; // genotype accepted: extend the block to include this position + GQs.add(Math.min(g.getGQ(), 99)); // cap the GQs by the max. of 99 emission + DPs.add(Math.max(g.getDP(),0)); + } + + /** + * Is the GQ value within the bounds of this GQ (GQ >= minGQ && GQ < maxGQ) + * @param GQ the GQ value to test + * @return true if within bounds, false otherwise + */ + public boolean withinBounds(final int GQ) { + return GQ >= minGQ && GQ < maxGQ; + } + + /** Get the min GQ observed within this band */ + public int getMinGQ() { return MathUtils.arrayMin(GQs); } + /** Get the median GQ observed within this band */ + public int getMedianGQ() { return MathUtils.median(GQs); } + /** Get the min DP observed within this band */ + public int getMinDP() { return MathUtils.arrayMin(DPs); } + /** Get the median DP observed within this band */ + public int getMedianDP() { return MathUtils.median(DPs); } + /** Get the min PLs observed within this band, can be null if no PLs have yet been observed */ + public int[] getMinPLs() { return minPLs; } + + protected int getGQUpperBound() { return maxGQ; } + protected int getGQLowerBound() { return minGQ; } + + public boolean isContiguous(final VariantContext vc) { // true iff vc's end is immediately adjacent to this block's stop on the same contig + return vc.getEnd() == getStop() + 1 && startingVC.getChr().equals(vc.getChr()); + } + + public VariantContext getStartingVC() { return startingVC; } + public int 
getStart() { return startingVC.getStart(); } + public int getStop() { return stop; } + public Allele getRef() { return ref; } + public int getSize() { return getStop() - getStart() + 1; } + + @Override + public String toString() { + return "HomRefBlock{" + + "minGQ=" + minGQ + + ", maxGQ=" + maxGQ + + '}'; + } + + public VCFHeaderLine toVCFHeaderLine() { + return new VCFHeaderLine("GVCFBlock", "minGQ=" + getGQLowerBound() + "(inclusive),maxGQ=" + getGQUpperBound() + "(exclusive)"); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotype/LDMerger.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotype/LDMerger.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/haplotype/LDMerger.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotype/LDMerger.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotype/MergeVariantsAcrossHaplotypes.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotype/MergeVariantsAcrossHaplotypes.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/haplotype/MergeVariantsAcrossHaplotypes.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotype/MergeVariantsAcrossHaplotypes.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java new file mode 100644 index 000000000..2a7ead6c2 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java @@ -0,0 +1,335 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.haplotypeBAMWriter; + +import net.sf.samtools.Cigar; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMTag; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.CigarUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; + +import java.util.Collection; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +/** + * A BAMWriter that aligns reads to haplotypes and emits their best alignments to a BAM file + * + * User: depristo + * Date: 2/22/13 + * Time: 2:59 PM + */ +public abstract class HaplotypeBAMWriter { + /** + * Allows us to write out unique names for our synthetic haplotype reads + */ + private long uniqueNameCounter = 1; + + protected final static String READ_GROUP_ID = "ArtificialHaplotype"; + protected final static String HAPLOTYPE_TAG = "HC"; + + private final 
ReadDestination output; + private boolean writeHaplotypesAsWell = true; + private boolean onlyRealignInformativeReads = false; // when true, only reads that discriminate between haplotypes get realigned; others are emitted as-is + + /** + * Possible modes for writing haplotypes to BAMs + */ + public static enum Type { + /** + * A mode that's for method developers. Writes out all of the possible + * haplotypes considered, as well as reads aligned to each + */ + ALL_POSSIBLE_HAPLOTYPES, + + /** + * A mode for users. Writes out the reads aligned only to the called + * haplotypes. Useful to understand why the caller is calling what it is + */ + CALLED_HAPLOTYPES + } + + /** + * Create a new HaplotypeBAMWriter of type writing SAMRecords to writer + * + * @param type the type of the writer we want to create + * @param stingSAMWriter the destination, must not be null + * @param header the header of the input BAMs used to make calls, must not be null + * @return a new HaplotypeBAMWriter + */ + public static HaplotypeBAMWriter create(final Type type, final StingSAMFileWriter stingSAMWriter, final SAMFileHeader header) { + if ( type == null ) throw new IllegalArgumentException("type cannot be null"); + + final ReadDestination toBam = new ReadDestination.ToBAM(stingSAMWriter, header, READ_GROUP_ID); + return create(type, toBam); + } + + /** + * Create a new HaplotypeBAMWriter of type writing SAMRecords to writer + * + * Note that writer must have its presorted bit set to false, as reads + * may come in out of order during writing + * + * @param type the type of the writer we want to create + * @param destination the destination, must not be null + * @return a new HaplotypeBAMWriter + */ + public static HaplotypeBAMWriter create(final Type type, final ReadDestination destination) { + if ( destination == null ) throw new IllegalArgumentException("writer cannot be null"); + if ( type == null ) throw new IllegalArgumentException("type cannot be null"); + + switch ( type ) { + case ALL_POSSIBLE_HAPLOTYPES: return new AllHaplotypeBAMWriter(destination); + case CALLED_HAPLOTYPES: 
return new CalledHaplotypeBAMWriter(destination); + default: throw new IllegalArgumentException("Unknown type " + type); + } + } + + /** + * Create a new HaplotypeBAMWriter writing its output to bamWriter + * + * Assumes that the header has been fully initialized with a single + * read group READ_GROUP_ID + * + * @param output our output destination + */ + protected HaplotypeBAMWriter(final ReadDestination output) { + this.output = output; + } + + /** + * Write out a BAM representing for the haplotype caller at this site + * + * @param haplotypes a list of all possible haplotypes at this loc + * @param paddedReferenceLoc the span of the based reference here + * @param bestHaplotypes a list of the best (a subset of all) haplotypes that actually went forward into genotyping + * @param calledHaplotypes a list of the haplotypes at where actually called as non-reference + * @param stratifiedReadMap a map from sample -> likelihoods for each read for each of the best haplotypes + */ + public abstract void writeReadsAlignedToHaplotypes(final Collection haplotypes, + final GenomeLoc paddedReferenceLoc, + final Collection bestHaplotypes, + final Set calledHaplotypes, + final Map stratifiedReadMap); + + public void writeReadsAlignedToHaplotypes(final Collection haplotypes, + final GenomeLoc paddedReferenceLoc, + final Map stratifiedReadMap) { + writeReadsAlignedToHaplotypes(haplotypes, paddedReferenceLoc, haplotypes, new HashSet<>(haplotypes), stratifiedReadMap); // convenience overload: treat every haplotype as both best and called + } + + /** + * Write out read aligned to haplotype to the BAM file + * + * Aligns reads the haplotype, and then projects this alignment of read -> hap onto the reference + * via the alignment of haplotype (via its getCigar) method. + * + * @param originalRead the read we want to write aligned to the reference genome + * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference + * @param referenceStart the start of the reference that haplotype is aligned to. 
Provides global coordinate frame. + * @param isInformative true if the read is differentially informative for one of the haplotypes + */ + protected void writeReadAgainstHaplotype(final GATKSAMRecord originalRead, + final Haplotype haplotype, + final int referenceStart, + final boolean isInformative) { + if( onlyRealignInformativeReads && !isInformative ) { // skip realignment for uninformative reads: emit the original read unchanged + if( originalRead != null ) { + output.add(originalRead); + } + } else if (haplotype == null) { + output.add(originalRead); + return; + } else { + final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart, isInformative); + if ( alignedToRef != null ) { + output.add(alignedToRef); + } else { + output.add(originalRead); + } + } + } + + /** + * Aligns reads the haplotype, and then projects this alignment of read -> hap onto the reference + * via the alignment of haplotype (via its getCigar) method. + * + * @param originalRead the read we want to write aligned to the reference genome + * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference + * @param referenceStart the start of the reference that haplotype is aligned to. Provides global coordinate frame. 
+ * @param isInformative true if the read is differentially informative for one of the haplotypes + * @return a GATKSAMRecord aligned to reference, or null if no meaningful alignment is possible + */ + protected GATKSAMRecord createReadAlignedToRef(final GATKSAMRecord originalRead, + final Haplotype haplotype, + final int referenceStart, + final boolean isInformative) { + if ( originalRead == null ) throw new IllegalArgumentException("originalRead cannot be null"); + if ( haplotype == null ) throw new IllegalArgumentException("haplotype cannot be null"); + if ( haplotype.getCigar() == null ) throw new IllegalArgumentException("Haplotype cigar not set " + haplotype); + if ( referenceStart < 1 ) throw new IllegalArgumentException("reference start much be >= 1 but got " + referenceStart); + + try { + // compute the smith-waterman alignment of read -> haplotype + final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), originalRead.getReadBases(), CigarUtils.NEW_SW_PARAMETERS); + //swPairwiseAlignment.printAlignment(haplotype.getBases(), originalRead.getReadBases()); + if ( swPairwiseAlignment.getAlignmentStart2wrt1() == -1 ) + // sw can fail (reasons not clear) so if it happens just don't write the read + return null; + final Cigar swCigar = AlignmentUtils.consolidateCigar(swPairwiseAlignment.getCigar()); + + // since we're modifying the read we need to clone it + final GATKSAMRecord read = (GATKSAMRecord)originalRead.clone(); + + addHaplotypeTag(read, haplotype); + + // uninformative reads are set to zero mapping quality to enhance visualization + if ( !isInformative ) + read.setMappingQuality(0); + + // compute here the read starts w.r.t. 
the reference from the SW result and the hap -> ref cigar + final Cigar extendedHaplotypeCigar = haplotype.getConsolidatedPaddedCigar(1000); + final int readStartOnHaplotype = AlignmentUtils.calcFirstBaseMatchingReferenceInCigar(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1()); + final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype; + read.setAlignmentStart(readStartOnReference); + + // compute the read -> ref alignment by mapping read -> hap -> ref from the + // SW of read -> hap mapped through the given by hap -> ref + final Cigar haplotypeToRef = AlignmentUtils.trimCigarByBases(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1(), extendedHaplotypeCigar.getReadLength() - 1); + final Cigar readToRefCigarRaw = AlignmentUtils.applyCigarToCigar(swCigar, haplotypeToRef); + final Cigar readToRefCigarClean = AlignmentUtils.cleanUpCigar(readToRefCigarRaw); + final Cigar readToRefCigar = AlignmentUtils.leftAlignIndel(readToRefCigarClean, haplotype.getBases(), + originalRead.getReadBases(), swPairwiseAlignment.getAlignmentStart2wrt1(), 0, true); + + read.setCigar(readToRefCigar); + + if ( readToRefCigar.getReadLength() != read.getReadLength() ) + throw new IllegalStateException("Cigar " + readToRefCigar + " with read length " + readToRefCigar.getReadLength() + + " != read length " + read.getReadLength() + " for read " + read.format() + "\nhapToRef " + haplotypeToRef + " length " + haplotypeToRef.getReadLength() + "/" + haplotypeToRef.getReferenceLength() + + "\nreadToHap " + swCigar + " length " + swCigar.getReadLength() + "/" + swCigar.getReferenceLength()); + + return read; + } catch ( CloneNotSupportedException e ) { + throw new IllegalStateException("GATKSAMRecords should support clone but this one does not " + originalRead); + } + } + + /** + * Add a haplotype tag to the read based on haplotype + * + * @param read the read to add the tag to + * @param haplotype the 
haplotype that gives rises to read + */ + private void addHaplotypeTag(final GATKSAMRecord read, final Haplotype haplotype) { + // add a tag to the read that indicates which haplotype it best aligned to. It's a uniquish integer + read.setAttribute(HAPLOTYPE_TAG, haplotype.hashCode()); + } + + /** + * Write out haplotypes as reads to the BAM, marking specifically those that are among the best haplotypes + * + * @param haplotypes a collection of haplotypes to write to the BAM + * @param bestHaplotypes a subset of haplotypes that contains those that are best "either good or called" + * @param paddedReferenceLoc the genome loc of the padded reference + */ + protected void writeHaplotypesAsReads(final Collection haplotypes, + final Set bestHaplotypes, + final GenomeLoc paddedReferenceLoc) { + if ( isWriteHaplotypesAsWell() ) + for ( final Haplotype haplotype : haplotypes ) + writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype)); + } + + /** + * Write out a representation of this haplotype as a read + * + * @param haplotype a haplotype to write out. Cannot be null + * @param paddedRefLoc the reference location. Cannot be null + * @param isAmongBestHaplotypes true if among the best haplotypes, false if it was just one possible but not so good + */ + private void writeHaplotype(final Haplotype haplotype, + final GenomeLoc paddedRefLoc, + final boolean isAmongBestHaplotypes) { + final GATKSAMRecord record = new GATKSAMRecord(output.getHeader()); + record.setReadBases(haplotype.getBases()); + record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); + record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); + record.setCigar(AlignmentUtils.consolidateCigar(haplotype.getCigar())); + record.setMappingQuality(isAmongBestHaplotypes ? 
60 : 0); + record.setReadName("HC" + uniqueNameCounter++); + addHaplotypeTag(record, haplotype); + record.setReadUnmappedFlag(false); + record.setReferenceIndex(paddedRefLoc.getContigIndex()); + record.setAttribute(SAMTag.RG.toString(), READ_GROUP_ID); + record.setFlags(16); // NOTE(review): 0x10 is the SAM reverse-strand bit — confirm this is the intended flag for synthetic haplotype reads (it also clears setReadUnmappedFlag(false) redundantly) + output.add(record); + } + + public boolean isWriteHaplotypesAsWell() { + return writeHaplotypesAsWell; + } + + public void setWriteHaplotypesAsWell(final boolean writeHaplotypesAsWell) { + this.writeHaplotypesAsWell = writeHaplotypesAsWell; + } + + public boolean getOnlyRealignInformativeReads() { + return onlyRealignInformativeReads; + } + + public void setOnlyRealignInformativeReads(final boolean onlyRealignInformativeReads) { + this.onlyRealignInformativeReads = onlyRealignInformativeReads; + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/ReadDestination.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/ReadDestination.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/ReadDestination.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/ReadDestination.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java new file mode 100644 index 000000000..e818c9899 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java @@ -0,0 +1,436 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.Arrays; + +import static org.broadinstitute.sting.utils.pairhmm.PairHMMModel.*; + +/** + * Created with IntelliJ IDEA. 
+ * User: bradt + * Date: 6/11/13 + */ +public class ArrayLoglessPairHMM extends PairHMM { + private static final double INITIAL_CONDITION = Math.pow(2, 1020); + private static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION); + + // we divide e by 3 because the observed base could have come from any of the non-observed alleles + protected static final double TRISTATE_CORRECTION = 3.0; + + protected double[][] transition = null; // The transition probabilities cache + protected double[][] prior = null; // The prior probabilities cache + + // Array declarations for arrays implementation + private double[] currentMatchArray = null; + private double[] currentDeleteArray = null; + private double[] currentInsertArray = null; + private double[] parentMatchArray = null; + private double[] parentDeleteArray = null; + private double[] parentInsertArray = null; + private double[] grandparentMatchArray = null; + private double[] grandparentDeleteArray = null; + private double[] grandparentInsertArray = null; + + // When successive haplotypes have a common prefix, these arrays store cached info from the previous haplotype; for reading + private double[] matchCacheArray = null; + private double[] deleteCacheArray = null; + private double[] insertCacheArray = null; + + // These arrays store cache info for use with the next haplotype; for writing + private double[] nextMatchCacheArray = null; + private double[] nextDeleteCacheArray = null; + private double[] nextInsertCacheArray = null; + + // Used when caching to store our intermediate sum at point of first difference bw successive haplotypes + private double partialSum; + + + /** + * {@inheritDoc} + */ + @Override + public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { + super.initialize(readMaxLength, haplotypeMaxLength); + + transition = PairHMMModel.createTransitionMatrix(maxReadLength); + prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + + // Initialize all 
arrays + // Final Cell of array is a padding cell, initialized to zero. + currentMatchArray = new double[paddedMaxReadLength]; + currentDeleteArray = new double[paddedMaxReadLength]; + currentInsertArray = new double[paddedMaxReadLength]; + + parentMatchArray = new double[paddedMaxReadLength]; + parentDeleteArray = new double[paddedMaxReadLength]; + parentInsertArray = new double[paddedMaxReadLength]; + + grandparentMatchArray = new double[paddedMaxReadLength]; + grandparentDeleteArray = new double[paddedMaxReadLength]; + grandparentInsertArray = new double[paddedMaxReadLength]; + + // Initialize the special arrays used for caching when successive haplotypes have a common prefix + matchCacheArray = new double[paddedMaxReadLength]; + deleteCacheArray = new double[paddedMaxReadLength]; + insertCacheArray = new double[paddedMaxReadLength]; + + nextMatchCacheArray = new double[paddedMaxReadLength]; + nextDeleteCacheArray = new double[paddedMaxReadLength]; + nextInsertCacheArray = new double [paddedMaxReadLength]; + } + + + /** + * {@inheritDoc} + */ + @Override + public double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + int hapStartIndex, + final boolean recacheReadValues, + final int nextHapStartIndex) { + + if ( ! 
constantsAreInitialized) { + initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP); + + // note that we initialized the constants + constantsAreInitialized = true; + } + initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); + + // Some housekeeping to be done if we are starting a new read + if (recacheReadValues) { + hapStartIndex = 0; + + initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP); + // note that we initialized the constants + constantsAreInitialized = true; + + // Read length may have changed, so we need to set zero-value padding at the appropriate position. + padMatchAndInsertArrays(readBases.length); + } + + // if we have not cached from a previous haplotype, clear any info we may have accumulated in a previous HMM iteration + if (hapStartIndex == 0) { + clearPreviouslyCachedInfo(readBases.length); + + // Haplotype length may have changed, so we need to set initial-value padding at the appropriate position. + padDeleteArrays(haplotypeBases.length, readBases.length); + } + + // We build up our solution by looking at position [0] in the match, insert arrays. Need to set these to 0 before we start. 
+ clearArraySolutionPosition(); + + // Some parameters to control behavior during the dynamic programming loop + final int maxDiagonals = readBases.length + haplotypeBases.length - hapStartIndex - 1; // Number of diagonals for a matrix = rows + cols - 1; + int startFill; // The lower bound of the array indices we want to over-write + int endFill; // The upper bound of the array indices we want to over-write + final int cacheSumIndex = nextHapStartIndex - hapStartIndex + readBases.length - 1; // This array will contain the partial sum to cache for the next haplotype + double finalArraySumProbabilities = partialSum; // The final answer prior to log10 correction + + // Perform dynamic programming using arrays, as if over diagonals of a hypothetical read/haplotype alignment matrix + for (int i = 1; i <= maxDiagonals; i++) { + // set the bounds for cells we wish to fill in the arrays + startFill = Math.max(readBases.length - i, 0); + endFill = Math.min(maxDiagonals - i + 1, readBases.length); + + // apply any previously cached array information + if (i <= readBases.length) + applyPreviouslyCachedInfo(startFill); + + // fill in the cells for our current arrays + updateArrays(readBases.length, hapStartIndex, nextHapStartIndex, startFill, endFill, i); + + // final probability is the log10 sum of the last element in the Match and Insertion state arrays + // this way we ignore all paths that ended in deletions! (huge) + // but we have to sum all the paths ending in the M and I arrays, because they're no longer extended. + // Where i > readBases.length, array[0] corresponds to bottom row of a [read] x [haplotype] matrix. Before this, they carries the 0's we set above. + finalArraySumProbabilities += currentInsertArray[0] + currentMatchArray[0]; + + // Partial sum for caching the next haplotype: + // At the position of the last similar base between this haplotype and the next one... + // ...remember the partial sum, so that we can start here on the next hap. 
+ if (i == cacheSumIndex) + partialSum = finalArraySumProbabilities; + + rotateArrayReferences(); + } + // The cache arrays we wrote for this haplotype will be read for the next haplotype. + rotateCacheArrays(); + + //return result + return Math.log10(finalArraySumProbabilities) - INITIAL_CONDITION_LOG10; + } + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. + + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. 
+ * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + @Requires({ + "insertionGOP != null", + "deletionGOP != null", + "overallGCP != null" + }) + @Ensures("constantsAreInitialized") + protected static void initializeProbabilities(final double[][] transition, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { + PairHMMModel.qualToTransProbs(transition,insertionGOP,deletionGOP,overallGCP); + } + + /** + * Pad the ends of the Match and Insert arrays with 0. + * Analogous to setting zeros in the first row in the Match, Insert matrices of N2MemoryPairHMM. + * + * @param padPosition Which index in the arrays we wish to pad + */ + private void padMatchAndInsertArrays(final int padPosition) { + grandparentMatchArray[padPosition] = 0; + grandparentInsertArray[padPosition] = 0; + parentMatchArray[padPosition] = 0; + parentInsertArray[padPosition] = 0; + currentMatchArray[padPosition] = 0; + currentInsertArray[padPosition] = 0; + matchCacheArray[padPosition] = 0; + insertCacheArray[padPosition] = 0; + nextMatchCacheArray[padPosition] = 0; + nextInsertCacheArray[padPosition] = 0; + } + + /** + * Pad the Delete arrays with an intial value. Let's us have free deletions at the beginning of the alignment. + * Analogous to padding the first row of the Delete matrix of N2MemoryPairHMM. + * + * @param haplotypeLength The length of the present haplotype. Necessary for calculating initial padding value + * @param padPosition Which index in the arrays we wish to pad + */ + private void padDeleteArrays(final int haplotypeLength, final int padPosition) { + final double initialValue = INITIAL_CONDITION / haplotypeLength; + + // Pad the deletion arrays. 
Akin to padding the first row in the deletion matrix + parentDeleteArray[padPosition] = initialValue; + grandparentDeleteArray[padPosition] = initialValue; + currentDeleteArray[padPosition] = initialValue; + deleteCacheArray[padPosition] = initialValue; + nextDeleteCacheArray[padPosition] = initialValue; + } + + /** + * We build up our solution by looking at position [0] in the match, insert arrays. Need to set these to 0 before we start. + * + */ + private void clearArraySolutionPosition() { + grandparentMatchArray[0] = 0; + grandparentInsertArray[0] = 0; + parentMatchArray[0] = 0; + parentInsertArray[0] = 0; + currentMatchArray[0] = 0; + currentInsertArray[0] = 0; + } + + /** + * Clears cached information saved from the last haplotype, + * allowing us to start at the beginning of the present haplotype with intitial values of 0. + * + * @param fillLength How much of the cache arrays do we need to zero + */ + private void clearPreviouslyCachedInfo(final int fillLength) { + Arrays.fill(matchCacheArray, 0, fillLength, 0); + Arrays.fill(deleteCacheArray, 0, fillLength, 0); + Arrays.fill(insertCacheArray, 0, fillLength, 0); + + partialSum = 0; + } + + /** + * Applies cached information saved from the last haplotype, + * allowing us to start in the middle of the present haplotype. + * + * @param indK the index in the arrays we wish to update with cached info + */ + private void applyPreviouslyCachedInfo(int indK) { + // apply caching info necessary for calculating current DELETE array values + parentMatchArray[indK] = matchCacheArray[indK]; + parentDeleteArray[indK] = deleteCacheArray[indK]; + + // apply caching info necessary for calculating current MATCH array values + grandparentMatchArray[indK + 1] = matchCacheArray[indK + 1]; + grandparentDeleteArray[indK + 1] = deleteCacheArray[indK + 1]; + grandparentInsertArray[indK + 1] = insertCacheArray[indK + 1]; + } + + /** + * Records the mid-process state of one location in the read/haplotype alignment. 
+ * Writes new cache information for use with the next haplotype we see. + * + * @param indK the index in the cache arrays we wish to store information in + */ + private void recordNewCacheInfo(int indK) { + nextMatchCacheArray[indK] = currentMatchArray[indK]; + nextDeleteCacheArray[indK] = currentDeleteArray[indK]; + nextInsertCacheArray[indK] = currentInsertArray[indK]; + } + + /** + * Update the HMM arrays for the current diagonal. + * + * @param readLength The length of the read + * @param hapStartIndex An offset that tells us if we are starting in the middle of the present haplotype + * @param nextHapStartIndex An offset that tells us which base in the NEXT haplotype we need to look at to record new caching info + * @param startFill The lower bound of the array indices we want to over-write + * @param endFill The upper bound of the array indices we want to over-write + * @param iii The index indicating which diagonal of the read/haplotype alignment we are working on + */ + private void updateArrays(final int readLength, + final int hapStartIndex, + final int nextHapStartIndex, + final int startFill, + final int endFill, + final int iii) { + + // The coordinate in our priors and transition matrices corresponding to a given position in the read/haplotype alignment + int matrixRow; + int matrixCol; + + int arrayIndex; + for (arrayIndex = startFill; arrayIndex < endFill; arrayIndex++) { + // translate the array position into a row, column in the priors and transition matrices + matrixRow = readLength - arrayIndex - 1; + matrixCol = iii - matrixRow - 1 + hapStartIndex; + + // update cell for each of our current arrays. 
Prior, transition matrices are padded +1 row,col + updateArrayCell(arrayIndex, prior[matrixRow+1][matrixCol+1], transition[matrixRow+1]); + + // Set up caching for the next haplotype + // At the position of the final similar base between this haplotype and the next one, remember the mid-array values + if (matrixCol == nextHapStartIndex - 1) + recordNewCacheInfo(arrayIndex); + } + } + + /** + * Updates a cell in the HMM arrays + * + * @param indK index in the arrays to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param transition an array with the six transition relevant to this location + */ + private void updateArrayCell( final int indK, final double prior, final double[] transition) { + currentMatchArray[indK] = prior * ( grandparentMatchArray[indK + 1] * transition[matchToMatch] + + grandparentInsertArray[indK + 1] * transition[indelToMatch] + + grandparentDeleteArray[indK + 1] * transition[indelToMatch] ); + currentInsertArray[indK] = parentMatchArray[indK + 1] * transition[matchToInsertion] + parentInsertArray[indK + 1] * transition[insertionToInsertion]; + currentDeleteArray[indK] = parentMatchArray[indK] * transition[matchToDeletion] + parentDeleteArray[indK] * transition[deletionToDeletion]; + } + + /** + * To prepare for the next diagonal in our loop, each array must be bumped to an older generation + * + */ + private void rotateArrayReferences() { + double[] tempMatchArray = grandparentMatchArray; + double[] tempDeleteArray = grandparentDeleteArray; + double[] tempInsertArray = grandparentInsertArray; + + grandparentMatchArray = parentMatchArray; + grandparentDeleteArray = parentDeleteArray; + grandparentInsertArray = parentInsertArray; + + parentMatchArray = currentMatchArray; + parentDeleteArray = currentDeleteArray; + parentInsertArray = currentInsertArray; + + currentMatchArray = tempMatchArray; + currentDeleteArray = tempDeleteArray; + currentInsertArray = tempInsertArray; + } + + /** + * To prepare for 
the next haplotype, the caching info we wrote is copied into the cach-read arrays + * + */ + private void rotateCacheArrays() { + matchCacheArray = nextMatchCacheArray.clone(); + deleteCacheArray = nextDeleteCacheArray.clone(); + insertCacheArray = nextInsertCacheArray.clone(); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java new file mode 100644 index 000000000..72d5c9472 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java @@ -0,0 +1,822 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +import static org.broadinstitute.sting.utils.pairhmm.PairHMMModel.*; + +/** + * Fast partial PairHMM backed on the standard Logless PairHMM + * + */ +public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM { + + + /** + * Initial read length capacity. + */ + private static final int INITIAL_READ_LENGTH_CAPACITY = 200; + + /** + * Initial haplotype length capacity. + */ + private static final int INITIAL_HAPLOTYPE_LENGTH_CAPACITY = 400; + + + /** + * Holds the current read capacity. + *

It can only go up overtime.

+ */ + private int readCapacity = INITIAL_READ_LENGTH_CAPACITY; + + /** + * Holds the current haplotype length capacity. + *

It can only go up overtime.

+ */ + private int haplotypeCapacity = INITIAL_HAPLOTYPE_LENGTH_CAPACITY; + + private int maxToCol; + private int haplotypeLength; + + /** + * Returns the currently loaded read base qualities. + * + * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. + * @return never {@code null}. + */ + public byte[] getReadQuals() { + if (readQuals == null) + throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); + return readQuals; + } + + /** + * Returns the currently loaded read insertion qualities. + * + * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. + * @return never {@code null}. + */ + @SuppressWarnings("unused") + public byte[] getReadInsQuals() { + if (readQuals == null) + throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); + return readInsQuals; + } + + /** + * Returns the currently loaded read deletion qualities. + * + * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. + * @return never {@code null}. + */ + @SuppressWarnings("unused") + public byte[] getReadDelQuals() { + if (readQuals == null) + throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); + return readDelQuals; + } + + /** + * Returns the currently loaded read gap extension penalty.. + * + * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. + * @return never {@code null}. + */ + @SuppressWarnings("unused") + public byte[] getReadGepQuals() { + if (readQuals == null) + throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); + return readGepQuals; + } + + + /** + * Creates a new pair-hmm calculator instance give the gap continuation penalty. + * + * @param gcp the gap-continuation penalty. 
+ */ + public FastLoglessPairHMM(final byte gcp) { + constantGCP = gcp; + initialize(readCapacity,haplotypeCapacity); + } + + @Override + public byte getGapExtensionPenalty() { + return constantGCP; + } + + + @Override + public double subComputeReadLikelihoodGivenHaplotypeLog10(final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues, final int nextHapStartIndex) { + this.readBases = readBases; + this.haplotypeBases = haplotypeBases; + this.haplotypeLength = haplotypeBases.length; + return super.subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases,readBases,readQuals, + insertionGOP,deletionGOP,overallGCP,hapStartIndex,recacheReadValues,nextHapStartIndex); + } + + /** + * Implement the last step summation to calculate the total likelihood. + * + * @param row number of the last row of the pair-hmm where the likelihood values are present. + * @param fromCol inclusive first column to include in the summation. + * @param toCol exclusive last column to include in the summation. + * @return 0 or less. + */ + protected double finalLikelihoodCalculation(final int row, + final int fromCol, final int toCol) { + + final double divider = Math.max(1,2 *(toCol - fromCol)); + final double dividerInverse = 1.0 / divider; + double finalLikelihood = 0; + + for (int j = fromCol; j < toCol; j++) { + finalLikelihood += matchMatrix[row][j] * dividerInverse; + finalLikelihood += insertionMatrix[row][j] * dividerInverse; + } + return StrictMath.log10(finalLikelihood) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider); + } + + /** + * Initialize the matrix values for a problem including the trailing end of the read. + * + *

+ * Notice that you can improve performance by omitting the filling of values reusable from
+ * previous haplotype calculations. You can set {@code haplotypeStartOffset} to skip
+ * those columns.
+ *

+ * + * @param readStart inclusive first position of the read used in the calculations. + * @param readEnd exclusive last position of the read considered in the calculations. + * @param haplotypeStartOffset offset of the haplotype right after the reusable prefix + * from previous calls. + * + * + */ + protected void initializeMatrixValuesForTrailingProblem(final int readStart, final int readEnd, + final int haplotypeStartOffset) { + + @SuppressWarnings("all") + final int zeroRow = readStart; + final int toRow = readEnd + 1; + final int toCol = haplotypeLength + 1; + + // fill first row with -Inf fot M and I but not for Deletion if leading + // to allow for free deletions at the beginning. + if (readStart == 0) { + // First row initialization: + Arrays.fill(matchMatrix[zeroRow],haplotypeStartOffset,toCol,0); + Arrays.fill(deletionMatrix[zeroRow],haplotypeStartOffset,toCol,INITIAL_CONDITION); + + if (haplotypeStartOffset == 0) + for (int i = zeroRow + 1; i < toRow; i++) + insertionMatrix[i][0] = matchMatrix[i][0] = deletionMatrix[i][0] = 0; + + } else { + Arrays.fill(matchMatrix[zeroRow], Math.max(1,haplotypeStartOffset), toCol,0); + Arrays.fill(insertionMatrix[zeroRow], haplotypeStartOffset, toCol,0); + if (haplotypeStartOffset == 0) { + matchMatrix[zeroRow][0] = INITIAL_CONDITION; + deletionMatrix[zeroRow][0] = 0; + } + if (haplotypeStartOffset <= 1) deletionMatrix[zeroRow][1] = matchMatrix[zeroRow][1] * transition[zeroRow][matchToDeletion]; + for (int i = Math.max(haplotypeStartOffset,2); i < toCol; i++) { + deletionMatrix[zeroRow][i] = deletionMatrix[zeroRow][i - 1] + * transition[zeroRow][deletionToDeletion]; + } + + if (haplotypeStartOffset == 0) { + matchMatrix[zeroRow + 1][0] = deletionMatrix[zeroRow + 1][0] = 0; + insertionMatrix[zeroRow + 1][0] = matchMatrix[zeroRow][0] * transition[zeroRow + 1][matchToInsertion]; + + + for (int i = zeroRow + 2; i < toRow; i++) { + matchMatrix[i][0] = deletionMatrix[i][0] = 0; + insertionMatrix[i][0] = insertionMatrix[i - 
1][0] + * transition[i][insertionToInsertion]; + } + } + } + } + + /** + * Initializes calculation matrices give the characteristics of the next and previous problems. + * @param currentProblem reference to the Lk calculation problem we are dealing currently. + * @param previousProblem reference to the Lk calculation problem that has been solved just before. + * + */ + protected void initializeMatrixValues(final Problem currentProblem, final Problem previousProblem) { + if (previousProblem != null && + previousProblem.readStart == currentProblem.readStart && + previousProblem.hapStart == currentProblem.hapStart && + maxToCol >= currentProblem.hapEnd + 1) + return; + + final int zeroRow = currentProblem.readStart; + final int zeroCol = currentProblem.hapStart; + final int toRow = currentProblem.readEnd + 1; + final int toCol = currentProblem.hapEnd + 1; + maxToCol = toCol; + + // fill first row with -Inf fot M and I but not for Deletion if leading + // to allow for free deletions at the beginning. + if (currentProblem.leading) { + // First row initialization: + Arrays.fill(matchMatrix[zeroRow],zeroCol,toCol,0); + Arrays.fill(deletionMatrix[zeroRow],zeroCol,toCol,INITIAL_CONDITION); + + for (int i = zeroRow + 1; i < toRow; i++) + insertionMatrix[i][zeroCol] = matchMatrix[i][zeroCol] = deletionMatrix[i][zeroCol] = 0; + + } else { // If not leading set the appropriate matching 1.0 prob and + // deletion + extension. 
+ + Arrays.fill(matchMatrix[zeroRow], zeroCol + 1, toCol,0); + Arrays.fill(insertionMatrix[zeroRow], zeroCol, toCol,0); + matchMatrix[zeroRow][zeroCol] = INITIAL_CONDITION; + deletionMatrix[zeroRow][zeroCol] = 0; + deletionMatrix[zeroRow][zeroCol + 1] = matchMatrix[zeroRow][zeroCol] * transition[zeroRow][matchToDeletion]; + for (int i = zeroCol + 2; i < toCol; i++) { + deletionMatrix[zeroRow][i] = deletionMatrix[zeroRow][i - 1] + * transition[zeroRow][deletionToDeletion]; + } + + matchMatrix[zeroRow + 1][zeroCol] = deletionMatrix[zeroRow + 1][zeroCol] = 0; + insertionMatrix[zeroRow + 1][zeroCol] = matchMatrix[zeroRow][zeroCol] * transition[zeroRow + 1][matchToInsertion]; + + for (int i = zeroRow + 2; i < toRow; i++) { + matchMatrix[i][zeroCol] = deletionMatrix[i][zeroCol] = 0; + insertionMatrix[i][zeroCol] = insertionMatrix[i - 1][zeroCol] + * transition[i][insertionToInsertion]; + } + } + } + + /** + * Constant gap-continuation-penalty. + */ + private final byte constantGCP; + + /** + * Currently loaded haplotype base sequence. + */ + private byte[] haplotypeBases; + + /** + * Currently loaded read base sequence. + */ + private byte[] readBases; + + /** + * Read qualities. + */ + private byte[] readQuals; + + /** + * Read insertion qualities. + */ + private byte[] readInsQuals; + + /** + * Read deletion qualities. + */ + private byte[] readDelQuals; + + /** + * Read gap-extension-penalties. + */ + private byte[] readGepQuals; + + /** + * Cached results. + */ + private Map cachedResults = new HashMap<>(); + + /** + * Loads the read that is going to be evaluated in following calls to {@link #calculateLocalLikelihoods}. + * + * @param read the target read. + * @throws NullPointerException if {@code read} is null. 
+ */ + @Override + public void loadRead(final GATKSAMRecord read) { + loadRead(read.getReadBases(),read.getBaseQualities(),read.getBaseInsertionQualities(),read.getBaseDeletionQualities(),read.getMappingQuality()); + } + + /** + * Loads the read that is going to be evaluated in following calls to {@link #calculateLocalLikelihoods}. + * + * @param readBases the read bases. + * @param readQuals the read base call quality scores. + * @param readInsQuals the read insertion quality scores. + * @param readDelQuals the read deletion quality scores. + * @param mq the read mapping quality score. + * @throws NullPointerException if any of the arrays passed is {@code null}. + * @throws IllegalArgumentException if the arrays passed have incompatible sizes. + */ + public void loadRead(final byte[] readBases, final byte[] readQuals, final byte[] readInsQuals, final byte[] readDelQuals, int mq) { + // TODO This is a copy&paste from PairHMM*Engine read data preparation code. + // TODO It is simply to difficult to share the code without changing that class and I don't want + // TODO to do so for now. 
+ if (readBases.length != readQuals.length) throw new IllegalArgumentException("the read quality array length does not match the read base array length"); + if (readBases.length != readInsQuals.length) throw new IllegalArgumentException("the read insert quality array length does not match the read base array length"); + if (readBases.length != readDelQuals.length) throw new IllegalArgumentException("the read deletion quality length does not match the read base array length"); + maxToCol = 0; + + if (readBases.length > readCapacity) { + readCapacity = readBases.length; + initialize(readCapacity,haplotypeCapacity); + } + paddedReadLength = readBases.length + 1; + final byte[] overallGCP = new byte[readBases.length]; + Arrays.fill(overallGCP, constantGCP); // Is there a way to derive + + for (int kkk = 0; kkk < readQuals.length; kkk++) { + readQuals[kkk] = (byte) Math.min(0xff & readQuals[kkk], + mq); // cap base quality by mapping + readQuals[kkk] = (byte) (readQuals[kkk] < PairHMMLikelihoodCalculationEngine.BASE_QUALITY_SCORE_THRESHOLD ? 
QualityUtils.MIN_USABLE_Q_SCORE + : Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readQuals[kkk])); + readInsQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readInsQuals[kkk]); + readDelQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readDelQuals[kkk]); + } + this.readBases = readBases; + this.readQuals = readQuals; + this.readInsQuals = readInsQuals; + this.readDelQuals = readDelQuals; + this.readGepQuals = overallGCP; + initializeProbabilities(transition,readInsQuals, readDelQuals, overallGCP); + if (haplotypeBases != null) + fillPriorsTable(0); + cachedResults.clear(); + } + + @Override + public void loadHaplotypeBases(final byte[] haplotypeBases) { + if (readBases == null) + throw new IllegalStateException( + "no read was loaded before the haplotype"); + this.haplotypeBases = haplotypeBases.clone(); + haplotypeLength = haplotypeBases.length; + paddedHaplotypeLength = haplotypeLength; + if (haplotypeCapacity < haplotypeLength) { + haplotypeCapacity = haplotypeLength; + initialize(readCapacity,haplotypeCapacity); + initializeProbabilities(transition, readInsQuals, readDelQuals, readGepQuals); + } + initializePriors(this.haplotypeBases, readBases, readQuals, 0); + } + + + /** + * Changes only the suffix of the currently loaded haplotype. + * + *

+ * If {@code from} is 0, this is equivalent to calling {@link #loadHaplotypeBases(byte[])} directly.

+ * @param from first position on the current haplotype to substitute with the new suffix. + * It can be up to the length of the haplotype in such case this operation is in + * effect just extending that haplotype. + * @param suffix the new bases for the end part of the current haplotype. + * @param suffixFrom inclusive first position of the actual suffix within the {@code suffix} array. + * @param suffixTo exclusive last position of the actual suffix within the {@code suffix} array. + * + * @throws IllegalStateException if no read was loaded with {@link #loadRead}. + * @throws IllegalArgumentException if from is more than 0 but no haplotype was loaded previously or if indices passed are inconsistent. + * @throws ArrayIndexOutOfBoundsException if indices passed are outside valid ranges. + */ + public void changeHaplotypeSuffix(final int from, final byte[] suffix, final int suffixFrom, final int suffixTo) { + if (readBases == null) + throw new IllegalStateException( + "no read was loaded before the haplotype"); + if (haplotypeBases == null && from > 0) + throw new IllegalArgumentException("from cannot be larger than 0 if no haplotype bases was previously loaded"); + if (suffixFrom < 0) + throw new ArrayIndexOutOfBoundsException("the suffix from index cannot be negative"); + if (suffixTo > suffix.length) + throw new ArrayIndexOutOfBoundsException("the suffix to index cannot be larger than the suffix array length"); + if (suffixFrom > suffixTo) + throw new IllegalArgumentException("the suffix to index cannot be smaller than the suffix from index"); + if (from > haplotypeLength) + throw new IllegalArgumentException("the from index cannot be greater than the current haplotype length"); + if (from < 0) + throw new IllegalArgumentException("the from index cannot be negative"); + + int startIndex = from; + if (haplotypeBases == null) { + haplotypeBases = Arrays.copyOfRange(suffix,suffixFrom,suffixTo); + haplotypeLength = suffixTo - suffixFrom; + } else { + final int 
newLength = from + suffixTo - suffixFrom; + if (haplotypeBases.length < newLength) + haplotypeBases = Arrays.copyOf(haplotypeBases,newLength); + System.arraycopy(suffix,suffixFrom,haplotypeBases,from,newLength - from); + haplotypeLength = newLength; + } + paddedHaplotypeLength = haplotypeLength + 1; + if (haplotypeCapacity < haplotypeLength) { + haplotypeCapacity = haplotypeLength; + initialize(readCapacity,haplotypeCapacity); + initializeProbabilities(transition, readInsQuals, readDelQuals, readGepQuals); + startIndex = 0; + } + //startIndex = 0; + fillPriorsTable(startIndex); + } + + /** + * Returns the bases of the current haplotype. + * + * @throws IllegalStateException if no haplotype was loaded previously + * @return never {@code null} + */ + public byte[] getHaplotypeBases() { + if (haplotypeBases == null) + throw new IllegalStateException(); + return Arrays.copyOfRange(haplotypeBases,0,haplotypeLength); + } + + /** + * Returns a debug representation of the pair-hmm. + * @return never {@code null}. + */ + public String toString() { + return "" + haplotypeLength + ":" + new String(Arrays.copyOfRange(haplotypeBases,0,haplotypeLength)); + } + + @Override + protected void initializePriors(final byte[] hapBases, final byte[] readBases, final byte[] baseQuals, final int idx) { + haplotypeBases = hapBases; + haplotypeLength = haplotypeBases.length; + this.readBases = readBases; + this.readQuals = baseQuals; + fillPriorsTable(idx); + } + + /** + * Fills the prior table up. + * + *

+ * It accepts a start index so that refilling an unchanged haplotype prefix can be skipped.

+ * + * @param idx first position in the haplotype to start filling from. + */ + protected void fillPriorsTable(final int idx) { + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = idx; j < haplotypeLength; j++) { + final byte y = haplotypeBases[j]; + prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) ); + } + } + } + + + /** + * Decorates haplotype set with their likelihoods as compared with the currently loaded read. + * + * + * @param readStart inclusive start position of the targeted section of the read. + * @param readEnd exclusive end position just beyond the targeted section of the read. + * @param haplotypes in/out set of haplotypes. + */ + public void calculateLocalLikelihoods(final int readStart, final int readEnd, final PairHMMReadyHaplotypes haplotypes) { + final PairHMMReadyHaplotypes.Iterator entryIterator = haplotypes.iterator(); + boolean isFirst = true; + while (entryIterator.hasNext()) { + entryIterator.next(); + final int startIndex = entryIterator.startIndex(); + final byte[] bases = entryIterator.bases(); + changeHaplotypeSuffix(startIndex,bases,startIndex,bases.length); + final double likelihood = calculateLikelihood(readStart, readEnd, startIndex, isFirst); + isFirst = false; + entryIterator.setLikelihood(likelihood); + } + } + + + + @Override + public double calculateLocalLikelihood(final int readStart, final int readEnd, + final int hapStart, final int hapEnd, final boolean kmerMatch) { + if (readBases == null || haplotypeBases == null) + throw new IllegalStateException("read or haplotype was not loaded"); + final int hapSegmentLength = hapEnd - hapStart; + final int readSegmentLength = readEnd - readStart; + // trivial case when there is a single base match. 
+        if (kmerMatch) {
+            // Caller asserts both segments are the same sequence: closed-form product of priors.
+            return calculateLocalLikelihoodsExactMatch(readStart, hapStart, hapSegmentLength, readSegmentLength);
+        } else if (hapSegmentLength == readSegmentLength) {
+            if (hapSegmentLength == 0) {
+                return calculateLocalLikelihoodEmptySquare(readStart, readEnd);
+            } else if (hapSegmentLength == 1) {
+                return calculateLocalLikelihoodSingleBase(readStart, readEnd, hapStart);
+            } else { // general (slower) solution.
+                return calculateLocalLikelihoodsGeneral(readStart, readEnd, hapStart, hapEnd);
+            }
+        } else if (hapSegmentLength == 0) { // empty haplotype segment: must be a full insertion in the read.
+            return calculateLocalLikelihoodInsertion(readStart, readEnd);
+        } else if (readSegmentLength == 0) { // full deletion.
+            return calculateLocalLikelihoodDeletion(readStart, hapStart, hapEnd);
+        } else { // general (slower) solution.
+            return calculateLocalLikelihoodsGeneral(readStart, readEnd, hapStart, hapEnd);
+        }
+    }
+
+    /**
+     * Fast likelihood when the pair-hmm represents a deletion in the read.
+     *
+     * @param readStart inclusive read position at which the deletion occurs.
+     * @param hapStart inclusive first haplotype position of the deleted segment.
+     * @param hapEnd exclusive last haplotype position of the deleted segment.
+     * @return 0 or less (log10 scaled).
+     */
+    private double calculateLocalLikelihoodDeletion(final int readStart, final int hapStart, final int hapEnd) {
+        double result = INITIAL_CONDITION;
+        if (readStart > 0) { // no penalty if at the beginning: leading deletions are free.
+            // Open the deletion, extend it over the remaining deleted bases, then close it.
+            result *= transition[readStart][matchToDeletion];
+            result *= StrictMath.pow(transition[readStart][deletionToDeletion],hapEnd - hapStart - 1);
+            result *= transition[readStart][indelToMatch];
+        }
+        return StrictMath.log10(result) - INITIAL_CONDITION_LOG10;
+    }
+
+
+    /**
+     * Fast likelihood when the pair-hmm represents an insertion in the read.
+ */
+    private double calculateLocalLikelihoodInsertion(final int readStart, final int readEnd) {
+        double result = INITIAL_CONDITION;
+        // Open the insertion...
+        result *= transition[readStart + 1][matchToInsertion];
+        // ...extend it across the remaining inserted read bases...
+        for (int i = readStart + 1; i < readEnd; i++) {
+            result *= transition[i + 1][insertionToInsertion];
+        }
+        // ...and close it, unless the insertion runs to the end of the read.
+        if (readEnd < readBases.length) {
+            result *= transition[readEnd + 1][indelToMatch];
+        }
+        return StrictMath.log10(result) - INITIAL_CONDITION_LOG10;
+    }
+
+    /**
+     * Single-base segment fast likelihood calculation; the prior term covers both the
+     * match and the mismatch case.
+     */
+    private double calculateLocalLikelihoodSingleBase(final int readStart, final int readEnd, final int hapStart) {
+        double result = INITIAL_CONDITION;
+        result *= prior[readStart + 1][hapStart + 1];
+        if (readStart > 0) { // no left transition cost at the very start of the read.
+            result *= transition[readStart + 1][matchToMatch];
+        }
+        if (readEnd < readBases.length) { // no right transition cost at the very end of the read.
+            result *= transition[readEnd + 1][matchToMatch];
+        }
+        return StrictMath.log10(result) - INITIAL_CONDITION_LOG10;
+    }
+
+    /**
+     * Empty square Pair-hmm: zero-length read and haplotype segments.
+     */
+    private double calculateLocalLikelihoodEmptySquare(final int readStart, final int readEnd) {
+        double result = INITIAL_CONDITION;
+        if (readStart > 0 && readEnd < readBases.length) { // interior square: a single match-to-match hop.
+            result *= transition[readStart + 1][matchToMatch];
+        }
+        return StrictMath.log10(result) - INITIAL_CONDITION_LOG10;
+    }
+
+    /**
+     * Likelihood assuming that there is an exact match between both sequences: read and haplotype.
+     */
+    private double calculateLocalLikelihoodsExactMatch(final int readStart, final int hapStart, final int hapSegmentLength, final int readSegmentLength) {
+        double result = INITIAL_CONDITION;
+        if (hapSegmentLength == 1) {
+            result *= prior[readStart + 1][hapStart + 1];
+        } else {
+            for (int i = 0; i < readSegmentLength; i++) {
+                result *= prior[readStart + i + 1][hapStart + i + 1];
+                if (i > 0) { // the first base carries no transition cost.
+                    result *= transition[readStart + i + 1][matchToMatch];
+                }
+            }
+        }
+        return StrictMath.log10(result) - INITIAL_CONDITION_LOG10;
+    }
+
+    /**
+     * Revert to a general pair-hmm solution.
+ */
+    private double calculateLocalLikelihoodsGeneral(final int readStart, final int readEnd, final int hapStart, final int hapEnd) {
+        // Problem instances are value objects keyed on the segment coordinates and the
+        // haplotype bases they cover, so identical sub-problems are solved only once
+        // per loaded read (the cache is cleared by loadRead).
+        final Problem p = new Problem(readStart, readEnd, hapStart, hapEnd);
+        final Double cachedCost = cachedResults.get(p);
+        if (cachedCost != null) {
+            return cachedCost;
+        }
+        double cost = calculateLocalLikelihoodGeneral(p);
+        cachedResults.put(p, cost);
+        return cost;
+    }
+
+    /**
+     * Resolve the regular full pair-hmm.
+     *

+ * With the possibility of reuse the previous haplotype common prefix by using + * a startIndex which is greater than 0. + */ + private double calculateLikelihood(final int readStart, final int readEnd, final int startIndex, final boolean initializeEdges) { + final int edgeStart = initializeEdges ? 0 : startIndex + 1; + initializeMatrixValuesForTrailingProblem(readStart, readEnd, edgeStart); + updateTable(readStart + 1, readEnd + 1, startIndex + 1, haplotypeLength + 1); + if (readEnd == readBases.length) + return finalLikelihoodCalculation(readEnd,0,haplotypeLength + 1) - (readStart == 0 ? StrictMath.log10(haplotypeLength) : 0); + else { + final double divider = 3.0; + final double dividerInverted = 1.0 / divider; + return StrictMath.log10(matchMatrix[readEnd][haplotypeLength] + * transition[readEnd][matchToMatch] * dividerInverted + + insertionMatrix[readEnd][haplotypeLength] + * transition[readEnd][indelToMatch] * dividerInverted + + deletionMatrix[readEnd][haplotypeLength] + * transition[readEnd][indelToMatch] * dividerInverted) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider); + } + } + + + private double calculateLocalLikelihoodGeneral(final Problem p) { + + initializeMatrixValues(p,null); + // int fromCol = p.hapStart + 1; + // if (previousProblem == null) { + // fromCol = p.hapStart + 1; + // } else { + // final int sharedPrefix = previousProblem.followerStartIndex(p); + // if (sharedPrefix >= 0) + // fromCol = sharedPrefix + 1; + // else + // fromCol = p.hapStart + 1; + // } + // previousProblem = p; + + updateTable(p.readStart + 1, p.readEnd + 1, + p.hapStart + 1, p.hapEnd + 1); + + if (p.trailing) { + return finalLikelihoodCalculation(p.readEnd,p.hapStart,p.hapEnd + 1) + - (p.leading ? 
StrictMath.log10(p.hapEnd - p.hapStart) : 0); + } else { + final double divider = 3.0; + final double dividerInverted = 1.0 / divider; + return StrictMath.log10(matchMatrix[p.readEnd][p.hapEnd] + * transition[p.readEnd][matchToMatch] * dividerInverted + + insertionMatrix[p.readEnd][p.hapEnd] + * transition[p.readEnd][indelToMatch] * dividerInverted + + deletionMatrix[p.readEnd][p.hapEnd] + * transition[p.readEnd][indelToMatch] * dividerInverted) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider); + } + } + + private void updateTable(final int rowFrom, final int rowTo, + final int colFrom, final int colTo) { + + for (int i = rowFrom; i < rowTo; i++) { + for (int j = colFrom; j < colTo; j++) { + updateCell(i, j, prior[i][j], transition[i]); + } + } + + } + + /** + * Holds the properties of a pair-hmm computational problem. + */ + public class Problem { + private final byte[] haplotypeSegment; + private final int readStart; + private final int readEnd; + private final int hapStart; + private final int hapEnd; + private final int hashCode; + private final boolean trailing; + private final boolean leading; + + /** + * Construct a new project object. + * @param start inclusive start position on the read to consider. + * @param end exclusive after last position on the read to consider. + * @param hapStart inclusive start position on the haplotype to consider. + * @param hapEnd exclusive after last position on the haplotype to consider. 
+ */ + public Problem(final int start, final int end, final int hapStart, + final int hapEnd) { + if (start < 0 || start > readBases.length) + throw new IllegalArgumentException("bad start index " + start); + if (end < start || end > readBases.length) + throw new IllegalArgumentException("bad end index " + end + " < " + start + " or " + end + " > " + readBases.length); + if (hapStart < 0 || hapStart > haplotypeLength) + throw new IllegalArgumentException("bad hap start index " + + hapStart + " is larger than the haplotypeLength " + haplotypeLength); + if (hapEnd < hapStart || hapEnd > haplotypeLength) + throw new IllegalArgumentException("bad hap end index " + + hapEnd + " outside [" + hapStart + "," + + haplotypeLength + "]"); + + haplotypeSegment = Arrays.copyOfRange(haplotypeBases, hapStart, hapEnd); + readStart = start; + readEnd = end; + this.hapStart = hapStart; + this.hapEnd = hapEnd; + trailing = readEnd == readBases.length; + leading = readStart == 0; + + hashCode = ((start * 31 + end) * 31 + Arrays.hashCode(haplotypeSegment) * 31); + } + + @Override + public int hashCode() { + return hashCode; + } + + @Override + public boolean equals(Object o) { + if (o == this) + return true; + else if (o == null) + return false; + else if (o.getClass() != this.getClass()) + return false; + else { + final Problem p = (Problem) o; + return (p.hashCode == this.hashCode) && (p.readStart == this.readStart) && (p.readEnd == this.readEnd) && Arrays.equals(haplotypeSegment, p.haplotypeSegment); + } + } + + + } + + /** + * Returns the currently loaded read base calls. + * @return {@code never null}. 
+ */ + public byte[] getReadBases() { + if (readBases == null) + throw new IllegalStateException("no read was previously loaded."); + return readBases; + } + + +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java new file mode 100644 index 000000000..152274947 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java @@ -0,0 +1,105 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * API for the fast (partial) HMM calculation engine. + */ +public interface FlexibleHMM { + + /** + * Load a read into the HMM calculation matrices. + * @param read the read record to load into the HMM calculating engine. + */ + public void loadRead(GATKSAMRecord read); + + /** + * Returns the current read bases. + * + * @return never null. 
+ */ + public byte[] getReadBases(); + + /** + * Loads a haplotype bases in the HMM calculation matrices. + * @param haplotype the haplotype sequence. + * + * @throws IllegalStateException if no read has been previously loaded. + * @throws NullPointerException if {@code haplotype} is {@code null}. + */ + public void loadHaplotypeBases(byte[] haplotype); + + /** + * Resolve the partial Fast PairHMM for a section of the read and haplotype + * @param readFrom inclusive offset of the first position on the read. + * @param readTo exclusive offset of the last position on the read. + * @param haplotypeFrom inclusive offset of the first position on the haplotype. + * @param haplotypeTo exclusive offset of the last position on the haplotype. + * @param treatAsMatch can assume that both pieces are the same sequence. + * @return the cost the sub-HMM. + */ + public double calculateLocalLikelihood(int readFrom, int readTo, int haplotypeFrom, int haplotypeTo, boolean treatAsMatch); + + /** + * Load a read given its relevant information pieces by separate. + * @param bases read bases. + * @param bq base qualities. + * @param iq insertion qualities. + * @param dq deletion qualities. + * @param mq read mapping quality. + */ + public void loadRead(byte[] bases, byte[] bq, byte[] iq, byte[] dq, int mq); + + + /** + * Returns the constant gap extension penalty in Phred scale + * @return never @code null. 
+ */ + byte getGapExtensionPenalty(); +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java new file mode 100644 index 000000000..ed35e6970 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java @@ -0,0 +1,180 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.QualityUtils; + +import static org.broadinstitute.sting.utils.pairhmm.PairHMMModel.*; + + +/** + * Created with IntelliJ IDEA. 
+ * User: rpoplin, carneiro + * Date: 10/16/12 + */ +public class LoglessPairHMM extends N2MemoryPairHMM { + protected static final double INITIAL_CONDITION = Math.pow(2, 1020); + protected static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION); + + // we divide e by 3 because the observed base could have come from any of the non-observed alleles + protected static final double TRISTATE_CORRECTION = 3.0; + + + + /** + * {@inheritDoc} + */ + @Override + public double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues, + final int nextHapStartIndex) { + + if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) { + final double initialValue = INITIAL_CONDITION / haplotypeBases.length; + // set the initial value (free deletions in the beginning) for the first row in the deletion matrix + for( int j = 0; j < paddedHaplotypeLength; j++ ) { + deletionMatrix[0][j] = initialValue; + } + } + + if ( ! constantsAreInitialized || recacheReadValues ) { + initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP); + + // note that we initialized the constants + constantsAreInitialized = true; + } + + initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); + + for (int i = 1; i < paddedReadLength; i++) { + // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based + for (int j = hapStartIndex+1; j < paddedHaplotypeLength; j++) { + updateCell(i, j, prior[i][j], transition[i]); + } + } + + // final probability is the log10 sum of the last element in the Match and Insertion state arrays + // this way we ignore all paths that ended in deletions! (huge) + // but we have to sum all the paths ending in the M and I matrices, because they're no longer extended. 
+ final int endI = paddedReadLength - 1; + double finalSumProbabilities = 0.0; + for (int j = 1; j < paddedHaplotypeLength; j++) { + finalSumProbabilities += matchMatrix[endI][j] + insertionMatrix[endI][j]; + } + return Math.log10(finalSumProbabilities) - INITIAL_CONDITION_LOG10; + } + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + protected void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. + + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. 
+ * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + @Requires({ + "insertionGOP != null", + "deletionGOP != null", + "overallGCP != null" + }) + @Ensures("constantsAreInitialized") + protected static void initializeProbabilities(final double[][] transition, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { + PairHMMModel.qualToTransProbs(transition,insertionGOP,deletionGOP,overallGCP); + } + + /** + * Updates a cell in the HMM matrix + * + * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the + * initial conditions + + * @param indI row index in the matrices to update + * @param indJ column index in the matrices to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param transition an array with the six transition relevant to this location + */ + protected void updateCell( final int indI, final int indJ, final double prior, final double[] transition) { + + matchMatrix[indI][indJ] = prior * ( matchMatrix[indI - 1][indJ - 1] * transition[matchToMatch] + + insertionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] + + deletionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] ); + insertionMatrix[indI][indJ] = matchMatrix[indI - 1][indJ] * transition[matchToInsertion] + insertionMatrix[indI - 1][indJ] * transition[insertionToInsertion]; + deletionMatrix[indI][indJ] = matchMatrix[indI][indJ - 1] * transition[matchToDeletion] + deletionMatrix[indI][indJ - 1] * transition[deletionToDeletion]; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java similarity index 100% rename from 
protected/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java new file mode 100644 index 000000000..6efed2689 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java @@ -0,0 +1,170 @@ +/* +* By downloading the PROGRAM you agree to the 
following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.recalibration; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.LRUCache; + +/** + * The object temporarily held by a read that describes all of it's covariates. + * + * In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap + * + * @author Mauricio Carneiro + * @since 2/8/12 + */ +public class ReadCovariates { + private final static Logger logger = Logger.getLogger(ReadCovariates.class); + + /** + * How big should we let the LRU cache grow + */ + private static final int LRU_CACHE_SIZE = 500; + + /** + * Use an LRU cache to keep cache of keys (int[][][]) arrays for each read length we've seen. + * The cache allows us to avoid the expense of recreating these arrays for every read. The LRU + * keeps the total number of cached arrays to less than LRU_CACHE_SIZE. + * + * This is a thread local variable, so the total memory required may grow to N_THREADS x LRU_CACHE_SIZE + */ + private final static ThreadLocal> keysCache = new ThreadLocal>() { + @Override protected LRUCache initialValue() { + return new LRUCache(LRU_CACHE_SIZE); + } + }; + + /** + * The keys cache is only valid for a single covariate count. 
Normally this will remain constant for the analysis. + * If running multiple analyses (or the unit test suite), it's necessary to clear the cache. + */ + public static void clearKeysCache() { + keysCache.remove(); + } + + /** + * Our keys, indexed by event type x read length x covariate + */ + private final int[][][] keys; + + /** + * The index of the current covariate, used by addCovariate + */ + private int currentCovariateIndex = 0; + + public ReadCovariates(final int readLength, final int numberOfCovariates) { + final LRUCache cache = keysCache.get(); + final int[][][] cachedKeys = cache.get(readLength); + if ( cachedKeys == null ) { + // There's no cached value for read length so we need to create a new int[][][] array + if ( logger.isDebugEnabled() ) logger.debug("Keys cache miss for length " + readLength + " cache size " + cache.size()); + keys = new int[EventType.values().length][readLength][numberOfCovariates]; + cache.put(readLength, keys); + } else { + keys = cachedKeys; + } + } + + public void setCovariateIndex(final int index) { + currentCovariateIndex = index; + } + + /** + * Update the keys for mismatch, insertion, and deletion for the current covariate at read offset + * + * NOTE: no checks are performed on the number of covariates, for performance reasons. If the count increases + * after the keysCache has been accessed, this method will throw an ArrayIndexOutOfBoundsException. This currently + * only occurs in the testing harness, and we don't anticipate that it will become a part of normal runs. 
+ * + * @param mismatch the mismatch key value + * @param insertion the insertion key value + * @param deletion the deletion key value + * @param readOffset the read offset, must be >= 0 and <= the read length used to create this ReadCovariates + */ + public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { + keys[EventType.BASE_SUBSTITUTION.ordinal()][readOffset][currentCovariateIndex] = mismatch; + keys[EventType.BASE_INSERTION.ordinal()][readOffset][currentCovariateIndex] = insertion; + keys[EventType.BASE_DELETION.ordinal()][readOffset][currentCovariateIndex] = deletion; + } + + /** + * Get the keys for all covariates at read position for error model + * + * @param readPosition + * @param errorModel + * @return + */ + public int[] getKeySet(final int readPosition, final EventType errorModel) { + return keys[errorModel.ordinal()][readPosition]; + } + + public int[][] getKeySet(final EventType errorModel) { + return keys[errorModel.ordinal()]; + } + + // ---------------------------------------------------------------------- + // + // routines for testing + // + // ---------------------------------------------------------------------- + + protected int[][] getMismatchesKeySet() { return getKeySet(EventType.BASE_SUBSTITUTION); } + protected int[][] getInsertionsKeySet() { return getKeySet(EventType.BASE_INSERTION); } + protected int[][] getDeletionsKeySet() { return getKeySet(EventType.BASE_DELETION); } + + protected int[] getMismatchesKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_SUBSTITUTION); + } + + protected int[] getInsertionsKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_INSERTION); + } + + protected int[] getDeletionsKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_DELETION); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalDatum.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalDatum.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalUtils.java new file mode 100644 index 000000000..325237d05 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -0,0 +1,1064 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.recalibration; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.classloader.JVMUtils; +import org.broadinstitute.sting.utils.recalibration.covariates.*; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.R.RScriptExecutor; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.collections.NestedIntegerArray; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.io.Resource; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.io.*; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 6, 2009 + * + * This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions. + * It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias. + * This class holds the parsing methods that are shared between BaseRecalibrator and PrintReads. 
+ */ + +public class RecalUtils { + public final static String ARGUMENT_REPORT_TABLE_TITLE = "Arguments"; + public final static String QUANTIZED_REPORT_TABLE_TITLE = "Quantized"; + public final static String READGROUP_REPORT_TABLE_TITLE = "RecalTable0"; + public final static String QUALITY_SCORE_REPORT_TABLE_TITLE = "RecalTable1"; + public final static String ALL_COVARIATES_REPORT_TABLE_TITLE = "RecalTable2"; + + public final static String ARGUMENT_COLUMN_NAME = "Argument"; + public final static String ARGUMENT_VALUE_COLUMN_NAME = "Value"; + public final static String QUANTIZED_VALUE_COLUMN_NAME = "QuantizedScore"; + public static final String QUANTIZED_COUNT_COLUMN_NAME = "Count"; + public final static String READGROUP_COLUMN_NAME = "ReadGroup"; + public final static String EVENT_TYPE_COLUMN_NAME = "EventType"; + public final static String EMPIRICAL_QUALITY_COLUMN_NAME = "EmpiricalQuality"; + public final static String ESTIMATED_Q_REPORTED_COLUMN_NAME = "EstimatedQReported"; + public final static String QUALITY_SCORE_COLUMN_NAME = "QualityScore"; + public final static String COVARIATE_VALUE_COLUMN_NAME = "CovariateValue"; + public final static String COVARIATE_NAME_COLUMN_NAME = "CovariateName"; + public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations"; + public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors"; + + private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams + private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color + private static boolean warnUserNullPlatform = false; + + private static final String SCRIPT_FILE = "BQSR.R"; + + private static final Pair covariateValue = new Pair(RecalUtils.COVARIATE_VALUE_COLUMN_NAME, "%s"); + private static final Pair covariateName = new Pair(RecalUtils.COVARIATE_NAME_COLUMN_NAME, "%s"); + 
private static final Pair eventType = new Pair(RecalUtils.EVENT_TYPE_COLUMN_NAME, "%s"); + private static final Pair empiricalQuality = new Pair(RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f"); + private static final Pair estimatedQReported = new Pair(RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f"); + private static final Pair nObservations = new Pair(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); + private static final Pair nErrors = new Pair(RecalUtils.NUMBER_ERRORS_COLUMN_NAME, "%.2f"); + + /** + * Generates two lists : required covariates and optional covariates based on the user's requests. + * + * Performs the following tasks in order: + * 1. Adds all requierd covariates in order + * 2. Check if the user asked to use the standard covariates and adds them all if that's the case + * 3. Adds all covariates requested by the user that were not already added by the two previous steps + * + * @param argumentCollection the argument collection object for the recalibration walker + * @return a pair of ordered lists : required covariates (first) and optional covariates (second) + */ + public static Pair, ArrayList> initializeCovariates(RecalibrationArgumentCollection argumentCollection) { + final List> covariateClasses = new PluginManager(Covariate.class).getPlugins(); + final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins(); + final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins(); + + final ArrayList requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates + ArrayList optionalCovariates = new ArrayList(); + if (!argumentCollection.DO_NOT_USE_STANDARD_COVARIATES) + optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user + + // parse the -cov arguments that were provided, skipping over the ones already specified + if (argumentCollection.COVARIATES != null) { + for (String 
requestedCovariateString : argumentCollection.COVARIATES) { + // help the transition from BQSR v1 to BQSR v2 + if ( requestedCovariateString.equals("DinucCovariate") ) + throw new UserException.CommandLineException("DinucCovariate has been retired. Please use its successor covariate " + + "ContextCovariate instead, which includes the 2 bp (dinuc) substitution model of the retired DinucCovariate " + + "as well as an indel context to model the indel error rates"); + + boolean foundClass = false; + for (Class covClass : covariateClasses) { + if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class + foundClass = true; + if (!requiredClasses.contains(covClass) && + (argumentCollection.DO_NOT_USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) { + try { + final Covariate covariate = covClass.newInstance(); // now that we've found a matching class, try to instantiate it + optionalCovariates.add(covariate); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); + } + } + } + } + + if (!foundClass) { + throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates."); + } + } + } + return new Pair, ArrayList>(requiredCovariates, optionalCovariates); + } + + /** + * Adds the required covariates to a covariate list + * + * Note: this method really only checks if the classes object has the expected number of required covariates, then add them by hand. 
+ * + * @param classes list of classes to add to the covariate list + * @return the covariate list + */ + private static ArrayList addRequiredCovariatesToList(List> classes) { + ArrayList dest = new ArrayList(classes.size()); + if (classes.size() != 2) + throw new ReviewedStingException("The number of required covariates has changed, this is a hard change in the code and needs to be inspected"); + + dest.add(new ReadGroupCovariate()); // enforce the order with RG first and QS next. + dest.add(new QualityScoreCovariate()); + return dest; + } + + /** + * Adds the standard covariates to a covariate list + * + * @param classes list of classes to add to the covariate list + * @return the covariate list + */ + private static ArrayList addStandardCovariatesToList(List> classes) { + ArrayList dest = new ArrayList(classes.size()); + for (Class covClass : classes) { + try { + final Covariate covariate = (Covariate) covClass.newInstance(); + dest.add(covariate); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); + } + } + return dest; + } + + /** + * Print a list of all available covariates to logger as info + * + * @param logger + */ + public static void listAvailableCovariates(final Logger logger) { + logger.info("Available covariates:"); + for (final Class covClass : new PluginManager(Covariate.class).getPlugins()) { + logger.info(String.format("\t%30s\t%s", covClass.getSimpleName(), JVMUtils.classInterfaces(covClass))); + } + } + + /** + * Component used to print out csv representation of the reports that can be use to perform analysis in + * external tools. E.g. generate plots using R scripts. + *

+ * A header is always printed into the output stream (or file) when the printer is created. Then you only need + * to call {@link #print(RecalibrationReport,String) print} for each report you want to include in the csv file. + * Once finished, you close the printer calling {@link #close() close} + * + */ + private static class CsvPrinter { + + private final PrintStream ps; + private final Covariate[] covariates; + + /** + * Constructs a printer redirected to an output file. + * @param out the output file. + * @param c covariates to print out. + * @throws FileNotFoundException if the file could not be created anew. + */ + protected CsvPrinter(final File out, final Covariate ... c) + throws FileNotFoundException { + this(new FileOutputStream(out), c); + } + + /** + * Constructs a printer redirected to an output stream + * @param os the output. + * @param c covariates to print out. + */ + protected CsvPrinter(final OutputStream os, final Covariate ... c) { + covariates = c == null ? new Covariate[0] : c.clone(); + ps = new PrintStream(os); + printHeader(); + } + + /** + * Prints the header out. + *

+ * Should only be invoked at creation. + */ + protected void printHeader() { + RecalUtils.printHeader(ps); + } + + /** + * Prints out a report into the csv file. + * + * + * @param report the report to print out. + * @param mode the report associated mode. (typically ORIGINAL, RECALIBRATED + */ + public void print(final RecalibrationReport report, final String mode) { + RecalUtils.writeCSV(ps,report.getRecalibrationTables(),mode,covariates,false); + } + + /** + * Close the csv printer. + * + * No further output will be allowed or take place after calling this method. + */ + public void close() { + ps.close(); + } + + } + + /** + * Returns a csv output printer. + * + * @param out the output file. It will be overridden + * @param c list of covariates to print out. + * + * @throws FileNotFoundException if out could not be created anew. + * + * @return never null + */ + protected static CsvPrinter csvPrinter(final File out, final Covariate ... c) + throws FileNotFoundException + { + if (c == null) { + throw new IllegalArgumentException("the input covariate array cannot be null"); + } + return new CsvPrinter(out,c); + } + + /** + * Prints out a collection of reports into a file in Csv format in a way + * that can be used by R scripts (such as the plot generator script). + *

+ * The set of covariates is take as the minimum common set from all reports. + * + * @param out the output file. It will be overridden. + * @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...) + * of each report and the corresponding value the report itself. + * @throws FileNotFoundException if out could not be created anew. + */ + public static void generateCsv(final File out, final Map reports) + throws FileNotFoundException { + if (reports.size() == 0) { + writeCsv(out, reports, new Covariate[0]); + } else { + final Iterator rit = reports.values().iterator(); + final RecalibrationReport first = rit.next(); + final Covariate[] firstCovariates = first.getRequestedCovariates(); + final Set covariates = new LinkedHashSet<>(); + Utils.addAll(covariates,firstCovariates); + while (rit.hasNext() && covariates.size() > 0) { + final Covariate[] nextCovariates = rit.next().getRequestedCovariates(); + final Set nextCovariateNames = new LinkedHashSet(nextCovariates.length); + for (final Covariate nc : nextCovariates) { + nextCovariateNames.add(nc.getClass().getSimpleName()); + } + final Iterator cit = covariates.iterator(); + while (cit.hasNext()) { + if (!nextCovariateNames.contains(cit.next().getClass().getSimpleName())) { + cit.remove(); + } + } + } + writeCsv(out, reports, covariates.toArray(new Covariate[covariates.size()])); + } + } + + /** + * Print out a collection of reports into a file in Csv format in a way + * that can be used by R scripts (such as the plot generator script). + * + * @param out + * @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...) + * of each report and the corresponding value the report itself. + * @param c the covariates to print out. + * @throws FileNotFoundException if out could not be created anew. 
+ */ + private static void writeCsv(final File out, + final Map reports, final Covariate[] c) + throws FileNotFoundException { + final CsvPrinter p = csvPrinter(out,c); + for (Map.Entry e : reports.entrySet()) { + p.print(e.getValue(),e.getKey()); + } + p.close(); + } + + public enum SOLID_RECAL_MODE { + /** + * Treat reference inserted bases as reference matching bases. Very unsafe! + */ + DO_NOTHING, + /** + * Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. + */ + SET_Q_ZERO, + /** + * In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. + */ + SET_Q_ZERO_BASE_N, + /** + * Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. + */ + REMOVE_REF_BIAS; + + public static SOLID_RECAL_MODE recalModeFromString(String recalMode) { + if (recalMode.equals("DO_NOTHING")) + return SOLID_RECAL_MODE.DO_NOTHING; + if (recalMode.equals("SET_Q_ZERO")) + return SOLID_RECAL_MODE.SET_Q_ZERO; + if (recalMode.equals("SET_Q_ZERO_BASE_N")) + return SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N; + if (recalMode.equals("REMOVE_REF_BIAS")) + return SOLID_RECAL_MODE.REMOVE_REF_BIAS; + + throw new UserException.BadArgumentValue(recalMode, "is not a valid SOLID_RECAL_MODE value"); + } + } + + public enum SOLID_NOCALL_STRATEGY { + /** + * When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. + */ + THROW_EXCEPTION, + /** + * Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. + */ + LEAVE_READ_UNRECALIBRATED, + /** + * Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. 
+ */ + PURGE_READ; + + public static SOLID_NOCALL_STRATEGY nocallStrategyFromString(String nocallStrategy) { + if (nocallStrategy.equals("THROW_EXCEPTION")) + return SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; + if (nocallStrategy.equals("LEAVE_READ_UNRECALIBRATED")) + return SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED; + if (nocallStrategy.equals("PURGE_READ")) + return SOLID_NOCALL_STRATEGY.PURGE_READ; + + throw new UserException.BadArgumentValue(nocallStrategy, "is not a valid SOLID_NOCALL_STRATEGY value"); + } + } + + private static List generateReportTables(final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) { + List result = new LinkedList(); + int reportTableIndex = 0; + int rowIndex = 0; + final Map covariateNameMap = new HashMap(requestedCovariates.length); + for (final Covariate covariate : requestedCovariates) + covariateNameMap.put(covariate, parseCovariateName(covariate)); + + for (int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++) { + + final ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names + columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[0]), "%s")); // save the required covariate name so we can reference it in the future + if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) { + columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[1]), "%s")); // save the required covariate name so we can reference it in the future + if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { + columnNames.add(covariateValue); + columnNames.add(covariateName); + } + } + + columnNames.add(eventType); // the order of these column names is important here + columnNames.add(empiricalQuality); + if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) + columnNames.add(estimatedQReported); // only the read group table needs the estimated Q 
reported + columnNames.add(nObservations); + columnNames.add(nErrors); + + final GATKReportTable reportTable; + if (tableIndex <= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { + if(sortByCols) { + reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.SORT_BY_COLUMN); + } else { + reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.DO_NOT_SORT); + } + for (final Pair columnName : columnNames) + reportTable.addColumn(columnName.getFirst(), columnName.getSecond()); + rowIndex = 0; // reset the row index since we're starting with a new table + } else { + reportTable = result.get(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()); + } + + final NestedIntegerArray table = recalibrationTables.getTable(tableIndex); + for (final NestedIntegerArray.Leaf row : table.getAllLeaves()) { + final RecalDatum datum = (RecalDatum)row.value; + final int[] keys = row.keys; + + int columnIndex = 0; + int keyIndex = 0; + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), requestedCovariates[0].formatKey(keys[keyIndex++])); + if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) { + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), requestedCovariates[1].formatKey(keys[keyIndex++])); + if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { + final Covariate covariate = requestedCovariates[tableIndex]; + + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), covariate.formatKey(keys[keyIndex++])); + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), covariateNameMap.get(covariate)); + } + } + + final EventType event = EventType.eventFrom(keys[keyIndex]); + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), event.toString()); + + 
reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEmpiricalQuality()); + if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getNumObservations()); + reportTable.set(rowIndex, columnNames.get(columnIndex).getFirst(), datum.getNumMismatches()); + + rowIndex++; + } + result.add(reportTable); + } + + return result; + } + + private static String parseCovariateName(final Covariate covariate) { + return covariate.getClass().getSimpleName().split("Covariate")[0]; + } + + public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) { + outputRecalibrationReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols), RAC.RECAL_TABLE); + } + + /** + * Return a human-readable string representing the used covariates + * + * @param requestedCovariates a vector of covariates + * @return a non-null comma-separated string + */ + public static String covariateNames(final Covariate[] requestedCovariates) { + final List names = new ArrayList(requestedCovariates.length); + for ( final Covariate cov : requestedCovariates ) + names.add(cov.getClass().getSimpleName()); + return Utils.join(",", names); + } + + public static void outputRecalibrationReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile, boolean sortByCols) { + 
outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols), outputFile); + } + + private static void outputRecalibrationReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List recalTables, final PrintStream outputFile) { + final GATKReport report = new GATKReport(); + report.addTable(argumentTable); + report.addTable(quantizationTable); + report.addTables(recalTables); + report.print(outputFile); + } + + /** s + * Write recalibration plots into a file + * + * @param csvFile location of the intermediary file + * @param exampleReportFile where the report arguments are collected from. + * @param output result plot file name. + */ + public static void generatePlots(final File csvFile, final File exampleReportFile, final File output) { + final RScriptExecutor executor = new RScriptExecutor(); + executor.setExceptOnError(true); + executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); + executor.addArgs(csvFile.getAbsolutePath()); + executor.addArgs(exampleReportFile.getAbsolutePath()); + executor.addArgs(output.getAbsolutePath()); + Logger.getLogger(RecalUtils.class).debug("R command line: " + executor.getApproximateCommandLine()); + executor.exec(); + } + + private static void outputRecalibrationPlot(final File csvFile, final RecalibrationArgumentCollection RAC) { + + final RScriptExecutor executor = new RScriptExecutor(); + executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); + executor.addArgs(csvFile.getAbsolutePath()); + executor.addArgs(RAC.RECAL_TABLE_FILE.getAbsolutePath()); + executor.exec(); + } + + /** + * Please use {@link #generateCsv(java.io.File, java.util.Map)} and {@link #generatePlots(java.io.File, java.io.File, java.io.File)} instead. 
+ * + * @deprecated + */ + @Deprecated + public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final Covariate[] requestedCovariates) { + generateRecalibrationPlot(RAC, original, null, requestedCovariates); + } + + /** + * Please use {@link #generateCsv(java.io.File, java.util.Map)} and {@link #generatePlots(java.io.File, java.io.File, java.io.File)} instead. + * + * @deprecated + */ + @Deprecated + public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates) { + final PrintStream csvStream; + final File csvTempFile = null; + try { + File csvTmpFile = File.createTempFile("BQSR",".csv"); + csvTmpFile.deleteOnExit(); + csvStream = new PrintStream(csvTmpFile); + } catch (IOException e) { + throw new UserException("Could not create temporary csv file", e); + } + + if ( recalibrated != null ) + writeCSV(csvStream, recalibrated, "RECALIBRATED", requestedCovariates, true); + writeCSV(csvStream, original, "ORIGINAL", requestedCovariates, recalibrated == null); + csvStream.close(); + outputRecalibrationPlot(csvTempFile, RAC); + csvTempFile.delete(); + } + + private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) { + + final NestedIntegerArray deltaTable = createDeltaTable(recalibrationTables, requestedCovariates.length); + + // add the quality score table to the delta table + final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); + for (final NestedIntegerArray.Leaf leaf : qualTable.getAllLeaves()) { // go through every element in the covariates table to create the delta table + final int[] newCovs = new int[4]; + newCovs[0] = leaf.keys[0]; + newCovs[1] = requestedCovariates.length; // 
replace the covariate name with an arbitrary (unused) index for QualityScore + newCovs[2] = leaf.keys[1]; + newCovs[3] = leaf.keys[2]; + addToDeltaTable(deltaTable, newCovs, (RecalDatum)leaf.value); // add this covariate to the delta table + } + + // add the optional covariates to the delta table + for (int i = RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < requestedCovariates.length; i++) { + final NestedIntegerArray covTable = recalibrationTables.getTable(i); + for (final NestedIntegerArray.Leaf leaf : covTable.getAllLeaves()) { + final int[] covs = new int[4]; + covs[0] = leaf.keys[0]; + covs[1] = i; // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) + covs[2] = leaf.keys[2]; + covs[3] = leaf.keys[3]; + addToDeltaTable(deltaTable, covs, (RecalDatum) leaf.value); // add this covariate to the delta table + } + } + + // output the csv file + if (printHeader) { + printHeader(deltaTableFile); + } + + final Map covariateNameMap = new HashMap(requestedCovariates.length); + for (final Covariate covariate : requestedCovariates) + covariateNameMap.put(covariate, parseCovariateName(covariate)); + + // print each data line + for (final NestedIntegerArray.Leaf leaf : deltaTable.getAllLeaves()) { + final List deltaKeys = generateValuesFromKeys(leaf.keys, requestedCovariates, covariateNameMap); + final RecalDatum deltaDatum = leaf.value; + deltaTableFile.print(Utils.join(",", deltaKeys)); + deltaTableFile.print("," + deltaDatum.stringForCSV()); + deltaTableFile.println("," + recalibrationMode); + } + } + + private static void printHeader(PrintStream out) { + final List header = new LinkedList(); + header.add("ReadGroup"); + header.add("CovariateValue"); + header.add("CovariateName"); + header.add("EventType"); + header.add("Observations"); + header.add("Errors"); + header.add("EmpiricalQuality"); + header.add("AverageReportedQuality"); + header.add("Accuracy"); + header.add("Recalibration"); + 
out.println(Utils.join(",", header)); + } + + /* + * Return an initialized nested integer array with appropriate dimensions for use with the delta tables + * + * @param recalibrationTables the recal tables + * @param numCovariates the total number of covariates being used + * @return a non-null nested integer array + */ + @Requires("recalibrationTables != null && numCovariates > 0") + @Ensures("result != null") + private static NestedIntegerArray createDeltaTable(final RecalibrationTables recalibrationTables, final int numCovariates) { + + final int[] dimensionsForDeltaTable = new int[4]; + + // initialize the dimensions with those of the qual table to start with + final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); + final int[] dimensionsOfQualTable = qualTable.getDimensions(); + dimensionsForDeltaTable[0] = dimensionsOfQualTable[0]; // num read groups + dimensionsForDeltaTable[1] = numCovariates + 1; // num covariates + dimensionsForDeltaTable[2] = dimensionsOfQualTable[1]; + dimensionsForDeltaTable[3] = dimensionsOfQualTable[2]; + + // now, update the dimensions based on the optional covariate tables as needed + for ( int i = RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < numCovariates; i++ ) { + final NestedIntegerArray covTable = recalibrationTables.getTable(i); + final int[] dimensionsOfCovTable = covTable.getDimensions(); + dimensionsForDeltaTable[2] = Math.max(dimensionsForDeltaTable[2], dimensionsOfCovTable[2]); + dimensionsForDeltaTable[3] = Math.max(dimensionsForDeltaTable[3], dimensionsOfCovTable[3]); + } + + return new NestedIntegerArray(dimensionsForDeltaTable); + } + + protected static List generateValuesFromKeys(final int[] keys, final Covariate[] covariates, final Map covariateNameMap) { + final List values = new ArrayList(4); + values.add(covariates[RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()].formatKey(keys[0])); + + final int covariateIndex = keys[1]; + final int 
covariateKey = keys[2]; + final Covariate covariate = covariateIndex == covariates.length ? covariates[RecalibrationTables.TableType.QUALITY_SCORE_TABLE.ordinal()] : covariates[covariateIndex]; + values.add(covariate.formatKey(covariateKey)); + values.add(covariateNameMap.get(covariate)); + values.add(EventType.eventFrom(keys[3]).prettyPrint()); + + return values; + } + + /** + * Updates the current RecalDatum element in the delta table. + * + * If it doesn't have an element yet, it creates an RecalDatum element and adds it to the delta table. + * + * @param deltaTable the delta table + * @param deltaKey the key to the table + * @param recalDatum the recal datum to combine with the accuracyDatum element in the table + */ + private static void addToDeltaTable(final NestedIntegerArray deltaTable, final int[] deltaKey, final RecalDatum recalDatum) { + final RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key + if (deltaDatum == null) + // if we don't have a key yet, create a new one with the same values as the current datum + deltaTable.put(new RecalDatum(recalDatum), deltaKey); + else + // if we do have a datum, combine it with this one + deltaDatum.combine(recalDatum); + } + + /** + * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string + * + * @param read The read to adjust + * @param RAC The list of shared command line arguments + */ + public static void parsePlatformForRead(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) { + GATKSAMReadGroupRecord readGroup = read.getReadGroup(); + + if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { + readGroup.setPlatform(RAC.FORCE_PLATFORM); + } + + if (readGroup.getPlatform() == null) { + if (RAC.DEFAULT_PLATFORM != null) { + if (!warnUserNullPlatform) { + 
Utils.warnUser("The input .bam file contains reads with no platform information. " + + "Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " + + "First observed at read with name = " + read.getReadName()); + warnUserNullPlatform = true; + } + readGroup.setPlatform(RAC.DEFAULT_PLATFORM); + } + else { + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName()); + } + } + } + + /** + * Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are + * inconsistent with the color space. If there is a no call in the color space, this method returns false meaning + * this read should be skipped + * + * @param strategy the strategy used for SOLID no calls + * @param read The SAMRecord to parse + * @return true if this read is consistent or false if this read should be skipped + */ + public static boolean isColorSpaceConsistent(final SOLID_NOCALL_STRATEGY strategy, final GATKSAMRecord read) { + if (!ReadUtils.isSOLiDRead(read)) // If this is a SOLID read then we have to check if the color space is inconsistent. 
This is our only sign that SOLID has inserted the reference base + return true; + + // Haven't calculated the inconsistency array yet for this read + if (read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG) == null) { + final Object attr = read.getAttribute(RecalUtils.COLOR_SPACE_ATTRIBUTE_TAG); + if (attr != null) { + byte[] colorSpace; + if (attr instanceof String) + colorSpace = ((String) attr).getBytes(); + else + throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalUtils.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); + + final boolean badColor = hasNoCallInColorSpace(colorSpace); + if (badColor) { + if (strategy == SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) { + return false; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them + } + else if (strategy == SOLID_NOCALL_STRATEGY.PURGE_READ) { + read.setReadFailsVendorQualityCheckFlag(true); + return false; + } + } + + byte[] readBases = read.getReadBases(); // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read + if (read.getReadNegativeStrandFlag()) + readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); + + final byte[] inconsistency = new byte[readBases.length]; + int i; + byte prevBase = colorSpace[0]; // The sentinel + for (i = 0; i < readBases.length; i++) { + final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[i + 1]); + inconsistency[i] = (byte) (thisBase == readBases[i] ? 0 : 1); + prevBase = readBases[i]; + } + read.setAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency); + } + else if (strategy == SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) // if the strategy calls for an exception, throw it + throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. 
First observed at read with name = " + read.getReadName() + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); + + else + return false; // otherwise, just skip the read + } + + return true; + } + + private static boolean hasNoCallInColorSpace(final byte[] colorSpace) { + final int length = colorSpace.length; + for (int i = 1; i < length; i++) { // skip the sentinal + final byte color = colorSpace[i]; + if (color != (byte) '0' && color != (byte) '1' && color != (byte) '2' && color != (byte) '3') { + return true; // There is a bad color in this SOLiD read + } + } + + return false; // There aren't any color no calls in this SOLiD read + } + + /** + * Given the base and the color calculate the next base in the sequence + * + * @param read the read + * @param prevBase The base + * @param color The color + * @return The next base in the sequence + */ + private static byte getNextBaseFromColor(GATKSAMRecord read, final byte prevBase, final byte color) { + switch (color) { + case '0': + return prevBase; + case '1': + return performColorOne(prevBase); + case '2': + return performColorTwo(prevBase); + case '3': + return performColorThree(prevBase); + default: + throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char) color + + " Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias."); + } + } + + /** + * Check if this base is inconsistent with its color space. 
If it is then SOLID inserted the reference here and we should reduce the quality + * + * @param read The read which contains the color space to check against + * @param offset The offset in the read at which to check + * @return Returns true if the base was inconsistent with the color space + */ + public static boolean isColorSpaceConsistent(final GATKSAMRecord read, final int offset) { + final Object attr = read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG); + if (attr != null) { + final byte[] inconsistency = (byte[]) attr; + // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference! + if (read.getReadNegativeStrandFlag()) { // Negative direction + return inconsistency[inconsistency.length - offset - 1] == (byte) 0; + } + else { // Forward direction + return inconsistency[offset] == (byte) 0; + } + + // This block of code is for if you want to check both the offset and the next base for color space inconsistency + //if( read.getReadNegativeStrandFlag() ) { // Negative direction + // if( offset == 0 ) { + // return inconsistency[0] != 0; + // } else { + // return (inconsistency[inconsistency.length - offset - 1] != 0) || (inconsistency[inconsistency.length - offset] != 0); + // } + //} else { // Forward direction + // if( offset == inconsistency.length - 1 ) { + // return inconsistency[inconsistency.length - 1] != 0; + // } else { + // return (inconsistency[offset] != 0) || (inconsistency[offset + 1] != 0); + // } + //} + + } + else { // No inconsistency array, so nothing is inconsistent + return true; + } + } + + /** + * Computes all requested covariates for every offset in the given read + * by calling covariate.getValues(..). + * + * It populates an array of covariate values where result[i][j] is the covariate + * value for the ith position in the read and the jth covariate in + * reqeustedCovariates list. + * + * @param read The read for which to compute covariate values. 
+ * @param requestedCovariates The list of requested covariates. + * @return a matrix with all the covariates calculated for every base in the read + */ + public static ReadCovariates computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates) { + final ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), requestedCovariates.length); + computeCovariates(read, requestedCovariates, readCovariates); + return readCovariates; + } + + /** + * Computes all requested covariates for every offset in the given read + * by calling covariate.getValues(..). + * + * It populates an array of covariate values where result[i][j] is the covariate + * value for the ith position in the read and the jth covariate in + * reqeustedCovariates list. + * + * @param read The read for which to compute covariate values. + * @param requestedCovariates The list of requested covariates. + * @param resultsStorage The object to store the covariate values + */ + public static void computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates, final ReadCovariates resultsStorage) { + // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read + for (int i = 0; i < requestedCovariates.length; i++) { + resultsStorage.setCovariateIndex(i); + requestedCovariates[i].recordValues(read, resultsStorage); + } + } + + /** + * Perform a certain transversion (A <-> C or G <-> T) on the base. + * + * @param base the base [AaCcGgTt] + * @return the transversion of the base, or the input base if it's not one of the understood ones + */ + private static byte performColorOne(byte base) { + switch (base) { + case 'A': + case 'a': + return 'C'; + case 'C': + case 'c': + return 'A'; + case 'G': + case 'g': + return 'T'; + case 'T': + case 't': + return 'G'; + default: + return base; + } + } + + /** + * Perform a transition (A <-> G or C <-> T) on the base. 
+ * + * @param base the base [AaCcGgTt] + * @return the transition of the base, or the input base if it's not one of the understood ones + */ + private static byte performColorTwo(byte base) { + switch (base) { + case 'A': + case 'a': + return 'G'; + case 'C': + case 'c': + return 'T'; + case 'G': + case 'g': + return 'A'; + case 'T': + case 't': + return 'C'; + default: + return base; + } + } + + /** + * Return the complement (A <-> T or C <-> G) of a base. + * + * @param base the base [AaCcGgTt] + * @return the complementary base, or the input base if it's not one of the understood ones + */ + private static byte performColorThree(byte base) { + switch (base) { + case 'A': + case 'a': + return 'T'; + case 'C': + case 'c': + return 'G'; + case 'G': + case 'g': + return 'C'; + case 'T': + case 't': + return 'A'; + default: + return base; + } + } + + /** + * Combines the recalibration data for table1 and table2 into table1 + * + * Note that table1 is the destination, so it is modified + * + * @param table1 the destination table to merge table2 into + * @param table2 the source table to merge into table1 + */ + public static void combineTables(final NestedIntegerArray table1, final NestedIntegerArray table2) { + if ( table1 == null ) throw new IllegalArgumentException("table1 cannot be null"); + if ( table2 == null ) throw new IllegalArgumentException("table2 cannot be null"); + if ( ! Arrays.equals(table1.getDimensions(), table2.getDimensions())) + throw new IllegalArgumentException("Table1 " + Utils.join(",", table1.getDimensions()) + " not equal to " + Utils.join(",", table2.getDimensions())); + + for (final NestedIntegerArray.Leaf row : table2.getAllLeaves()) { + final RecalDatum myDatum = table1.get(row.keys); + + if (myDatum == null) + table1.put(row.value, row.keys); + else + myDatum.combine(row.value); + } + } + + /** + * Increments the RecalDatum at the specified position in the specified table, or put a new item there + * if there isn't already one. 
+ * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error value for this event + * @param keys location in table of our item + */ + public static void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, + final byte qual, + final double isError, + final int... keys ) { + final RecalDatum existingDatum = table.get(keys); + + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! table.put(createDatumObject(qual, isError), keys) ) { + // Failed to put a new item because another thread came along and put an item here first. + // Get the newly-put item and increment it (item is guaranteed to exist at this point) + table.get(keys).increment(1L, isError); + } + } + else { + // Easy case: already an item here, so increment it + existingDatum.increment(1L, isError); + } + } + + /** + * creates a datum object with one observation and one or zero error + * + * @param reportedQual the quality score reported by the instrument for this base + * @param isError whether or not the observation is an error + * @return a new RecalDatum object with the observation and the error + */ + private static RecalDatum createDatumObject(final byte reportedQual, final double isError) { + return new RecalDatum(1, isError, reportedQual); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/Covariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/Covariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/Covariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/Covariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java diff --git 
a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/QualityScoreCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/QualityScoreCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/QualityScoreCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/QualityScoreCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java diff --git 
a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatLengthCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RepeatLengthCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatLengthCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RepeatLengthCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/sam/ClippedGATKSAMRecord.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/sam/ClippedGATKSAMRecord.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/sam/ClippedGATKSAMRecord.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/sam/ClippedGATKSAMRecord.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/WalkerTestIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/WalkerTestIntegrationTest.java new file mode 100644 index 000000000..1e4d6fbf2 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/WalkerTestIntegrationTest.java @@ -0,0 +1,80 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class WalkerTestIntegrationTest extends WalkerTest { + + public void testBadMD5(String md5) { + WalkerTestSpec spec = new WalkerTestSpec("FAIL", Arrays.asList(md5)); + executeTest("", spec); + } + + @Test(expectedExceptions = RuntimeException.class) + public void testNullMD5() { + testBadMD5(null); + } + + @Test(expectedExceptions = RuntimeException.class) + public void testBadLengthMD5() { + testBadMD5("asdfasdfa"); + } + + @Test(expectedExceptions = RuntimeException.class) + public void testSpacesMD5() { + testBadMD5("1de8e943fbf55246ebd19efa32f22a58 "); + } + + @Test(expectedExceptions = RuntimeException.class) + public void testBadCharMD5() { + testBadMD5("1de8e943fbf55246ebd19efa32f22a5_"); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepthUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepthUnitTest.java new file mode 100644 index 000000000..a118a462d --- /dev/null +++ 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepthUnitTest.java @@ -0,0 +1,97 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.variant.variantcontext.*; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +public class QualByDepthUnitTest extends WalkerTest { + + @DataProvider(name = "UsingAD") + public Object[][] makeUsingADData() { + List tests = new ArrayList<>(); + + final Allele A = Allele.create("A", true); + final Allele C = Allele.create("C"); + final Allele G = Allele.create("G"); + + final List AA = Arrays.asList(A,A); + final List AC = Arrays.asList(A,C); + final List GG = Arrays.asList(G,G); + final List ACG = Arrays.asList(A,C,G); + + final Genotype gAC = new GenotypeBuilder("1", AC).DP(10).AD(new int[]{5,5}).make(); + final Genotype gAA = new GenotypeBuilder("2", AA).DP(10).AD(new int[]{10,0}).make(); + final Genotype gACerror = new GenotypeBuilder("3", AC).DP(10).AD(new 
int[]{9,1}).make(); + final Genotype gGG = new GenotypeBuilder("4", GG).DP(10).AD(new int[]{1,9}).make(); + + tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gAC)).make(), 5.0}); + tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gACerror)).make(), 5.0}); + tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gAA, gAC)).make(), 5.0}); + tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gAC, gACerror)).make(), 5.0}); + tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, ACG).log10PError(-5).genotypes(Arrays.asList(gAA, gAC, gACerror, gGG)).make(), 2.5}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "UsingAD") + public void testUsingAD(final VariantContext vc, final double expectedQD) { + final Map annotatedMap = new QualByDepth().annotate(null, null, null, null, vc, null); + Assert.assertNotNull(annotatedMap, vc.toString()); + final String QD = (String)annotatedMap.get("QD"); + Assert.assertEquals(Double.valueOf(QD), expectedQD, 0.0001); + } + +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java new file mode 100644 index 000000000..b1c280748 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java @@ -0,0 +1,151 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.utils.MannWhitneyU; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class RankSumUnitTest { + + List distribution20, distribution30, distribution20_40; + static final int observations = 100; + + @BeforeClass + public void init() { + distribution20 = new ArrayList<>(observations); + distribution30 = new ArrayList<>(observations); + distribution20_40 = new ArrayList<>(observations); + + final int skew = 3; + makeDistribution(distribution20, 20, skew, observations); + makeDistribution(distribution30, 30, skew, observations); + makeDistribution(distribution20_40, 20, skew, observations/2); + makeDistribution(distribution20_40, 40, skew, observations/2); + + // shuffle the observations + Collections.shuffle(distribution20); + Collections.shuffle(distribution30); + Collections.shuffle(distribution20_40); + } + + private static void makeDistribution(final List result, final int target, final int skew, final int numObservations) { + final int rangeStart = target - skew; 
+ final int rangeEnd = target + skew; + + int current = rangeStart; + for ( int i = 0; i < numObservations; i++ ) { + result.add(current++); + if ( current > rangeEnd ) + current = rangeStart; + } + } + + @DataProvider(name = "DistributionData") + public Object[][] makeDistributionData() { + List tests = new ArrayList(); + + for ( final int numToReduce : Arrays.asList(0, 10, 50, 100) ) { + tests.add(new Object[]{distribution20, distribution20, numToReduce, true, "20-20"}); + tests.add(new Object[]{distribution30, distribution30, numToReduce, true, "30-30"}); + tests.add(new Object[]{distribution20_40, distribution20_40, numToReduce, true, "20/40-20/40"}); + + tests.add(new Object[]{distribution20, distribution30, numToReduce, false, "20-30"}); + tests.add(new Object[]{distribution30, distribution20, numToReduce, false, "30-20"}); + + tests.add(new Object[]{distribution20, distribution20_40, numToReduce, false, "20-20/40"}); + tests.add(new Object[]{distribution30, distribution20_40, numToReduce, true, "30-20/40"}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "DistributionData") + public void testDistribution(final List distribution1, final List distribution2, final int numToReduceIn2, final boolean distributionsShouldBeEqual, final String debugString) { + final MannWhitneyU mannWhitneyU = new MannWhitneyU(true); + + for ( final Integer num : distribution1 ) + mannWhitneyU.add(num, MannWhitneyU.USet.SET1); + + final List dist2 = new ArrayList<>(distribution2); + if ( numToReduceIn2 > 0 ) { + int counts = 0; + int quals = 0; + + for ( int i = 0; i < numToReduceIn2; i++ ) { + counts++; + quals += dist2.remove(0); + } + + final int qual = quals / counts; + for ( int i = 0; i < numToReduceIn2; i++ ) + dist2.add(qual); + } + + for ( final Integer num : dist2 ) + mannWhitneyU.add(num, MannWhitneyU.USet.SET2); + + final Double result = mannWhitneyU.runTwoSidedTest().second; + Assert.assertFalse(Double.isNaN(result)); + + if 
( distributionsShouldBeEqual ) { + // TODO -- THIS IS THE FAILURE POINT OF USING REDUCED READS WITH RANK SUM TESTS + if ( numToReduceIn2 >= observations / 2 ) + return; + Assert.assertTrue(result > 0.1, String.format("%f %d %d", result, numToReduceIn2, dist2.get(0))); + } else { + Assert.assertTrue(result < 0.01, String.format("%f %d %d", result, numToReduceIn2, dist2.get(0))); + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java new file mode 100644 index 000000000..0ed7eb2e8 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -0,0 +1,394 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broad.tribble.readers.LineIterator; +import org.broad.tribble.readers.PositionalBufferedStream; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Arrays; + +public class VariantAnnotatorIntegrationTest extends WalkerTest { + + final static String REF = b37KGReference; + final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + + public static String baseTestString() { + return "-T VariantAnnotator -R " + b36KGReference + " --no_cmdline_in_header -o %s"; + } + + @Test + public void testHasAnnotsNotAsking1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + Arrays.asList("360610e4990860bb5c45249b8ac31e5b")); + executeTest("test file has annotations, not asking for 
annotations, #1", spec); + } + + @Test + public void testHasAnnotsNotAsking2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, + Arrays.asList("d69a3c92a0e8f44e09e7377e3eaed4e8")); + executeTest("test file has annotations, not asking for annotations, #2", spec); + } + + @Test + public void testHasAnnotsAsking1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + Arrays.asList("ff21ad7bb0d6bcabcee6b95d975570fc")); + executeTest("test file has annotations, asking for annotations, #1", spec); + } + + @Test + public void testHasAnnotsAsking2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, + Arrays.asList("cb463a56d0b5bc66940f844e56265c14")); + executeTest("test file has annotations, asking for annotations, #2", spec); + } + + @Test + public void testNoAnnotsNotAsking1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + Arrays.asList("540a9be8a8cb85b0f675fea1184bf78c")); + executeTest("test file doesn't have annotations, not asking for annotations, #1", spec); + } + + @Test + public void testNoAnnotsNotAsking2() { + // the genotype annotations in this file are actually out of order. If you don't parse the genotypes + // they don't get reordered. It's a good test of the genotype ordering system. 
+ WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, + Arrays.asList("f900e65b65ff0f9d9eb0891ef9b28c73")); + executeTest("test file doesn't have annotations, not asking for annotations, #2", spec); + } + + @Test + public void testNoAnnotsAsking1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + Arrays.asList("d57ca04b4ceb2f25b31bc0cbd88bca6b")); + executeTest("test file doesn't have annotations, asking for annotations, #1", spec); + } + + @Test + public void testNoAnnotsAsking2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, + Arrays.asList("9cc0cf19070d951b1979e069552810f1")); + executeTest("test file doesn't have annotations, asking for annotations, #2", spec); + } + + @Test + public void testExcludeAnnotations() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + Arrays.asList("552c2ad9dbfaa85d51d2def93c8229c6")); + executeTest("test exclude annotations", spec); + } + + @Test + public void testOverwritingHeader() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, + Arrays.asList("0ed4c7760f6e7a158b6d743d257300f3")); + executeTest("test 
overwriting header", spec); + } + + @Test + public void testNoReads() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, + Arrays.asList("1c423b7730b9805e7b885ece924286e0")); + executeTest("not passing it any reads", spec); + } + + @Test + public void testDBTagWithDbsnp() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, + Arrays.asList("54d7d5bb9404652857adf5e50d995f30")); + executeTest("getting DB tag with dbSNP", spec); + } + + @Test + public void testMultipleIdsWithDbsnp() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3withIDs.vcf -L " + privateTestDir + "vcfexample3withIDs.vcf", 1, + Arrays.asList("5fe63e511061ed4f91d938e72e7e3c39")); + executeTest("adding multiple IDs with dbSNP", spec); + } + + @Test + public void testDBTagWithHapMap() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, + Arrays.asList("cc7184263975595a6e2473d153227146")); + executeTest("getting DB tag with HM3", spec); + } + + @Test + public void testDBTagWithTwoComps() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf --comp:foo " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, + Arrays.asList("6afbf05090ae139f53467cf6e0e71cf4")); + executeTest("getting DB tag with 2 comps", spec); + } + + @Test + public void testNoQuals() { + 
WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --variant " + privateTestDir + "noQual.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L " + privateTestDir + "noQual.vcf -A QualByDepth", 1, + Arrays.asList("aea983adc01cd059193538cc30adc17d")); + executeTest("test file doesn't have QUALs", spec); + } + + @Test + public void testUsingExpression() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty.vcf", 1, + Arrays.asList("2b0e8cdfd691779befc5ac123d1a1887")); + executeTest("using expression", spec); + } + + @Test + public void testUsingExpressionWithID() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.ID -L " + privateTestDir + "vcfexample3empty.vcf", 1, + Arrays.asList("3de1d1998203518098ffae233f3e2352")); + executeTest("using expression with ID", spec); + } + + @Test + public void testTabixAnnotationsAndParallelism() { + final String MD5 = "99938d1e197b8f10c408cac490a00a62"; + for ( String file : Arrays.asList("CEU.exon.2010_03.sites.vcf", "CEU.exon.2010_03.sites.vcf.gz")) { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -A HomopolymerRun --variant:vcf " + validationDataLocation + file + " -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf --no_cmdline_in_header", 1, + Arrays.asList(MD5)); + executeTest("Testing lookup vcf tabix vs. 
vcf tribble", spec); + } + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -A HomopolymerRun -nt 2 --variant:vcf " + validationDataLocation + "CEU.exon.2010_03.sites.vcf -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf --no_cmdline_in_header", 1, + Arrays.asList(MD5)); + + executeTest("Testing lookup vcf tabix vs. vcf tribble plus parallelism", spec); + } + + @Test + public void testSnpEffAnnotations() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + hg19Reference + " --no_cmdline_in_header -o %s -A SnpEff --variant " + + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + + "snpEff2.0.5.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", + 1, + Arrays.asList("d9291845ce5a8576898d293a829a05b7") + ); + executeTest("Testing SnpEff annotations", spec); + } + + @Test + public void testSnpEffAnnotationsUnsupportedVersionGATKMode() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + b37KGReference + " --no_cmdline_in_header -o %s -A SnpEff " + + "--variant " + privateTestDir + "vcf4.1.example.vcf " + + "--snpEffFile " + privateTestDir + "snpEff_unsupported_version_gatk_mode.vcf " + + "-L 1:10001292-10012424", + 1, + Arrays.asList("7352cf23a4d45d3d2bb34ab44a4100ae") + ); + executeTest("Testing SnpEff annotations (unsupported version, GATK mode)", spec); + } + + @Test + public void testSnpEffAnnotationsUnsupportedVersionNoGATKMode() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + b37KGReference + " --no_cmdline_in_header -o %s -A SnpEff " + + "--variant " + privateTestDir + "vcf4.1.example.vcf " + + "--snpEffFile " + privateTestDir + "snpEff_unsupported_version_no_gatk_mode.vcf " + + "-L 1:10001292-10012424", + 1, + UserException.class + ); + executeTest("Testing SnpEff annotations (unsupported version, no GATK mode)", spec); + } + + @Test(enabled = true) + public void testTDTAnnotation() { + final String 
MD5 = "427dfdc665359b67eff210f909ebf8a2"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + b37KGReference + " -A TransmissionDisequilibriumTest --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + + " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, + Arrays.asList(MD5)); + executeTest("Testing TDT annotation ", spec); + } + + + @Test(enabled = true) + public void testChromosomeCountsPed() { + final String MD5 = "6b5cbedf4a8b3385edf128d81c8a46f2"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + b37KGReference + " -A ChromosomeCounts --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + + " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, + Arrays.asList(MD5)); + executeTest("Testing ChromosomeCounts annotation with PED file", spec); + } + + @Test(enabled = true) + public void testInbreedingCoeffPed() { + final String MD5 = "159a771c1deaeffb786097e106943893"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + b37KGReference + " -A InbreedingCoeff --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + + " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, + Arrays.asList(MD5)); + executeTest("Testing InbreedingCoeff annotation with PED file", spec); + } + + @Test + public void testStrandBiasBySample() throws IOException { + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800"; + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); + final File outputVCF = 
executeTest("testStrandBiasBySample", spec).getFirst().get(0); + + final String baseNoFS = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -XA FisherStrand -A StrandBiasBySample"; + final WalkerTestSpec specNoFS = new WalkerTestSpec(baseNoFS, 1, Arrays.asList("")); + specNoFS.disableShadowBCF(); + final File outputVCFNoFS = executeTest("testStrandBiasBySample component stand bias annotation", specNoFS).getFirst().get(0); + + final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoFS.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A FisherStrand"; + final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("")); + specAnn.disableShadowBCF(); + final File outputVCFAnn = executeTest("testStrandBiasBySample re-annotation of FisherStrand", specAnn).getFirst().get(0); + + // confirm that the FisherStrand values are identical for the two pipelines + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(outputVCF); + final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIterator); + + final VCFCodec codecAnn = new VCFCodec(); + final FileInputStream sAnn = new FileInputStream(outputVCFAnn); + final LineIterator lineIteratorAnn = codecAnn.makeSourceFromStream(new PositionalBufferedStream(sAnn)); + codecAnn.readHeader(lineIteratorAnn); + + while( lineIterator.hasNext() && lineIteratorAnn.hasNext() ) { + final String line = lineIterator.next(); + Assert.assertFalse(line == null); + final VariantContext vc = codec.decode(line); + + final String lineAnn = lineIteratorAnn.next(); + Assert.assertFalse(lineAnn == null); + final VariantContext vcAnn = codecAnn.decode(lineAnn); + + Assert.assertTrue(vc.hasAttribute("FS")); + Assert.assertTrue(vcAnn.hasAttribute("FS")); + 
Assert.assertEquals(vc.getAttributeAsDouble("FS", 0.0), vcAnn.getAttributeAsDouble("FS", -1.0)); + } + + Assert.assertFalse(lineIterator.hasNext()); + Assert.assertFalse(lineIteratorAnn.hasNext()); + } + + @Test + public void testQualByDepth() throws IOException { + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800"; + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); + final File outputVCF = executeTest("testQualByDepth", spec).getFirst().get(0); + + final String baseNoQD = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -XA QualByDepth"; + final WalkerTestSpec specNoQD = new WalkerTestSpec(baseNoQD, 1, Arrays.asList("")); + specNoQD.disableShadowBCF(); + final File outputVCFNoQD = executeTest("testQualByDepth calling without QD", specNoQD).getFirst().get(0); + + final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoQD.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A QualByDepth"; + final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("388200d107fb47326df78a971a52698f")); + specAnn.disableShadowBCF(); + final File outputVCFAnn = executeTest("testQualByDepth re-annotation of QD", specAnn).getFirst().get(0); + + // confirm that the QD values are present in the new file for all biallelic variants + // QD values won't be identical because some filtered reads are missing during re-annotation + + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(outputVCF); + final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIterator); + + final VCFCodec codecAnn = new VCFCodec(); + final FileInputStream sAnn = new FileInputStream(outputVCFAnn); + final LineIterator 
lineIteratorAnn = codecAnn.makeSourceFromStream(new PositionalBufferedStream(sAnn)); + codecAnn.readHeader(lineIteratorAnn); + + while( lineIterator.hasNext() && lineIteratorAnn.hasNext() ) { + final String line = lineIterator.next(); + Assert.assertFalse(line == null); + final VariantContext vc = codec.decode(line); + + final String lineAnn = lineIteratorAnn.next(); + Assert.assertFalse(lineAnn == null); + final VariantContext vcAnn = codecAnn.decode(lineAnn); + + Assert.assertTrue(vc.hasAttribute("QD")); + Assert.assertTrue(vcAnn.hasAttribute("QD")); + } + + Assert.assertFalse(lineIterator.hasNext()); + Assert.assertFalse(lineIteratorAnn.hasNext()); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotatorUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotatorUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotatorUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotatorUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariatesIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariatesIntegrationTest.java similarity index 100% rename from 
protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariatesIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariatesIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java new file mode 100644 index 000000000..39cf719dd --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java @@ -0,0 +1,136 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import net.sf.samtools.SAMUtils; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.recalibration.EventType; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.EnumMap; +import java.util.List; + +public final class ReadRecalibrationInfoUnitTest extends BaseTest { + @BeforeMethod + public void init() { + ReadCovariates.clearKeysCache(); + } + + @DataProvider(name = "InfoProvider") + public Object[][] createCombineTablesProvider() { + List tests = new ArrayList(); + + for ( final int readLength: Arrays.asList(10, 100, 1000) ) { + for ( final boolean includeIndelErrors : Arrays.asList(true, false) ) { + tests.add(new Object[]{readLength, includeIndelErrors}); + } + } + + return tests.toArray(new Object[][]{}); + } + @Test(dataProvider = "InfoProvider") + public void testReadInfo(final int readLength, final boolean includeIndelErrors) 
{ + final ReadCovariates covariates = new ReadCovariates(readLength, 2); + + final byte[] bases = new byte[readLength]; + final byte[] baseQuals = new byte[readLength]; + final byte[] insertionQuals = new byte[readLength]; + final byte[] deletionQuals = new byte[readLength]; + final boolean[] skips = new boolean[readLength]; + final double[] snpErrors = new double[readLength]; + final double[] insertionErrors = new double[readLength]; + final double[] deletionsErrors = new double[readLength]; + for ( int i = 0; i < readLength; i++ ) { + bases[i] = 'A'; + baseQuals[i] = (byte)(i % SAMUtils.MAX_PHRED_SCORE); + insertionQuals[i] = (byte)((i+1) % SAMUtils.MAX_PHRED_SCORE); + deletionQuals[i] = (byte)((i+2) % SAMUtils.MAX_PHRED_SCORE); + skips[i] = i % 2 == 0; + snpErrors[i] = 1.0 / (i+1); + insertionErrors[i] = 0.5 / (i+1); + deletionsErrors[i] = 0.3 / (i+1); + } + + final EnumMap errors = new EnumMap(EventType.class); + errors.put(EventType.BASE_SUBSTITUTION, snpErrors); + errors.put(EventType.BASE_INSERTION, insertionErrors); + errors.put(EventType.BASE_DELETION, deletionsErrors); + + final EnumMap quals = new EnumMap(EventType.class); + quals.put(EventType.BASE_SUBSTITUTION, baseQuals); + quals.put(EventType.BASE_INSERTION, insertionQuals); + quals.put(EventType.BASE_DELETION, deletionQuals); + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, baseQuals, readLength + "M"); + if ( includeIndelErrors ) { + read.setBaseQualities(insertionQuals, EventType.BASE_INSERTION); + read.setBaseQualities(deletionQuals, EventType.BASE_DELETION); + } + + final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skips, snpErrors, insertionErrors, deletionsErrors); + + Assert.assertEquals(info.getCovariatesValues(), covariates); + Assert.assertEquals(info.getRead(), read); + + for ( int i = 0; i < readLength; i++ ) { + Assert.assertEquals(info.skip(i), skips[i]); + for ( final EventType et : EventType.values() ) { + 
Assert.assertEquals(info.getErrorFraction(et, i), errors.get(et)[i]); + final byte expectedQual = et == EventType.BASE_SUBSTITUTION || includeIndelErrors ? quals.get(et)[i]: GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL; + Assert.assertEquals(info.getQual(et, i), expectedQual); + } + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistributionIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistributionIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistributionIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistributionIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngineUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngineUnitTest.java similarity index 100% rename from 
protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngineUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngineUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNodeUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNodeUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNodeUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNodeUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DifferenceUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DifferenceUnitTest.java similarity index 100% rename from 
protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DifferenceUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DifferenceUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java new file mode 100644 index 000000000..8f71c35be --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -0,0 +1,84 @@ +/* +* By 
downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.LSV_ALLELES; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: 4/5/12 + * Time: 11:28 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTest { + + private final UnifiedGenotyperGeneralPloidyTestExecutor executor = new UnifiedGenotyperGeneralPloidyTestExecutor(); + + @Test(enabled = true) + public void testSNP_ACS_Pools() { + executor.PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES", "LSV_SNP_ACS", "SNP", "df0e67c975ef74d593f1c704daab1705"); + } + + @Test(enabled = true) + public void testBOTH_GGA_Pools() { + executor.PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "c2932cc77611f13cc8a14e87d055a8f8"); + } + + @Test(enabled = true) + public void testINDEL_GGA_Pools() { + executor.PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "a0648992f049ed59fab0ef753d2d0c03"); + } + + @Test(enabled = true) + public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "fcfe18bd4c6087b21959d3c31ec177da"); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java new file mode 100644 index 000000000..e16ca154f --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java @@ -0,0 +1,73 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.CEUTRIO_BAM; +import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.NA12891_CALLS; + +public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTest { + + private final UnifiedGenotyperGeneralPloidyTestExecutor executor = new UnifiedGenotyperGeneralPloidyTestExecutor(); + + @Test(enabled = true) + public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","ef7a6ee4ec7e20e5ce28fc50d3362d3d"); + } + + @Test(enabled = true) + public void testMT_SNP_DISCOVERY_sp4() { + executor.PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","fc75733fcdd8079e7f7743961a1f36be"); + } + + @Test(enabled = true) + public void testMT_SNP_GGA_sp10() { + executor.PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "86cdfc291f995036658bfc10773db107"); 
+ } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyTestExecutor.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyTestExecutor.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyTestExecutor.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyTestExecutor.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java new file mode 100644 index 000000000..8b8c82ea6 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -0,0 +1,207 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { + + private final static String baseCommandIndels = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; + + // -------------------------------------------------------------------------------------------------------------- + // + // testing indel caller + // + // -------------------------------------------------------------------------------------------------------------- + // Basic indel testing with SLX data + @Test + public void testSimpleIndels() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndels + + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam" + + " -o %s" + + " -L 1:10,000,000-10,500,000", + 1, + Arrays.asList("bb8c1b2e9343c79133d8efb51ec2192e")); + executeTest(String.format("test indel caller in SLX"), spec); + } + + // Basic indel testing with SLX data + @Test + public void testIndelsWithLowMinAlleleCnt() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndels + + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam" + + " -o %s" + + " -minIndelCnt 1" + + " -L 1:10,000,000-10,100,000", + 1, + Arrays.asList("9b4ead3da021763704fcb9d80a5ee6ff")); + + executeTest(String.format("test indel caller in SLX with low min allele count"), spec); + } + + @Test + 
public void testMultiTechnologyIndels() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndels + + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + + " -o %s" + + " -L 1:10,000,000-10,500,000", + 1, + Arrays.asList("f5e5148cac1526136f9f2559fe3b49fa")); + + executeTest(String.format("test indel calling, multiple technologies"), spec); + } + + @Test + public void testWithIndelAllelesPassedIn1() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, + Arrays.asList("209db887bfe1aac8bd62544aa8afa2b5")); + executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); + } + + @Test + public void testWithIndelAllelesPassedIn2() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, + Arrays.asList("83b32ea956809654590abd5e0c029d4d")); + executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); + } + + @Test(timeOut = 20*1000*60) // this guy can take a long time because it's two steps, so give it 12 minutes + public void testMultiSampleIndels1() { + // since we're going to test the MD5s with GGA only do one here + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, + Arrays.asList("")); + List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); + + WalkerTest.WalkerTestSpec spec2 = new 
WalkerTest.WalkerTestSpec( + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, + Arrays.asList("25815c1968450ddd009b983d65809c50")); + executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); + } + + @Test + public void testGGAwithNoEvidenceInReads() { + final String vcf = "small.indel.test.vcf"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + privateTestDir + vcf + " -I " + validationDataLocation + + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1, + Arrays.asList("d76eacc4021b78ccc0a9026162e814a7")); + executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec); + } + + @Test + public void testBaseIndelQualityScores() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndelsb37 + + " -I " + privateTestDir + "NA12878.100kb.BQSRv2.example.bam" + + " -o %s" + + " -L 20:10,000,000-10,100,000", + 1, + Arrays.asList("8a7966e4b67334bca6083670c5a16b67")); + + executeTest(String.format("test UG with base indel quality scores"), spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing MinIndelFraction + // + // -------------------------------------------------------------------------------------------------------------- + + final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation + + "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030"; + + @Test + public void testMinIndelFraction0() { + WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.0", 1, + Arrays.asList("af0b881d0a931f0789706f0289b72a64")); + executeTest("test minIndelFraction 0.0", spec); + } + + @Test + public void testMinIndelFraction25() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.25", 1, + Arrays.asList("aa97a7941a861d57a3b746b3f6301eb6")); + executeTest("test minIndelFraction 0.25", spec); + } + + @Test + public void testMinIndelFraction100() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 1", 1, + Arrays.asList("3f07efb768e08650a7ce333edd4f9a52")); + executeTest("test minIndelFraction 1.0", spec); + } + + // No testing of MD5 here, we previously blew up due to a 0 length haplotypes, so we just need to pass + @Test + public void testHaplotype0Length() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper --disableDithering -I " + privateTestDir + "haplotype0.bam -L 20:47507681 -R " + b37KGReference + " -baq CALCULATE_AS_NECESSARY -glm BOTH -o /dev/null", + 0, + Collections.emptyList()); + executeTest("testHaplotype0Length", spec); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java new file mode 100644 index 000000000..ecfda9d8a --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -0,0 +1,385 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import net.sf.samtools.util.BlockCompressedInputStream; +import org.broad.tribble.readers.AsciiLineReader; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +// ********************************************************************************** // +// Note that this class also serves as an integration test for the VariantAnnotator! 
// +// ********************************************************************************** // + +public class UnifiedGenotyperIntegrationTest extends WalkerTest { + + private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam"; + + // -------------------------------------------------------------------------------------------------------------- + // + // testing parameters + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void testMinBaseQualityScore() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, + Arrays.asList("30be17df00acc8a92223f51fe7c1bdf7")); + executeTest("test min_base_quality_score 26", spec); + } + + @Test + public void testSLOD() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("bc8a4e4ceb46776169b47146805c882a")); + executeTest("test SLOD", spec); + } + + @Test + public void testNDA() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + 
Arrays.asList("17f65eca1e6c1f06919a58f230b6d8d3")); + executeTest("test NDA", spec); + } + + @Test + public void testCompTrack() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("21185d9a7519356ba672757f5a522971")); + executeTest("test using comp track", spec); + } + + @Test(enabled = false) // EB: for some reason this test crashes whenever I run it on my local machine + public void testNoCmdLineHeaderStdout() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandNoCmdLineHeaderStdout + " -glm INDEL -L 1:67,225,396-67,288,518", 0, + Collections.emptyList()); + executeTest("testNoCmdLineHeaderStdout", spec); + } + + @Test + public void testOutputParameterSitesOnly() { + testOutputParameters("-sites_only", "48cd40d3994911a6f2609bfd375e1d2d"); + } + + @Test + public void testOutputParameterAllConfident() { + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "28f40ce47651f504158fc4e5bb58df4b"); + } + + @Test + public void testOutputParameterAllSites() { + testOutputParameters("--output_mode EMIT_ALL_SITES", "5259dafaa1b57d9489003b16a48e35f8"); + } + + private void testOutputParameters(final String args, final String md5) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 " + args, 1, + Arrays.asList(md5)); + executeTest(String.format("testParameter[%s]", args), spec); + } + + @Test + public void testConfidence() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 
", 1, + Arrays.asList("918109938ef355d759dafc3ebb47d8a5")); + executeTest("test confidence 1", spec1); + } + + @Test + public void testNoPrior() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.33333 -inputPrior 0.33333", 1, + Arrays.asList("9ee4f1ee1827a6726bfac1220a6a7c40")); + executeTest("test no prior 1", spec1); + + } + @Test + public void testUserPrior() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.001 -inputPrior 0.495", 1, + Arrays.asList("04d05900849d5a3f6f3f98bd0f262369")); + executeTest("test user prior 1", spec1); + + } + + @Test + public void emitPLsAtAllSites() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --output_mode EMIT_ALL_SITES -allSitePLs", 1, + Arrays.asList("85dee5da72c4154e130527c4e6329c07")); + // GDA: TODO: BCF encoder/decoder doesn't seem to support non-standard values in genotype fields. 
IE even if there is a field defined in FORMAT and in the header the BCF2 encoder will still fail + spec1.disableShadowBCF(); + + executeTest("test all site PLs 1", spec1); + + } + // -------------------------------------------------------------------------------------------------------------- + // + // testing heterozygosity + // + // -------------------------------------------------------------------------------------------------------------- + @Test + public void testHeterozyosity1() { + testHeterozosity( 0.01, "6053106407e09a6aefb78395a0e22ec4" ); + } + + @Test + public void testHeterozyosity2() { + testHeterozosity( 1.0 / 1850, "37666375278259c4d7dc800a0f73c1ca" ); + } + + private void testHeterozosity(final double arg, final String md5) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000 --heterozygosity " + arg, 1, + Arrays.asList(md5)); + executeTest(String.format("test heterozyosity[%s]", arg), spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing compressed output + // + // -------------------------------------------------------------------------------------------------------------- + + private final static String COMPRESSED_OUTPUT_MD5 = "c5c6af421cffa12fe6bdaced6cd41dd2"; + + @Test + public void testCompressedOutput() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, + Arrays.asList("gz"), Arrays.asList(COMPRESSED_OUTPUT_MD5)); + executeTest("test compressed output", spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing parallelization + // + // 
-------------------------------------------------------------------------------------------------------------- + + @Test + public void testParallelization() { + + // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations + + String md5 = "1f3fad09a63269c36e871e7ee04ebfaa"; + final String myCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, + Arrays.asList(md5)); + executeTest("test parallelization (single thread)", spec1); + + GenomeAnalysisEngine.resetRandomGenerator(); + + WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( + myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 2", 1, + Arrays.asList(md5)); + executeTest("test parallelization (2 threads)", spec2); + + GenomeAnalysisEngine.resetRandomGenerator(); + + WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( + myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 4", 1, + Arrays.asList(md5)); + executeTest("test parallelization (4 threads)", spec3); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing calls with SLX, 454, and SOLID data + // + // -------------------------------------------------------------------------------------------------------------- + @Test + public void testMultiTechnologies() { + WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( + baseCommand + + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + + " -o %s" + + " -L 1:10,000,000-10,100,000", + 1, + Arrays.asList("630d1dcfb7650a9287d6723c38b0746a")); + + executeTest(String.format("test multiple technologies"), spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing calls with BAQ + // + // -------------------------------------------------------------------------------------------------------------- + @Test + public void testCallingWithBAQ() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + + " -o %s" + + " -L 1:10,000,000-10,100,000" + + " -baq CALCULATE_AS_NECESSARY", + 1, + Arrays.asList("976e88e4accb4436ad9ac97df9477648")); + + executeTest(String.format("test calling with BAQ"), spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing SnpEff + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void testSnpEffAnnotationRequestedWithoutRodBinding() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000 " + + "-A SnpEff", + 1, + UserException.class); + executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing Ns in CIGAR + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void testNsInCigar() { + final WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1, + UserException.UnsupportedCigarOperatorException.class); + + executeTest("test calling on reads with Ns in CIGAR", spec); + } + + @Test(enabled = true) + public void testCompressedVCFOutputWithNT() throws Exception { + WalkerTestSpec spec = new WalkerTestSpec("-T UnifiedGenotyper -R " + b37KGReference + " -I " + + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam" + + " -o %s -L 20:10,000,000-10,100,000 -nt 4", + 1, Arrays.asList("vcf.gz"), Arrays.asList("")); + final File vcf = executeTest("testCompressedVCFOutputWithNT", spec).first.get(0); + final AsciiLineReader reader = new AsciiLineReader(new BlockCompressedInputStream(vcf)); + int nLines = 0; + while ( reader.readLine() != null ) + nLines++; + Assert.assertTrue(nLines > 0); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing only emit samples + // + // -------------------------------------------------------------------------------------------------------------- + + @Test(enabled = true) + public void testOnlyEmitSample() throws Exception { + final String base = "-T UnifiedGenotyper -R " + b37KGReference + " -I " + + privateTestDir + "AFR.complex.variants.bam --disableDithering" + + " -o %s -L 20:10,000,000-10,100,000"; + final WalkerTestSpec specAllSamples = new WalkerTestSpec(base, 1, Arrays.asList("")); + specAllSamples.disableShadowBCF(); + final File allSamplesVCF = executeTest("testOnlyEmitSampleAllSamples", specAllSamples).first.get(0); + final List allSampleVCs = GATKVCFUtils.readVCF(allSamplesVCF).getSecond(); + + final WalkerTestSpec onlyHG01879 = new WalkerTestSpec(base + " -onlyEmitSamples HG01879", 1, Arrays.asList("")); + onlyHG01879.disableShadowBCF(); + final File onlyHG01879VCF = 
executeTest("testOnlyEmitSample", onlyHG01879).first.get(0); + final List onlyHG01879VCs = GATKVCFUtils.readVCF(onlyHG01879VCF).getSecond(); + + Assert.assertEquals(allSampleVCs.size(), onlyHG01879VCs.size()); + for ( int i = 0; i < allSampleVCs.size(); i++ ) { + final VariantContext allSampleVC = allSampleVCs.get(i); + final VariantContext onlyHG01879VC = onlyHG01879VCs.get(i); + + if ( allSampleVC == null ) { + Assert.assertNull(onlyHG01879VC); + } else { + Assert.assertNotNull(onlyHG01879VC); + + Assert.assertTrue(allSampleVC.getGenotypes().size() > 1, "All samples should have had more than 1 genotype, but didn't"); + Assert.assertEquals(onlyHG01879VC.getGenotypes().size(), 1, "Should have found a single sample genotype, but didn't"); + Assert.assertEquals(onlyHG01879VC.hasGenotype("HG01879"), true); + + Assert.assertEquals(allSampleVC.getStart(), onlyHG01879VC.getStart()); + Assert.assertEquals(allSampleVC.getChr(), onlyHG01879VC.getChr()); + Assert.assertEquals(allSampleVC.getEnd(), onlyHG01879VC.getEnd()); + Assert.assertEquals(allSampleVC.getFilters(), onlyHG01879VC.getFilters()); + Assert.assertEquals(allSampleVC.getAlleles(), onlyHG01879VC.getAlleles()); + Assert.assertEquals(allSampleVC.getAttributes(), onlyHG01879VC.getAttributes()); + Assert.assertEquals(allSampleVC.getPhredScaledQual(), onlyHG01879VC.getPhredScaledQual()); + + final Genotype allG = allSampleVC.getGenotype("HG01879"); + final Genotype onlyG = onlyHG01879VC.getGenotype("HG01879"); + Assert.assertEquals(allG.getAD(), onlyG.getAD()); + Assert.assertEquals(allG.getDP(), onlyG.getDP()); + Assert.assertEquals(allG.getAlleles(), onlyG.getAlleles()); + Assert.assertEquals(allG.getPL(), onlyG.getPL()); + Assert.assertEquals(allG.toString(), onlyG.toString()); + } + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java new file mode 100644 index 000000000..29b93e427 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -0,0 +1,126 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ + + private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + + // -------------------------------------------------------------------------------------------------------------- + // + // testing normal calling + // + // -------------------------------------------------------------------------------------------------------------- + @Test + public void testMultiSamplePilot1() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, + Arrays.asList("03ff28802a2e06e0a623d9a5df66d237")); + executeTest("test MultiSample Pilot1", spec); + } + + @Test + public void testWithAllelesPassedIn1() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, + Arrays.asList("85d0e5c086dc642d55124f0e88e7326b")); + executeTest("test MultiSample Pilot2 with alleles passed in", spec1); + } + + @Test + public void testWithAllelesPassedIn2() { + WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( + baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, + Arrays.asList("11783a280df9bf621840c300edd0401a")); + 
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); + } + + @Test + public void testSingleSamplePilot2() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, + Arrays.asList("75503fce7521378f8c2170094aff29df")); + executeTest("test SingleSample Pilot2", spec); + } + + @Test + public void testMultipleSNPAlleles() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, + Arrays.asList("eac8b071bd2fa89889d51de8be84624a")); + executeTest("test Multiple SNP alleles", spec); + } + + @Test + public void testBadRead() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, + Arrays.asList("d915535c1458733f09f82670092fcab6")); + executeTest("test bad read", spec); + } + + @Test + public void testReverseTrim() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, + Arrays.asList("7f912aa5166f6ed16166daac1e5c0935")); + executeTest("test reverse trim", spec); + } + + @Test + public void testMismatchedPLs() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper 
--contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, + Arrays.asList("ab22f70f5c65d45f9754e7064e5a152c")); + executeTest("test mismatched PLs", spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java new file mode 100644 index 000000000..c9476f7eb --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java @@ -0,0 +1,230 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + + +// SEE private/R/pls.R if you want the truth output for these tests +public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest { + @DataProvider(name = "TestCombineGLs") + public Object[][] makeTestCombineGLs() { + List tests = new ArrayList(); + + tests.add(new Object[]{1, 1, makePL( 0, 10, 20), makePL( 0, 10, 20)}); + tests.add(new Object[]{1, 1, makePL(10, 0, 20), makePL(10, 0, 20)}); + tests.add(new Object[]{1, 1, makePL(20, 10, 0), makePL(20, 10, 0)}); + + // AA AB BB AC BC CC => AA AB+BC CC + tests.add(new Object[]{1, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 10, 20)}); + tests.add(new Object[]{2, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 30, 50)}); + + tests.add(new Object[]{1, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)}); + tests.add(new Object[]{2, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)}); + + tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5)}); + tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9)}); + + tests.add(new Object[]{1, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)}); + tests.add(new Object[]{2, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)}); + + tests.add(new Object[]{1, 2, makePL( 50, 0, 50, 50, 50, 50), makePL(45, 0, 50)}); + tests.add(new Object[]{2, 2, makePL( 50, 0, 50, 50, 50, 50), makePL( 0, 47, 50)}); + + tests.add(new Object[]{1, 2, 
makePL( 50, 50, 0, 50, 50, 50), makePL(45, 47, 0)}); + tests.add(new Object[]{2, 2, makePL( 50, 50, 0, 50, 50, 50), makePL( 0, 47, 50)}); + + tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(0, 47, 50)}); + tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(45, 0, 50)}); + + tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)}); + tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)}); + + tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(0, 47, 50)}); + tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(45, 47, 0)}); + + return tests.toArray(new Object[][]{}); + } + + private Genotype makePL(final int ... PLs) { + return AFCalcUnitTest.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), PLs); + } + + @Test(enabled = true, dataProvider = "TestCombineGLs") + public void testCombineGLsPrecise(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) { + final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); + final Genotype combined = calc.combineGLsPrecise(testg, altIndex, nAlts); + + Assert.assertEquals(combined.getPL(), expected.getPL(), + "Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + Utils.join(",", expected.getPL())); + } + + @Test(enabled = true, dataProvider = "TestCombineGLs") + public void testCombinePrecise(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) { + final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); + final Genotype combined = calc.combineGLsPrecise(testg, altIndex, nAlts); + + Assert.assertEquals(combined.getPL(), expected.getPL(), + "Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + 
Utils.join(",", expected.getPL())); + } + + static Allele A = Allele.create("A", true); + static Allele C = Allele.create("C"); + static Allele G = Allele.create("G"); + + @DataProvider(name = "TestMakeAlleleConditionalContexts") + public Object[][] makeTestMakeAlleleConditionalContexts() { + List tests = new ArrayList(); + + final VariantContextBuilder root = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A)); + final VariantContextBuilder vcAC = new VariantContextBuilder(root).alleles(Arrays.asList(A, C)); + final VariantContextBuilder vcAG = new VariantContextBuilder(root).alleles(Arrays.asList(A, G)); + final VariantContextBuilder vcACG = new VariantContextBuilder(root).alleles(Arrays.asList(A, C, G)); + final VariantContextBuilder vcAGC = new VariantContextBuilder(root).alleles(Arrays.asList(A, G, C)); + + final Genotype gACG = makePL( 0, 1, 2, 3, 4, 5); + final Genotype gAGC = makePL( 0, 4, 5, 1, 3, 2); + final Genotype gACcombined = makePL(0, 2, 5); + final Genotype gACcombined2 = makePL(0, 1, 4); + final Genotype gAGcombined = makePL(0, 4, 9); + + // biallelic + tests.add(new Object[]{vcAC.genotypes(gACcombined).make(), Arrays.asList(vcAC.genotypes(gACcombined).make())}); + + // tri-allelic + tests.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), vcAG.genotypes(gAGcombined).make())}); + tests.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACcombined2).make())}); + + return tests.toArray(new Object[][]{}); + } + + + @Test(enabled = true, dataProvider = "TestMakeAlleleConditionalContexts") + private void testMakeAlleleConditionalContexts(final VariantContext vc, final List expectedVCs) { + final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); + final List biAllelicVCs = calc.makeAlleleConditionalContexts(vc); + + 
Assert.assertEquals(biAllelicVCs.size(), expectedVCs.size()); + + for ( int i = 0; i < biAllelicVCs.size(); i++ ) { + final VariantContext actual = biAllelicVCs.get(i); + final VariantContext expected = expectedVCs.get(i); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles()); + + for ( int j = 0; j < actual.getNSamples(); j++ ) + Assert.assertEquals(actual.getGenotype(j).getPL(), expected.getGenotype(j).getPL(), + "expected PLs " + Utils.join(",", expected.getGenotype(j).getPL()) + " not equal to actual " + Utils.join(",", actual.getGenotype(j).getPL())); + } + } + + + @DataProvider(name = "ThetaNTests") + public Object[][] makeThetaNTests() { + List tests = new ArrayList(); + + final List log10LAlleles = Arrays.asList(0.0, -1.0, -2.0, -3.0, -4.0); + + for ( final double log10pRef : Arrays.asList(-1, -2, -3) ) { + for ( final int ploidy : Arrays.asList(1, 2, 3, 4) ) { + for ( List permutations : Utils.makePermutations(log10LAlleles, ploidy, true)) { + tests.add(new Object[]{permutations, Math.pow(10, log10pRef)}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ThetaNTests") + public void testThetaNTests(final List log10LAlleles, final double pRef) { + // biallelic + final double[] rawPriors = MathUtils.toLog10(new double[]{pRef, 1-pRef}); + + final double log10pNonRef = Math.log10(1-pRef); + + final List originalPriors = new LinkedList(); + final List pNonRefN = new LinkedList(); + for ( int i = 0; i < log10LAlleles.size(); i++ ) { + final double log10LAllele1 = log10LAlleles.get(i); + final double[] L1 = MathUtils.normalizeFromLog10(new double[]{log10LAllele1, 0.0}, true); + final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, -10000.0)); + originalPriors.add(result1); + pNonRefN.add(log10pNonRef*(i+1)); + } + + final IndependentAllelesDiploidExactAFCalc calc = 
(IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 2); + final List thetaNPriors = calc.applyMultiAllelicPriors(originalPriors); + + double prevPosterior = 0.0; + for ( int i = 0; i < log10LAlleles.size(); i++ ) { + final AFCalcResult thetaN = thetaNPriors.get(i); + AFCalcResult orig = null; + for ( final AFCalcResult x : originalPriors ) + if ( x.getAllelesUsedInGenotyping().equals(thetaN.getAllelesUsedInGenotyping())) + orig = x; + + Assert.assertNotNull(orig, "couldn't find original AFCalc"); + + Assert.assertEquals(orig.getLog10PriorOfAFGT0(), log10pNonRef, 1e-6); + Assert.assertEquals(thetaN.getLog10PriorOfAFGT0(), pNonRefN.get(i), 1e-6); + + Assert.assertTrue(orig.getLog10PosteriorOfAFGT0() <= prevPosterior, "AFCalc results should be sorted but " + prevPosterior + " is > original posterior " + orig.getLog10PosteriorOfAFGT0()); + prevPosterior = orig.getLog10PosteriorOfAFGT0(); + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java new file mode 100644 index 000000000..664afda51 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java @@ -0,0 +1,249 @@ +/* +* By 
downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.RandomDNA; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pairhmm.ActiveRegionTestDataSet; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Tests for {@link AssemblyResultSet}. 
+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class AssemblyResultSetUnitTest extends BaseTest +{ + private GenomeLocParser genomeLocParser; + private SAMFileHeader header; + + @BeforeClass + public void init() { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + } + + + @Test + public void testEmptyResultSet() { + final AssemblyResultSet subject = new AssemblyResultSet(); + + Assert.assertEquals(subject.getHaplotypeList().size(), 0); + Assert.assertEquals(subject.getHaplotypeCount(),0); + Assert.assertEquals(subject.getReferenceHaplotype(),null); + Assert.assertEquals(subject.getFullReferenceWithPadding(),null); + Assert.assertEquals(subject.getPaddedReferenceLoc(),null); + Assert.assertEquals(subject.getRegionForGenotyping(),null); + Assert.assertEquals(subject.getUniqueReadThreadingGraph(10),null); + Assert.assertFalse(subject.hasMultipleKmerSizes()); + } + + @Test + public void testAddReferenceHaplotype() { + + final Haplotype ref = new Haplotype("ACGT".getBytes(),true); + ref.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,ref.length() + 1 )); + final AssemblyResultSet subject = new AssemblyResultSet(); + + Assert.assertTrue(subject.add(ref)); + Assert.assertFalse(subject.add(ref)); + + Assert.assertEquals(subject.getReferenceHaplotype(),ref); + Assert.assertEquals(subject.getHaplotypeCount(),1); + Assert.assertEquals(subject.getHaplotypeList().size(),1); + } + + @Test(dataProvider="assemblyResults") + public void testAddManyHaplotypes(final java.util.List assemblyResults, + final java.util.List> haplotypes) { + final AssemblyResultSet subject = new AssemblyResultSet(); + for (int i = 0; i < haplotypes.size(); i++) { + final int haplotypeCountBefore = subject.getHaplotypeCount(); + final java.util.List haplos = haplotypes.get(i); + final AssemblyResult ar = assemblyResults.get(i); + for (final Haplotype h : haplos) { + 
Assert.assertTrue(subject.add(h, ar)); + Assert.assertFalse(subject.add(h,ar)); + if (h.isReference()) + Assert.assertEquals(subject.getReferenceHaplotype(),h); + } + final int haplotypeCountAfter = subject.getHaplotypeCount(); + Assert.assertEquals(haplos.size(),haplotypeCountAfter - haplotypeCountBefore); + Assert.assertTrue(subject.getMaximumKmerSize() >= ar.getKmerSize()); + Assert.assertTrue(subject.getMinimumKmerSize() <= ar.getKmerSize()); + Assert.assertEquals(subject.getUniqueReadThreadingGraph(ar.getKmerSize()), ar.getThreadingGraph()); + } + } + + @Test(dataProvider="trimmingData") + public void testTrimTo(final Map haplotypesAndResultSets, final ActiveRegion original) { + final AssemblyResultSet subject = new AssemblyResultSet(); + for (final Map.Entry entry : haplotypesAndResultSets.entrySet()) + subject.add(entry.getKey(),entry.getValue()); + subject.setRegionForGenotyping(original); + final GenomeLoc originalLocation = original.getExtendedLoc(); + final int length = originalLocation.size(); + final GenomeLoc newLocation = originalLocation.setStop(originalLocation.setStart(originalLocation,originalLocation.getStart() + length / 2),originalLocation.getStop() - length / 2); + final ActiveRegion newRegion = original.trim(newLocation); + + final Map originalHaplotypesByTrimmed = new HashMap<>(haplotypesAndResultSets.size()); + for (final Haplotype h : haplotypesAndResultSets.keySet()) + originalHaplotypesByTrimmed.put(h.trim(newRegion.getExtendedLoc()), h); + + final AssemblyResultSet trimmed = subject.trimTo(newRegion); + + Assert.assertFalse(subject.wasTrimmed()); + Assert.assertTrue(trimmed.wasTrimmed()); + + for (final Haplotype h : trimmed.getHaplotypeList()) { + Assert.assertEquals(h.getGenomeLocation(),newLocation); + Assert.assertEquals(h.getBases().length,newLocation.size()); + } + } + + @DataProvider(name="trimmingData") + public Iterator trimmingData() { + final ActiveRegion activeRegion = new 
ActiveRegion(genomeLocParser.createGenomeLoc("chr1",1000,1100),genomeLocParser,25); + final int length = activeRegion.getExtendedLoc().size(); + final RandomDNA rnd = new RandomDNA(13); // keep it prepoducible by fixing the seed to lucky 13. + final ActiveRegionTestDataSet actd = new ActiveRegionTestDataSet(10,new String(rnd.nextBases(length)),new String[] { + "Civar:*1T*" }, new String[0], new byte[0], new byte[0], new byte[0]); + + final List haplotypes = actd.haplotypeList(); + for (final Haplotype h : haplotypes) + h.setGenomeLocation(activeRegion.getExtendedLoc()); + + final ReadThreadingGraph rtg = new ReadThreadingGraph(10); + for (final Haplotype h : haplotypes) + rtg.addSequence("seq-" + Math.abs(h.hashCode()), h.getBases(), h.isReference()); + final SeqGraph seqGraph = rtg.convertToSequenceGraph(); + final AssemblyResult ar = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,seqGraph); + ar.setThreadingGraph(rtg); + final Map result = + new HashMap<>(); + for (final Haplotype h : haplotypes) + result.put(h,ar); + return Collections.singleton(new Object[] {result,activeRegion}).iterator(); + + } + + + + + @DataProvider(name="assemblyResults") + public java.util.Iterator assemblyResults() { + final int size = THREE_KS_GRAPH_AND_HAPLOTYPES.length * (1 + TEN_KS_GRAPH_AND_HAPLOTYPES.length); + final Object[][] result = new Object[size][]; + + for (int i = 0; i < THREE_KS_GRAPH_AND_HAPLOTYPES.length; i++) { + final ReadThreadingGraph rtg = new ReadThreadingGraph((String) THREE_KS_GRAPH_AND_HAPLOTYPES[i][0]); + final AssemblyResult ar = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,rtg.convertToSequenceGraph()); + ar.setThreadingGraph(rtg); + final Object[] haplotypeStrings = (Object[]) THREE_KS_GRAPH_AND_HAPLOTYPES[i][1]; + final Haplotype[] haplotypes = new Haplotype[haplotypeStrings.length]; + for (int j = 0; j < haplotypeStrings.length; j++) { + haplotypes[j] = new Haplotype(((String)haplotypeStrings[j]).getBytes(),j == 
0); + haplotypes[j].setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,haplotypes[j].length() + 1)); + } + result[i] = new Object[] { Collections.singletonList(ar),Arrays.asList(Arrays.asList(haplotypes))}; + for (int j = 0; j < TEN_KS_GRAPH_AND_HAPLOTYPES.length; j++) { + final ReadThreadingGraph rtg10 = new ReadThreadingGraph((String) TEN_KS_GRAPH_AND_HAPLOTYPES[j][0]); + final AssemblyResult ar10 = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,rtg10.convertToSequenceGraph()); + ar10.setThreadingGraph(rtg10); + final Object[] haplotypeStrings10 = (Object[]) TEN_KS_GRAPH_AND_HAPLOTYPES[j][1]; + final Haplotype[] haplotype10 = new Haplotype[haplotypeStrings10.length]; + for (int k = 0; k < haplotypeStrings10.length; k++) { + haplotype10[k] = new Haplotype(((String)haplotypeStrings10[k]).getBytes(),false); + haplotype10[k].setGenomeLocation(genomeLocParser.createGenomeLoc("chr1", 1, haplotype10[k].length() + 1)); + } + + result[THREE_KS_GRAPH_AND_HAPLOTYPES.length + i * TEN_KS_GRAPH_AND_HAPLOTYPES.length + j] = new Object[] { Arrays.asList(ar,ar10), + Arrays.asList( Arrays.asList(haplotypes), Arrays.asList(haplotype10)) }; + } + } + return Arrays.asList(result).iterator(); + } + + + private static final Object[][] THREE_KS_GRAPH_AND_HAPLOTYPES = new Object[][] { + {"[ks=3]{REF: ACT}",new Object[] {"ACT"}}, + {"[ks=3]{REF: ACT(3) -> T(1) -> G(2) -> A}" + + "{ (3) -> A -> G -> (2) }" + + "{ (1) -> A -> G -> (2) }",new Object[] {"ACTTGA","ACTAGGA","ACTTAGGA"}}, + {"[ks=3]{REF: ACT -> C(1) -> G}{ACT -> C(1) -> G}{ACT -> C(1) -> G}", new Object[] {"ACTCG"}} , + {"[ks=3]{REF: ACT -> A(1) -> G -> A(2) -> C -> G -> T }" + + "{A(1) -> T -> A(2) }", new Object[] {"ACTAGACGT","ACTATACGT"}} , + {"[ks=3]{REF: ACT -> A -> T(2) -> C -> A -> G -> T -> A -> C -> G -> T -> A(1) -> T}" + + "{ ACT -> A -> T(2) -> C -> T -> A -> C -> G -> T -> A(1) -> T}", + new Object[] {"ACTATCAGTACGTAT","ACTATCTACGTAT"}} , + {"[ks=3]{REF: ACT -> A -> T -> C -> A -> G -> T 
-> A -> C -> G -> T -> A -> T}", + new Object[] {"ACTATCAGTACGTAT"}}, + {"[ks=3]{REF: ACT -> A -> T(1) }" + + "{ ACT -> A -> T(1) }", new Object[] {"ACTAT"}}, + {"[ks=3]{REF: TTT -> A(1) -> C -> T(2)}{ A(1) -> T(2) } ", new Object[] {"TTTACT","TTTAT"}} + }; + + private static final Object[][] TEN_KS_GRAPH_AND_HAPLOTYPES = new Object[][] { + {"[ks=10]{ACTAGTAAAT -> A -> T -> A -> A -> T -> A", new Object[] {"ACTAGTAAATATAATA"}}, + {"[ks=10]{ATAGTAATAA(1) -> A -> C -> T -> A(2) -> C}{ (1) -> C -> C -> C -> A(2) -> C}", + new Object[] {"ATAGTAATAAACTAC","ATAGTAATAACCCAC"}}, + + }; + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Civar.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Civar.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Civar.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Civar.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/CivarUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/CivarUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/CivarUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/CivarUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java new file mode 100644 index 000000000..57df96475 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java @@ -0,0 +1,363 @@ +/* +* By downloading the PROGRAM 
you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: 3/15/12 + */ + +import net.sf.picard.reference.ReferenceSequenceFile; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +/** + * Unit tests for GenotypingEngine + */ +public class GenotypingEngineUnitTest extends BaseTest { + + private static ReferenceSequenceFile seq; + private GenomeLocParser 
genomeLocParser; + + @BeforeClass + public void init() throws FileNotFoundException { + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + genomeLocParser = new GenomeLocParser(seq); + } + + @Test + public void testFindHomVarEventAllelesInSample() { + final List eventAlleles = new ArrayList(); + eventAlleles.add( Allele.create("A", true) ); + eventAlleles.add( Allele.create("C", false) ); + final List haplotypeAlleles = new ArrayList(); + haplotypeAlleles.add( Allele.create("AATA", true) ); + haplotypeAlleles.add( Allele.create("AACA", false) ); + haplotypeAlleles.add( Allele.create("CATA", false) ); + haplotypeAlleles.add( Allele.create("CACA", false) ); + final List haplotypes = new ArrayList(); + haplotypes.add(new Haplotype("AATA".getBytes())); + haplotypes.add(new Haplotype("AACA".getBytes())); + haplotypes.add(new Haplotype("CATA".getBytes())); + haplotypes.add(new Haplotype("CACA".getBytes())); + final List haplotypeAllelesForSample = new ArrayList(); + haplotypeAllelesForSample.add( Allele.create("CATA", false) ); + haplotypeAllelesForSample.add( Allele.create("CACA", false) ); + final List> alleleMapper = new ArrayList>(); + List Aallele = new ArrayList(); + Aallele.add(haplotypes.get(0)); + Aallele.add(haplotypes.get(1)); + List Callele = new ArrayList(); + Callele.add(haplotypes.get(2)); + Callele.add(haplotypes.get(3)); + alleleMapper.add(Aallele); + alleleMapper.add(Callele); + final List eventAllelesForSample = new ArrayList(); + eventAllelesForSample.add( Allele.create("C", false) ); + eventAllelesForSample.add( Allele.create("C", false) ); + + if(!compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))) { + logger.warn("calc alleles = " + GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes)); + logger.warn("expected alleles = " + 
eventAllelesForSample); + } + Assert.assertTrue(compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))); + } + + @Test + public void testFindHetEventAllelesInSample() { + final List eventAlleles = new ArrayList(); + eventAlleles.add( Allele.create("A", true) ); + eventAlleles.add( Allele.create("C", false) ); + eventAlleles.add( Allele.create("T", false) ); + final List haplotypeAlleles = new ArrayList(); + haplotypeAlleles.add( Allele.create("AATA", true) ); + haplotypeAlleles.add( Allele.create("AACA", false) ); + haplotypeAlleles.add( Allele.create("CATA", false) ); + haplotypeAlleles.add( Allele.create("CACA", false) ); + haplotypeAlleles.add( Allele.create("TACA", false) ); + haplotypeAlleles.add( Allele.create("TTCA", false) ); + haplotypeAlleles.add( Allele.create("TTTA", false) ); + final List haplotypes = new ArrayList(); + haplotypes.add(new Haplotype("AATA".getBytes())); + haplotypes.add(new Haplotype("AACA".getBytes())); + haplotypes.add(new Haplotype("CATA".getBytes())); + haplotypes.add(new Haplotype("CACA".getBytes())); + haplotypes.add(new Haplotype("TACA".getBytes())); + haplotypes.add(new Haplotype("TTCA".getBytes())); + haplotypes.add(new Haplotype("TTTA".getBytes())); + final List haplotypeAllelesForSample = new ArrayList(); + haplotypeAllelesForSample.add( Allele.create("TTTA", false) ); + haplotypeAllelesForSample.add( Allele.create("AATA", true) ); + final List> alleleMapper = new ArrayList>(); + List Aallele = new ArrayList(); + Aallele.add(haplotypes.get(0)); + Aallele.add(haplotypes.get(1)); + List Callele = new ArrayList(); + Callele.add(haplotypes.get(2)); + Callele.add(haplotypes.get(3)); + List Tallele = new ArrayList(); + Tallele.add(haplotypes.get(4)); + Tallele.add(haplotypes.get(5)); + Tallele.add(haplotypes.get(6)); + alleleMapper.add(Aallele); + alleleMapper.add(Callele); + alleleMapper.add(Tallele); + final List 
eventAllelesForSample = new ArrayList(); + eventAllelesForSample.add( Allele.create("A", true) ); + eventAllelesForSample.add( Allele.create("T", false) ); + + if(!compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))) { + logger.warn("calc alleles = " + GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes)); + logger.warn("expected alleles = " + eventAllelesForSample); + } + Assert.assertTrue(compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))); + } + + private boolean compareAlleleLists(List l1, List l2) { + if( l1.size() != l2.size() ) { + return false; // sanity check + } + + for( int i=0; i < l1.size(); i++ ){ + if ( !l2.contains(l1.get(i)) ) + return false; + } + return true; + } + + + private class BasicGenotypingTestProvider extends TestDataProvider { + byte[] ref; + byte[] hap; + Map expected; + + public BasicGenotypingTestProvider(String refString, String hapString, Map expected) { + super(BasicGenotypingTestProvider.class, String.format("Haplotype to VCF test: ref = %s, alignment = %s", refString,hapString)); + ref = refString.getBytes(); + hap = hapString.getBytes(); + this.expected = expected; + } + + public Map calcAlignment() { + final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap); + final Haplotype h = new Haplotype(hap, false, alignment.getAlignmentStart2wrt1(), alignment.getCigar()); + return GenotypingEngine.generateVCsFromAlignment( h, ref, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name"); + } + } + + @DataProvider(name = "BasicGenotypingTestProvider") + public Object[][] makeBasicGenotypingTests() { + + for( int contextSize : new int[]{0,1,5,9,24,36} ) { + Map map = new HashMap(); + map.put(1 + contextSize, 
(byte)'M'); + final String context = Utils.dupString('G', contextSize); + new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGACTAGCCGATAG", map); + } + + for( int contextSize : new int[]{0,1,5,9,24,36} ) { + Map map = new HashMap(); + map.put(2 + contextSize, (byte)'M'); + map.put(21 + contextSize, (byte)'M'); + final String context = Utils.dupString('G', contextSize); + new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG", "ATCTCGCATCGCGAGCATCGCCTAGCCGATAG", map); + } + + for( int contextSize : new int[]{0,1,5,9,24,36} ) { + Map map = new HashMap(); + map.put(1 + contextSize, (byte)'M'); + map.put(20 + contextSize, (byte)'I'); + final String context = Utils.dupString('G', contextSize); + new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGACACTAGCCGATAG", map); + } + + for( int contextSize : new int[]{0,1,5,9,24,36} ) { + Map map = new HashMap(); + map.put(1 + contextSize, (byte)'M'); + map.put(20 + contextSize, (byte)'D'); + final String context = Utils.dupString('G', contextSize); + new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCGATAG", map); + } + + for( int contextSize : new int[]{1,5,9,24,36} ) { + Map map = new HashMap(); + map.put(1, (byte)'M'); + map.put(20, (byte)'D'); + final String context = Utils.dupString('G', contextSize); + new BasicGenotypingTestProvider("AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCGATAG", map); + } + + for( int contextSize : new int[]{0,1,5,9,24,36} ) { + Map map = new HashMap(); + map.put(2 + contextSize, (byte)'M'); + map.put(20 + contextSize, (byte)'I'); + map.put(30 + contextSize, (byte)'D'); + final String context = Utils.dupString('G', contextSize); + new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "ACCTCGCATCGCGAGCATCGTTACTAGCCGATG", map); + } + + for( 
int contextSize : new int[]{0,1,5,9,24,36} ) { + Map map = new HashMap(); + map.put(1 + contextSize, (byte)'M'); + map.put(20 + contextSize, (byte)'D'); + map.put(28 + contextSize, (byte)'M'); + final String context = Utils.dupString('G', contextSize); + new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCCATAG", map); + } + + return BasicGenotypingTestProvider.getTests(BasicGenotypingTestProvider.class); + } + + @Test(dataProvider = "BasicGenotypingTestProvider", enabled = true) + public void testHaplotypeToVCF(BasicGenotypingTestProvider cfg) { + Map calculatedMap = cfg.calcAlignment(); + Map expectedMap = cfg.expected; + logger.warn(String.format("Test: %s", cfg.toString())); + if(!compareVCMaps(calculatedMap, expectedMap)) { + logger.warn("calc map = " + calculatedMap); + logger.warn("expected map = " + expectedMap); + } + Assert.assertTrue(compareVCMaps(calculatedMap, expectedMap)); + } + + @Test(dataProvider="AddMiscellaneousDataProvider", enabled=false) + public void testAddMiscellaneousAllele(final String readBases, final int readOffset, + final String ref, final int refOffset, + final String referenceAllele, final String[] alternatives, final double[] likelihoods, final double[] expected) { + final byte baseQual = (byte)30; + + final byte[] baseQuals = Utils.dupBytes(baseQual, readBases.length()); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(), baseQuals, readBases.length() + "M"); + final GenomeLoc loc = new UnvalidatingGenomeLoc("20",0,refOffset,refOffset); + final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc,Collections.singletonList(read),readOffset); + final VariantContextBuilder vcb = new VariantContextBuilder(); + final GenotypeBuilder gb = new GenotypeBuilder(); + final List alleleStrings = new ArrayList<>( 1 + alternatives.length); + alleleStrings.add(referenceAllele); + alleleStrings.addAll(Arrays.asList(alternatives)); + + 
gb.AD(new int[] { 1 }); + gb.DP(1); + gb.PL(likelihoods); + + vcb.alleles(alleleStrings); + vcb.loc("20",refOffset,refOffset + referenceAllele.length() -1); + + vcb.genotypes(gb.make()); + + final VariantContext vc = vcb.make(); + + final VariantContext updatedVc = null; // GenotypingEngine.addMiscellaneousAllele(vc,pileup,ref.getBytes(),0); + final GenotypeLikelihoods updatedLikelihoods = updatedVc.getGenotype(0).getLikelihoods(); + Assert.assertEquals(updatedLikelihoods.getAsVector().length, expected.length); + final double[] updatedLikelihoodsArray = updatedVc.getGenotype(0).getLikelihoods().getAsVector(); + for (int i = 0; i < updatedLikelihoodsArray.length; i++) { + Assert.assertEquals(updatedLikelihoodsArray[i],expected[i],0.0001); + } + Allele altAllele = null; + for (final Allele allele : updatedVc.getAlleles()) + if (allele.isSymbolic() && allele.getBaseString().equals(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME)) + altAllele = allele; + Assert.assertNotNull(altAllele); + } + + @DataProvider(name="AddMiscellaneousDataProvider") + public Iterator addMiscellaneousAlleleDataProvider() { + return Arrays.asList(ADD_MISCELLANEOUS_ALLELE_DATA).iterator(); + } + + private static final double MATCH_LnLK = QualityUtils.qualToProbLog10((byte)30); + private static final double MISS_LnLK = QualityUtils.qualToErrorProbLog10((byte)30); + + private static final Object[][] ADD_MISCELLANEOUS_ALLELE_DATA = new Object[][] { + new Object[] {"ACTG", 0,"ACTGTGAGTATTCC",0,"A",new String[]{}, new double[] {MATCH_LnLK * MATCH_LnLK}, 6, + new double[] {MATCH_LnLK * MATCH_LnLK,MATCH_LnLK * MISS_LnLK, MISS_LnLK * MISS_LnLK}} + }; + + /** + * Private function to compare Map of VCs, it only checks the types and start locations of the VariantContext + */ + private boolean compareVCMaps(Map calc, Map expected) { + if( !calc.keySet().equals(expected.keySet()) ) { return false; } // sanity check + for( Integer loc : expected.keySet() ) { + Byte type = expected.get(loc); + switch( 
type ) { + case 'I': + if( !calc.get(loc).isSimpleInsertion() ) { return false; } + break; + case 'D': + if( !calc.get(loc).isSimpleDeletion() ) { return false; } + break; + case 'M': + if( !(calc.get(loc).isMNP() || calc.get(loc).isSNP()) ) { return false; } + break; + default: + return false; + } + } + return true; + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java new file mode 100644 index 000000000..2838648d5 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -0,0 +1,99 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.NA12878_CHR20_BAM; +import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.REF; + +public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest { + + private void HCTestComplexVariants(String bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; + final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec); + } + + @Test + public void testHaplotypeCallerMultiSampleComplex1() { + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "7278afd47e5851c954359441cac2f0b8"); + } + + private void HCTestSymbolicVariants(String 
bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; + final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec); + } + + // TODO -- need a better symbolic allele test + @Test + public void testHaplotypeCallerSingleSampleSymbolic() { + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "e746a38765298acd716194aee4d93554"); + } + + private void HCTestComplexGGA(String bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec); + } + + @Test + public void testHaplotypeCallerMultiSampleGGAComplex() { + HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", + "cbdd34c454d69b266e3681ddfc33c7a3"); + } + + @Test + public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { + HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", + "f50e0b35e2240b19b1b8b6dfa0cf9796"); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java new file mode 100644 index 000000000..8ca67f31d --- /dev/null +++ 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -0,0 +1,156 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { + @DataProvider(name = "MyDataProvider") + public Object[][] makeMyDataProvider() { + List tests = new ArrayList<>(); + + final String PCRFreeIntervals = "-L 20:10,000,000-10,010,000"; + final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, 
"50323a284788c8220c9226037c7003b5"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "7c16aa8e35de9f418533efac3bae6551"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "7e1e193d70187774f9740d475e0f1cc1"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "39bf5fe3911d0c646eefa8f79894f4df"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "d926d653500a970280ad7828d9ee2b84"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "83ddc16e4f0900429b2da30e582994aa"}); + + return tests.toArray(new Object[][]{}); + } + + /** + * Example testng test using MyDataProvider + */ + @Test(dataProvider = "MyDataProvider") + public void testHCWithGVCF(String bam, HaplotypeCaller.ReferenceConfidenceMode mode, String intervals, String md5) { + final String commandLine = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", + b37KGReference, bam, intervals, mode, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode; + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5)); + final Pair,List> executionOutput = executeTest(name, spec); + } + + @Test + public void testERCRegionWithNoCalledHaplotypes() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", + b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, 
HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); + spec.disableShadowBCF(); + executeTest("testERCRegionWithNoCalledHaplotypes", spec); + } + + @Test() + public void testMissingGVCFIndexException() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF", + b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001"); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class); + spec.disableShadowBCF(); + executeTest("testMissingGVCFIndexingStrategyException", spec); + } + + @Test() + public void testWrongParameterGVCFIndexException() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", + b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER + 1); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class); + spec.disableShadowBCF(); + executeTest("testMissingGVCFIndexingStrategyException", spec); + } + + @Test() + public void testWrongTypeGVCFIndexException() { + // ensure non-optimal, if optimal changes + GATKVCFIndexType type = GATKVCFIndexType.DYNAMIC_SEEK; + if (HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE == GATKVCFIndexType.DYNAMIC_SEEK) + type = GATKVCFIndexType.DYNAMIC_SIZE; + + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", + b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", type, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, 
UserException.GVCFIndexException.class); + spec.disableShadowBCF(); + executeTest("testMissingGVCFIndexingStrategyException", spec); + } + + private final static String WRONG_GVCF_RECORD_ORDER_BUGFIX_INTERVALS = privateTestDir + "gvcf_unsorted_records_bug.interval_list"; + private final static String WRONG_GVCF_RECORD_ORDER_BUGFIX_BAM = privateTestDir + "gvcf_unsorted_records_bug.bam"; + + @Test() + public void testWrongGVCFNonVariantRecordOrderBugFix() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", + b37KGReference, WRONG_GVCF_RECORD_ORDER_BUGFIX_BAM, WRONG_GVCF_RECORD_ORDER_BUGFIX_INTERVALS, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("324eb46738a364cd7dc5fa0b62491a5e")); + spec.disableShadowBCF(); + executeTest("testMissingGVCFIndexingStrategyException", spec); + } + + private static final String NOCALL_GVCF_BUGFIX_INTERVALS = privateTestDir + "gvcf_nocall_bug.interval_list"; + private static final String NOCALL_GVCF_BUGFIX_BAM = privateTestDir + "gvcf_nocall_bug.bam"; + + @Test + public void testNoCallGVCFMissingPLsBugFix() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", + b37KGReference, NOCALL_GVCF_BUGFIX_BAM, NOCALL_GVCF_BUGFIX_INTERVALS, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("4e2c20650c4c5ae6fa44b289eae5771d")); + spec.disableShadowBCF(); + executeTest("testNoCallGVCFMissingPLsBugFix", spec); + } +} diff --git 
a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java new file mode 100644 index 000000000..386fc3800 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -0,0 +1,301 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broad.tribble.readers.LineIterator; +import org.broad.tribble.readers.PositionalBufferedStream; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.*; + +public class HaplotypeCallerIntegrationTest extends WalkerTest { + final static String REF = b37KGReference; + final static String NA12878_BAM = privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; + final static String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; + final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + final static String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam"; + final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam"; + final static String NA12878_PCRFREE250_ADAPTER_TRIMMED = privateTestDir + "PCRFree.2x250.b37_decoy.NA12878.adapter_trimmed-10000000-11000000.bam"; + final static String CEUTRIO_MT_TEST_BAM = privateTestDir + "CEUTrio.HiSeq.b37.MT.1_50.bam"; + final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; + + private void HCTest(String bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering 
--pcr_indel_model NONE -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testHaplotypeCaller: args=" + args, spec); + } + + @Test + public void testHaplotypeCallerMultiSample() { + HCTest(CEUTRIO_BAM, "", "489073bf0034fe9f10e6472ab93a17eb"); + } + + @Test + public void testHaplotypeCallerSingleSample() { + HCTest(NA12878_BAM, "", "c208ef58d464465c68b5c26501122ad7"); + } + + @Test + public void testHaplotypeCallerMinBaseQuality() { + HCTest(NA12878_BAM, "-mbq 15", "6509cfd0554ecbb92a1b303fbcc0fcca"); + } + + @Test + public void testHaplotypeCallerGraphBasedSingleSample() { + HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "049ba1794a1ce2b15566bb1e9431fccf"); + } + + @Test + public void testHaplotypeCallerGraphBasedMultiSample() { + HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "d45b2b26434dd3bd48df5a43b3d2954a"); + } + + @Test(enabled = false) // can't annotate the rsID's yet + public void testHaplotypeCallerSingleSampleWithDbsnp() { + HCTest(NA12878_BAM, "-D " + b37dbSNP132, ""); + } + + @Test + public void testHaplotypeCallerMultiSampleGGA() { + HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", + "a1e59313516c2d5eeedae8348b0bdff1"); + } + + @Test + public void testHaplotypeCallerInsertionOnEdgeOfContig() { + HCTest(CEUTRIO_MT_TEST_BAM, "-L MT:1-10", "7f1fb8f9587f64643f6612ef1dd6d4ae"); + } + + private void HCTestIndelQualityScores(String bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + 
executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec); + } + + @Test + public void testHaplotypeCallerSingleSampleIndelQualityScores() { + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "d3fc49d3d3c8b6439548133e03faa540"); + } + + private void HCTestNearbySmallIntervals(String bam, String args, String md5) { + try { + final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference)); + final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary()); + + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) { + if( containsDuplicateRecord(vcf, parser) ) { + throw new IllegalStateException("Duplicate records detected but there should be none."); + } + } + } catch( FileNotFoundException e ) { + throw new IllegalStateException("Could not find the b37 reference file."); + } + } + + private boolean containsDuplicateRecord( final File vcf, final GenomeLocParser parser ) { + final List> VCs = new ArrayList<>(); + try { + for( final VariantContext vc : GATKVCFUtils.readVCF(vcf).getSecond() ) { + VCs.add(new Pair<>(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc))); + } + } catch( IOException e ) { + throw new IllegalStateException("Somehow the temporary VCF from the integration test could not be read."); + } + + final Set> VCsAsSet = new HashSet<>(VCs); + return VCsAsSet.size() != VCs.size(); // The set will remove duplicate Events. 
+ } + + + @Test + public void testHaplotypeCallerNearbySmallIntervals() { + HCTestNearbySmallIntervals(NA12878_BAM, "", "a415bc76231a04dc38412ff38aa0dc49"); + } + + // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper + // was modifying the GATKSamRecord and that was screwing up the traversal engine from map call to + // map call. So the test is there for consistency but not for correctness. I'm not sure we can trust + // any of the calls in that region because it is so messy. + @Test + public void HCTestProblematicReadsModifiedInActiveRegions() { + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("763d4d8d84a4080db18235a413478660")); + executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); + } + + @Test + public void HCTestStructuralIndels() { + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("91717e5e271742c2c9b67223e58f1320")); + executeTest("HCTestStructuralIndels: ", spec); + } + + @Test + public void HCTestDoesNotFailOnBadRefBase() { + // don't care about the output - just want to make sure it doesn't fail + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Collections.emptyList()); + 
executeTest("HCTestDoesNotFailOnBadRefBase: ", spec); + } + + @Test + public void HCTestDanglingTailMergingForDeletions() throws IOException { + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800"; + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); + final File outputVCF = executeTest("HCTestDanglingTailMergingForDeletions", spec).getFirst().get(0); + + // confirm that the call is the correct one + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(outputVCF); + final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIterator); + final String line = lineIterator.next(); + Assert.assertFalse(line == null); + final VariantContext vc = codec.decode(line); + Assert.assertTrue(vc.isBiallelic()); + Assert.assertTrue(vc.getReference().basesMatch("ATGTATG")); + Assert.assertTrue(vc.getAlternateAllele(0).basesMatch("A")); + } + + + // -------------------------------------------------------------------------------------------------------------- + // + // test dbSNP annotation + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void HCTestDBSNPAnnotationWGS() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, + Arrays.asList("0998be22d7af4372247f5a0338f9446b")); + executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); + } + + @Test + public void HCTestDBSNPAnnotationWEx() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " 
+ b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + + " -L " + hg19Intervals + " -isr INTERSECTION", 1, + Arrays.asList("e39c73bbaf22b4751755d9f5bb2a8d3d")); + executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); + } + + @Test + public void HCTestDBSNPAnnotationWGSGraphBased() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, + Arrays.asList("1aeed297a3cb41940d83eac499a2ce07")); + executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); + } + + @Test + public void HCTestDBSNPAnnotationWExGraphBased() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + + " -L " + hg19Intervals + " -isr INTERSECTION", 1, + Arrays.asList("c14d7f23dedea7e7ec99a90843320c1a")); + executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); + } + + @Test + public void HCTestGraphBasedPCRFreePositiveLogLkFix() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + hg19Reference + " --no_cmdline_in_header -I " + NA12878_PCRFREE250_ADAPTER_TRIMMED + " -o %s -L 20:10,000,000-11,000,000 " + , 1, + Arrays.asList("")); + executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // test PCR indel model + // + // 
-------------------------------------------------------------------------------------------------------------- + + @Test + public void HCTestAggressivePcrIndelModelWGS() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, + Arrays.asList("f426f4c2986e1dea8f3f55951ef8e013")); + executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec); + } + + @Test + public void HCTestConservativePcrIndelModelWGS() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, + Arrays.asList("dcb38cb9280f2c3059a09d323db1c633")); + executeTest("HC calling with conservative indel error modeling on WGS intervals", spec); + } + + @Test + public void testNoSuchEdgeBugFix() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -dontTrimActiveRegions -ERC GVCF " + + "-likelihoodEngine GraphBased -variant_index_type %s -variant_index_parameter %d", + b37KGReferenceWithDecoy, privateTestDir + "graphbased_no_such_edge_bug.bam", privateTestDir + "graphbased_no_such_edge_bug.intervals.bed", + HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); + spec.disableShadowBCF(); + executeTest("testGraphBasedNoSuchEdgeBugFix", spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java similarity index 
100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java new file mode 100644 index 000000000..23513f314 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java @@ -0,0 +1,79 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. 
+* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. 
The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { + @DataProvider(name = "NCTDataProvider") + public Object[][] makeNCTDataProvider() { + List tests = new ArrayList<>(); + + for ( final int nct : Arrays.asList(1, 2, 4) ) { + tests.add(new Object[]{nct, "1f463bf3a06c401006858bc446ecea54"}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "NCTDataProvider") + public void testHCNCT(final int nct, final String md5) { + WalkerTestSpec spec = new WalkerTestSpec( + "-T HaplotypeCaller --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o %s " + + " -L 20:10,000,000-10,100,000 -G none -A -contamination 0.0 -nct " + nct, 1, + Arrays.asList(md5)); + executeTest("HC test parallel HC with NCT with nct " + nct, spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerUnitTest.java similarity index 100% rename from 
protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java new file mode 100644 index 000000000..16a3e9af2 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java @@ -0,0 +1,281 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: 3/14/12 + */ + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; +import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate; +import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; +import org.broadinstitute.variant.variantcontext.*; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Unit tests for PairHMMLikelihoodCalculationEngine + */ +public class PairHMMLikelihoodCalculationEngineUnitTest extends BaseTest { + + Allele Aref, T, C, G, Cref, ATC, ATCATC; + + @BeforeSuite + public void setup() { + // alleles + Aref = Allele.create("A", true); + Cref = Allele.create("C", true); + T = Allele.create("T"); + C = Allele.create("C"); + G = Allele.create("G"); + ATC = Allele.create("ATC"); + ATCATC = Allele.create("ATCATC"); + } + + @Test + public void testNormalizeDiploidLikelihoodMatrixFromLog10() { + double[][] 
likelihoodMatrix = { + {-90.2, 0, 0}, + {-190.1, -2.1, 0}, + {-7.0, -17.5, -35.9} + }; + double[][] normalizedMatrix = { + {-88.1, 0, 0}, + {-188.0, 0.0, 0}, + {-4.9, -15.4, -33.8} + }; + + + Assert.assertTrue(compareDoubleArrays(PairHMMLikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix), normalizedMatrix)); + + double[][] likelihoodMatrix2 = { + {-90.2, 0, 0, 0}, + {-190.1, -2.1, 0, 0}, + {-7.0, -17.5, -35.9, 0}, + {-7.0, -17.5, -35.9, -1000.0}, + }; + double[][] normalizedMatrix2 = { + {-88.1, 0, 0, 0}, + {-188.0, 0.0, 0, 0}, + {-4.9, -15.4, -33.8, 0}, + {-4.9, -15.4, -33.8, -997.9}, + }; + Assert.assertTrue(compareDoubleArrays(PairHMMLikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix2), normalizedMatrix2)); + } + + @DataProvider(name = "PcrErrorModelTestProvider") + public Object[][] createPcrErrorModelTestData() { + List tests = new ArrayList(); + + for ( final String repeat : Arrays.asList("A", "AC", "ACG", "ACGT") ) { + for ( final int repeatLength : Arrays.asList(1, 2, 3, 5, 10, 15) ) { + tests.add(new Object[]{repeat, repeatLength}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "PcrErrorModelTestProvider", enabled = true) + public void createPcrErrorModelTest(final String repeat, final int repeatLength) { + + final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine((byte)0, false, + PairHMM.HMM_IMPLEMENTATION.ORIGINAL, 0.0, true, + PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE); + + final String readString = Utils.dupString(repeat, repeatLength); + final byte[] insQuals = new byte[readString.length()]; + final byte[] delQuals = new byte[readString.length()]; + Arrays.fill(insQuals, (byte)PairHMMLikelihoodCalculationEngine.INITIAL_QSCORE); + Arrays.fill(delQuals, (byte)PairHMMLikelihoodCalculationEngine.INITIAL_QSCORE); + + engine.applyPCRErrorModel(readString.getBytes(), insQuals, delQuals); + + final 
RepeatCovariate repeatCovariate = new RepeatLengthCovariate(); + repeatCovariate.initialize(PairHMMLikelihoodCalculationEngine.MAX_STR_UNIT_LENGTH, PairHMMLikelihoodCalculationEngine.MAX_REPEAT_LENGTH); + + for ( int i = 1; i < insQuals.length; i++ ) { + + final int repeatLengthFromCovariate = repeatCovariate.findTandemRepeatUnits(readString.getBytes(), i-1).getSecond(); + final byte adjustedScore = PairHMMLikelihoodCalculationEngine.getErrorModelAdjustedQual(repeatLengthFromCovariate, 3.0); + + Assert.assertEquals(insQuals[i-1], adjustedScore); + Assert.assertEquals(delQuals[i-1], adjustedScore); + } + } + + /* + private class BasicLikelihoodTestProvider extends TestDataProvider { + public Double readLikelihoodForHaplotype1; + public Double readLikelihoodForHaplotype2; + public Double readLikelihoodForHaplotype3; + + public BasicLikelihoodTestProvider(double a, double b) { + super(BasicLikelihoodTestProvider.class, String.format("Diploid haplotype likelihoods for reads %f / %f",a,b)); + readLikelihoodForHaplotype1 = a; + readLikelihoodForHaplotype2 = b; + readLikelihoodForHaplotype3 = null; + } + + public BasicLikelihoodTestProvider(double a, double b, double c) { + super(BasicLikelihoodTestProvider.class, String.format("Diploid haplotype likelihoods for reads %f / %f / %f",a,b,c)); + readLikelihoodForHaplotype1 = a; + readLikelihoodForHaplotype2 = b; + readLikelihoodForHaplotype3 = c; + } + + public double[][] expectedDiploidHaplotypeMatrix() { + if( readLikelihoodForHaplotype3 == null ) { + double maxValue = Math.max(readLikelihoodForHaplotype1,readLikelihoodForHaplotype2); + double[][] normalizedMatrix = { + {readLikelihoodForHaplotype1 - maxValue, Double.NEGATIVE_INFINITY}, + {Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype1) + 0.5*Math.pow(10,readLikelihoodForHaplotype2)) - maxValue, readLikelihoodForHaplotype2 - maxValue} + }; + return normalizedMatrix; + } else { + double maxValue = 
MathUtils.max(readLikelihoodForHaplotype1,readLikelihoodForHaplotype2,readLikelihoodForHaplotype3); + double[][] normalizedMatrix = { + {readLikelihoodForHaplotype1 - maxValue, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY}, + {Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype1) + 0.5*Math.pow(10,readLikelihoodForHaplotype2)) - maxValue, readLikelihoodForHaplotype2 - maxValue, Double.NEGATIVE_INFINITY}, + {Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype1) + 0.5*Math.pow(10,readLikelihoodForHaplotype3)) - maxValue, + Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype2) + 0.5*Math.pow(10,readLikelihoodForHaplotype3)) - maxValue, readLikelihoodForHaplotype3 - maxValue} + }; + return normalizedMatrix; + } + } + + public double[][] calcDiploidHaplotypeMatrix() { + ArrayList haplotypes = new ArrayList(); + for( int iii = 1; iii <= 3; iii++) { + Double readLikelihood = ( iii == 1 ? readLikelihoodForHaplotype1 : ( iii == 2 ? readLikelihoodForHaplotype2 : readLikelihoodForHaplotype3) ); + int readCount = 1; + if( readLikelihood != null ) { + Haplotype haplotype = new Haplotype( (iii == 1 ? "AAAA" : (iii == 2 ? 
"CCCC" : "TTTT")).getBytes() ); + haplotype.addReadLikelihoods("myTestSample", new double[]{readLikelihood}, new int[]{readCount}); + haplotypes.add(haplotype); + } + } + final HashSet sampleSet = new HashSet(1); + sampleSet.add("myTestSample"); + return PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sampleSet, haplotypes); + } + } + + @DataProvider(name = "BasicLikelihoodTestProvider") + public Object[][] makeBasicLikelihoodTests() { + new BasicLikelihoodTestProvider(-1.1, -2.2); + new BasicLikelihoodTestProvider(-2.2, -1.1); + new BasicLikelihoodTestProvider(-1.1, -1.1); + new BasicLikelihoodTestProvider(-9.7, -15.0); + new BasicLikelihoodTestProvider(-1.1, -2000.2); + new BasicLikelihoodTestProvider(-1000.1, -2.2); + new BasicLikelihoodTestProvider(0, 0); + new BasicLikelihoodTestProvider(-1.1, 0); + new BasicLikelihoodTestProvider(0, -2.2); + new BasicLikelihoodTestProvider(-100.1, -200.2); + + new BasicLikelihoodTestProvider(-1.1, -2.2, 0); + new BasicLikelihoodTestProvider(-2.2, -1.1, 0); + new BasicLikelihoodTestProvider(-1.1, -1.1, 0); + new BasicLikelihoodTestProvider(-9.7, -15.0, 0); + new BasicLikelihoodTestProvider(-1.1, -2000.2, 0); + new BasicLikelihoodTestProvider(-1000.1, -2.2, 0); + new BasicLikelihoodTestProvider(0, 0, 0); + new BasicLikelihoodTestProvider(-1.1, 0, 0); + new BasicLikelihoodTestProvider(0, -2.2, 0); + new BasicLikelihoodTestProvider(-100.1, -200.2, 0); + + new BasicLikelihoodTestProvider(-1.1, -2.2, -12.121); + new BasicLikelihoodTestProvider(-2.2, -1.1, -12.121); + new BasicLikelihoodTestProvider(-1.1, -1.1, -12.121); + new BasicLikelihoodTestProvider(-9.7, -15.0, -12.121); + new BasicLikelihoodTestProvider(-1.1, -2000.2, -12.121); + new BasicLikelihoodTestProvider(-1000.1, -2.2, -12.121); + new BasicLikelihoodTestProvider(0, 0, -12.121); + new BasicLikelihoodTestProvider(-1.1, 0, -12.121); + new BasicLikelihoodTestProvider(0, -2.2, -12.121); + new BasicLikelihoodTestProvider(-100.1, -200.2, -12.121); + + 
return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); + } + + @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) + public void testOneReadWithTwoOrThreeHaplotypes(BasicLikelihoodTestProvider cfg) { + double[][] calculatedMatrix = cfg.calcDiploidHaplotypeMatrix(); + double[][] expectedMatrix = cfg.expectedDiploidHaplotypeMatrix(); + logger.warn(String.format("Test: %s", cfg.toString())); + Assert.assertTrue(compareDoubleArrays(calculatedMatrix, expectedMatrix)); + } + */ + + //Private function to compare 2d arrays + private boolean compareDoubleArrays(double[][] b1, double[][] b2) { + if( b1.length != b2.length ) { + return false; // sanity check + } + + for( int i=0; i < b1.length; i++ ){ + if( b1[i].length != b2[i].length) { + return false; // sanity check + } + for( int j=0; j < b1.length; j++ ){ + if ( MathUtils.compareDoubles(b1[i][j], b2[i][j]) != 0 && !Double.isInfinite(b1[i][j]) && !Double.isInfinite(b2[i][j])) + return false; + } + } + return true; + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrectorUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrectorUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrectorUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrectorUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java 
rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java new file mode 100644 index 000000000..309fd2549 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java @@ -0,0 +1,400 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class ReferenceConfidenceModelUnitTest extends BaseTest { + GenomeLocParser parser; + final String RGID = "ID1"; + GATKSAMReadGroupRecord rg; + final String sample = "NA12878"; + final Set samples = Collections.singleton(sample); + SAMFileHeader header; + ReferenceConfidenceModel model; + + @BeforeClass + public void setUp() throws Exception { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + rg = new GATKSAMReadGroupRecord(RGID); + rg.setSample(sample); + header.addReadGroup(rg); + parser = new GenomeLocParser(header.getSequenceDictionary()); + } + + @BeforeMethod + public void setupModel() { + model = new ReferenceConfidenceModel(parser, samples, header, 10); + } + + @DataProvider(name = "CalcNIndelInformativeReadsData") + public Object[][] makeMyDataProvider() { + List tests = new ArrayList<>(); + + { // very basic testing + final String ref = "ACGT"; + final String read = "ACGT"; + 
tests.add(new Object[]{read, ref, 1, Arrays.asList(1, 1, 1, 0)}); + tests.add(new Object[]{read, ref, 2, Arrays.asList(1, 1, 0, 0)}); + tests.add(new Object[]{read, ref, 3, Arrays.asList(1, 0, 0, 0)}); + tests.add(new Object[]{read, ref, 4, Arrays.asList(0, 0, 0, 0)}); + } + + { // actually interesting case where some sites aren't informative + final String ref = "NNAAAANN"; + final String read1 = "NNA"; + final String read2 = "NNAA"; + final String read3 = "NNAAA"; + final String read4 = "NNAAAA"; + final String read5 = "NNAAAAN"; + tests.add(new Object[]{read1, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); + tests.add(new Object[]{read2, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); + tests.add(new Object[]{read3, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); + tests.add(new Object[]{read4, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); + tests.add(new Object[]{read5, ref, 1, Arrays.asList(1, 1, 1, 1, 1, 1, 0, 0)}); + } + + { + for ( final String repeatUnit : Arrays.asList("A", "CA", "TAG", "TAGC", "TCAGA")) { + final String anchor = Utils.dupString("N", repeatUnit.length()); + for ( int nUnits = 1; nUnits < 10; nUnits++ ) { + final String repeat = Utils.dupString(repeatUnit, nUnits); + final String ref = anchor + repeat + anchor; + for ( int readLen = repeatUnit.length(); readLen < repeat.length(); readLen++ ) { + final String read = anchor + repeat.substring(0, readLen); + final List expected = new LinkedList<>(); + for ( int i = 0; i < anchor.length(); i++ ) expected.add(1); + for ( int i = 0; i < repeat.length(); i++ ) expected.add(readLen == repeat.length() ? 
1 : 0); + for ( int i = 0; i < anchor.length(); i++ ) expected.add(0); + tests.add(new Object[]{read, ref, repeatUnit.length(), expected}); + + final List result = new ArrayList<>(Collections.nCopies(ref.length() - anchor.length(), 1)); + result.addAll(Collections.nCopies(anchor.length(), 0)); + tests.add(new Object[]{ref, ref, repeatUnit.length(), result}); + } + } + + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "CalcNIndelInformativeReadsData") + public void testCalcNIndelInformativeReads(final String readBases, final String ref, final int maxIndelSize, final List expected ) { + final byte qual = (byte)30; + final byte[] quals = Utils.dupBytes(qual, readBases.length()); + + for ( int i = 0; i < readBases.getBytes().length; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(), quals, readBases.length() + "M"); + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, i, i); + final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc, Collections.singletonList(read), i); + final int actual = model.calcNIndelInformativeReads(pileup, i, ref.getBytes(), maxIndelSize); + Assert.assertEquals(actual, (int)expected.get(i), "failed at position " + i); + } + } + + @Test + public void testClose() { + model.close(); + } + + @Test + public void testWorstGL() { + final GenotypeLikelihoods gq10 = GenotypeLikelihoods.fromPLField("0,10,100"); + final GenotypeLikelihoods gq20 = GenotypeLikelihoods.fromPLField("0,20,200"); + final GenotypeLikelihoods gq0 = GenotypeLikelihoods.fromPLField("20,0,200"); + + Assert.assertSame(model.getGLwithWorstGQ(gq10, gq20), gq10); + Assert.assertSame(model.getGLwithWorstGQ(gq20, gq10), gq10); + Assert.assertSame(model.getGLwithWorstGQ(gq10, gq0), gq0); + Assert.assertSame(model.getGLwithWorstGQ(gq0, gq10), gq0); + } + + @Test + public void testIndelLikelihoods() { + GenotypeLikelihoods prev = model.getIndelPLs(0); + Assert.assertEquals(prev.getAsPLs(), new int[]{0, 
0, 0}); + Assert.assertEquals(-10 * prev.getLog10GQ(GenotypeType.HOM_REF), 0.0); + + for ( int i = 1; i <= ReferenceConfidenceModel.MAX_N_INDEL_INFORMATIVE_READS; i++ ) { + final GenotypeLikelihoods current = model.getIndelPLs(i); + final double prevGQ = -10 * prev.getLog10GQ(GenotypeType.HOM_REF); + final double currGQ = -10 * current.getLog10GQ(GenotypeType.HOM_REF); + Assert.assertTrue(prevGQ < currGQ, "GQ Failed with prev " + prev + " curr " + current + " at " + i); + Assert.assertTrue(prev.getAsPLs()[1] < current.getAsPLs()[1], "het PL failed with prev " + prev + " curr " + current + " at " + i); + Assert.assertTrue(prev.getAsPLs()[2] < current.getAsPLs()[2], "hom-var PL Failed with prev " + prev + " curr " + current + " at " + i); +// logger.warn("result at " + i + " is " + current); + prev = current; + } + } + + @Test + public void testOverlappingVariantContext() { + final VariantContext vc10 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 10, Arrays.asList("A", "C")); + final VariantContext vc13 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 13, Arrays.asList("A", "C")); + final VariantContext vc12_15 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 12, Arrays.asList("ACAT", "A")); + final VariantContext vc18 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 18, Arrays.asList("A", "ACAT")); + + final List calls = Arrays.asList(vc13, vc12_15, vc18, vc10); + + checkOverlapping(8, calls, null); + checkOverlapping(9, calls, null); + checkOverlapping(10, calls, vc10); + checkOverlapping(11, calls, null); + checkOverlapping(12, calls, vc12_15); + checkOverlapping(13, calls, vc13); + checkOverlapping(14, calls, vc12_15); + checkOverlapping(15, calls, vc12_15); + checkOverlapping(16, calls, null); + checkOverlapping(17, calls, null); + checkOverlapping(18, calls, vc18); + checkOverlapping(19, calls, null); + checkOverlapping(20, calls, null); + } + + private void checkOverlapping(final int pos, Collection calls, final 
VariantContext expected) { + final GenomeLoc loc = parser.createGenomeLoc(parser.getContigs().getSequences().get(0).getSequenceName(), pos, pos); + final VariantContext actual = model.getOverlappingVariantContext(loc, calls); + Assert.assertEquals(actual, expected); + } + + // + // test reference calculation + // + private class RefConfData { + final String ref; + final int extension; + final Haplotype refHap; + final GenomeLoc refLoc, paddedRefLoc; + final ActiveRegion region; + int readCounter = 0; + + private RefConfData(String ref, int extension) { + this.ref = ref; + this.extension = extension; + + refLoc = parser.createGenomeLoc("chr1", getStart(), getEnd()); + paddedRefLoc = parser.createGenomeLoc("chr1", getStart() - extension, getEnd() + extension); + region = new ActiveRegion(getRefLoc(), parser, extension); + final String pad = Utils.dupString("N", extension); + refHap = ReferenceConfidenceModel.createReferenceHaplotype(getActiveRegion(), (pad + ref + pad).getBytes(), getPaddedRefLoc()); + } + + public GenomeLoc getRefLoc() { return refLoc; } + public GenomeLoc getPaddedRefLoc() { return paddedRefLoc; } + public ActiveRegion getActiveRegion() { return region; } + public Haplotype getRefHap() { return refHap; } + public int getStart() { return 100; } + public int getEnd() { return getStart() + getRefLength() - 1; } + public byte[] getRefBases() { return ref.getBytes(); } + public int getRefLength() { return ref.length(); } + + public GATKSAMRecord makeRead(final int start, final int length) { + final byte[] quals = Utils.dupBytes((byte)30, length); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read " + readCounter++, 0, start + getStart(), ref.substring(start, start + length).getBytes(), quals, length + "M"); + read.setReadGroup(rg); + return read; + } + } + + + @DataProvider(name = "RefConfidenceData") + public Object[][] makeRefConfidenceData() { + List tests = new ArrayList<>(); + + for ( int i = 0; i < 10; i++ ) { + for 
( final int extension : Arrays.asList(0, 10) ) { + tests.add(new Object[]{i, extension}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "RefConfidenceData") + public void testRefConfidenceBasic(final int nReads, final int extension) { + final RefConfData data = new RefConfData("ACGTAACCGGTT", extension); + final List haplotypes = Arrays.asList(data.getRefHap()); + final List calls = Collections.emptyList(); + + for ( int i = 0; i < nReads; i++ ) { + data.getActiveRegion().add(data.makeRead(0, data.getRefLength())); + } + + final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); + + final List expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads); + final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); + checkReferenceModelResult(data, contexts, expectedDPs, calls); + } + + @Test + public void testRefConfidencePartialReads() { + final String ref = "ACGTAACCGGTT"; + for ( int readLen = 3; readLen < ref.length(); readLen++ ) { + for ( int start = 0; start < ref.length() - readLen; start++ ) { + final RefConfData data = new RefConfData(ref, 0); + final List haplotypes = Arrays.asList(data.getRefHap()); + final List calls = Collections.emptyList(); + + data.getActiveRegion().add(data.makeRead(start, readLen)); + final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); + + final List expectedDPs = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), 0)); + for ( int i = start; i < readLen + start; i++ ) expectedDPs.set(i, 1); + final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); + checkReferenceModelResult(data, contexts, 
expectedDPs, calls); + } + } + } + + @Test + public void testRefConfidenceWithCalls() { + final RefConfData xxxdata = new RefConfData("ACGTAACCGGTT", 0); + final int start = xxxdata.getStart(); + final int stop = xxxdata.getEnd(); + + for ( int nReads = 0; nReads < 2; nReads++ ) { + + final VariantContext vcStart = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start, Arrays.asList("A", "C")); + final VariantContext vcEnd = GATKVariantContextUtils.makeFromAlleles("test", "chr1", stop, Arrays.asList("A", "C")); + final VariantContext vcMiddle = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 2, Arrays.asList("A", "C")); + final VariantContext vcDel = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 4, Arrays.asList("ACG", "A")); + final VariantContext vcIns = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 8, Arrays.asList("A", "ACG")); + + final List allCalls = Arrays.asList(vcStart, vcEnd, vcMiddle, vcDel, vcIns); + + for ( int n = 1; n <= allCalls.size(); n++ ) { + for ( final List calls : Utils.makePermutations(allCalls, n, false) ) { +// logger.warn("Executing " + n + " " + calls.size()); + final RefConfData data = new RefConfData("ACGTAACCGGTT", 0); + final List haplotypes = Arrays.asList(data.getRefHap()); + for ( int i = 0; i < nReads; i++ ) { + data.getActiveRegion().add(data.makeRead(0, data.getRefLength())); + } + + final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); + + final List expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads); + final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); + checkReferenceModelResult(data, contexts, expectedDPs, calls); + } + } + } + } + + private void checkReferenceModelResult(final RefConfData data, final List contexts, final List expectedDPs, final 
List calls) { + Assert.assertNotNull(contexts); + + final GenomeLoc loc = data.getActiveRegion().getExtendedLoc(); + final List seenBP = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), false)); + + for ( int i = 0; i < loc.size(); i++ ) { + final GenomeLoc curPos = parser.createGenomeLoc(loc.getContig(), loc.getStart() + i); + final VariantContext call = model.getOverlappingVariantContext(curPos, calls); + final VariantContext refModel = model.getOverlappingVariantContext(curPos, contexts); + + if ( ! data.getActiveRegion().getLocation().containsP(curPos) ) { + // part of the extended interval, but not the full interval + Assert.assertNull(refModel); + continue; + } + + if ( call != null ) { + Assert.assertEquals(refModel, call, "Should have found call " + call + " but found " + refModel + " instead"); + } else { + final int expectedDP = expectedDPs.get(curPos.getStart() - data.getActiveRegion().getLocation().getStart()); + Assert.assertEquals(refModel.getStart(), loc.getStart() + i); + Assert.assertEquals(refModel.getEnd(), loc.getStart() + i); + Assert.assertFalse(refModel.hasLog10PError()); + Assert.assertEquals(refModel.getAlternateAlleles().size(), 1); + Assert.assertEquals(refModel.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + Assert.assertTrue(refModel.hasGenotype(sample)); + + final Genotype g = refModel.getGenotype(sample); + Assert.assertTrue(g.hasAD()); + Assert.assertTrue(g.hasDP()); + Assert.assertEquals(g.getDP(), expectedDP); + Assert.assertTrue(g.hasGQ()); + Assert.assertTrue(g.hasPL()); + } + + final VariantContext vc = call == null ? 
refModel : call; + if ( curPos.getStart() == vc.getStart() ) { + for ( int pos = vc.getStart(); pos <= vc.getEnd(); pos++ ) { + final int j = pos - data.getActiveRegion().getLocation().getStart(); + Assert.assertFalse(seenBP.get(j)); + seenBP.set(j, true); + } + } + } + + for ( int i = 0; i < seenBP.size(); i++ ) { + Assert.assertEquals((boolean)seenBP.get(i), true); + } + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java new file mode 100644 index 000000000..44512824a --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java @@ -0,0 +1,295 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +public class BaseGraphUnitTest extends BaseTest { + SeqGraph graph; + SeqVertex v1, v2, v3, v4, v5; + + @BeforeMethod + public void setUp() throws Exception { + graph = new SeqGraph(11); + + v1 = new SeqVertex("A"); + v2 = new SeqVertex("C"); + v3 = new SeqVertex("C"); + v4 = new SeqVertex("C"); + v5 = new SeqVertex("C"); + + graph.addVertices(v1, v2, v3, v4, v5); + graph.addEdge(v1, v2); + graph.addEdge(v2, v4); + graph.addEdge(v3, v2); + graph.addEdge(v2, v3); + graph.addEdge(v4, v5); + } + + @Test + public void testIncomingAndOutgoingVertices() throws Exception { + assertVertexSetEquals(graph.outgoingVerticesOf(v1), v2); + assertVertexSetEquals(graph.incomingVerticesOf(v1)); + + assertVertexSetEquals(graph.outgoingVerticesOf(v2), v3, v4); + assertVertexSetEquals(graph.incomingVerticesOf(v2), v1, v3); + + assertVertexSetEquals(graph.outgoingVerticesOf(v3), v2); + assertVertexSetEquals(graph.incomingVerticesOf(v3), v2); + + assertVertexSetEquals(graph.outgoingVerticesOf(v4), v5); + 
assertVertexSetEquals(graph.incomingVerticesOf(v4), v2); + + assertVertexSetEquals(graph.outgoingVerticesOf(v5)); + assertVertexSetEquals(graph.incomingVerticesOf(v5), v4); + } + + @Test + public void testRemoveSingletonOrphanVertices() throws Exception { + // all vertices in graph are connected + final List kept = new LinkedList(graph.vertexSet()); + final SeqVertex rm1 = new SeqVertex("CAGT"); + final SeqVertex rm2 = new SeqVertex("AGTC"); + graph.addVertices(rm1, rm2); + Assert.assertEquals(graph.vertexSet().size(), kept.size() + 2); + final BaseEdge rm12e = new BaseEdge(false, 1); + graph.addEdge(rm1, rm2, rm12e); + + final SeqGraph original = (SeqGraph)graph.clone(); + graph.removeSingletonOrphanVertices(); + Assert.assertTrue(BaseGraph.graphEquals(original, graph), "Graph with disconnected component but edges between components shouldn't be modified"); + + graph.removeEdge(rm12e); // now we should be able to remove rm1 and rm2 + graph.removeSingletonOrphanVertices(); + Assert.assertTrue(graph.vertexSet().containsAll(kept)); + Assert.assertFalse(graph.containsVertex(rm1)); + Assert.assertFalse(graph.containsVertex(rm2)); + } + + @Test + public void testRemoveSingletonOrphanVerticesOnSingleRefNode() throws Exception { + final SeqGraph original = new SeqGraph(11); + original.addVertex(v1); + original.removeSingletonOrphanVertices(); + Assert.assertTrue(original.containsVertex(v1)); + Assert.assertEquals(original.vertexSet().size(), 1); + } + + @Test + public void testIsRefSourceAndSink() throws Exception { + + final SeqGraph g = new SeqGraph(11); + g.addVertex(v1); + Assert.assertTrue(g.isRefSource(v1)); + Assert.assertTrue(g.isRefSink(v1)); + Assert.assertTrue(g.isReferenceNode(v1)); + + g.addVertices(v2, v3, v4, v5); + g.addEdge(v1, v2); + g.addEdge(v2, v3); + final BaseEdge refEdge = new BaseEdge(true, 1); + g.addEdge(v3, v4, refEdge); + g.addEdge(v4, v5); + + Assert.assertFalse(g.isRefSource(v1)); + Assert.assertFalse(g.isRefSink(v1)); + 
Assert.assertFalse(g.isReferenceNode(v1)); + + Assert.assertFalse(g.isRefSource(v2)); + Assert.assertFalse(g.isRefSink(v2)); + Assert.assertFalse(g.isReferenceNode(v2)); + + Assert.assertTrue(g.isRefSource(v3)); + Assert.assertFalse(g.isRefSink(v3)); + Assert.assertTrue(g.isReferenceNode(v3)); + + Assert.assertFalse(g.isRefSource(v4)); + Assert.assertTrue(g.isRefSink(v4)); + Assert.assertTrue(g.isReferenceNode(v4)); + + Assert.assertFalse(g.isRefSource(v5)); + Assert.assertFalse(g.isRefSink(v5)); + Assert.assertFalse(g.isReferenceNode(v5)); + } + + @Test + public void testRemovePathsNotConnectedToRef() throws Exception { + final SeqGraph graph = new SeqGraph(11); + + SeqVertex src = new SeqVertex("A"); + SeqVertex end = new SeqVertex("A"); + SeqVertex g1 = new SeqVertex("C"); + SeqVertex g2 = new SeqVertex("G"); + SeqVertex g3 = new SeqVertex("T"); + SeqVertex g4 = new SeqVertex("AA"); + SeqVertex g5 = new SeqVertex("AA"); + SeqVertex g6 = new SeqVertex("AA"); + SeqVertex g8 = new SeqVertex("AA"); + SeqVertex g7 = new SeqVertex("AA"); + SeqVertex b1 = new SeqVertex("CC"); + SeqVertex b2 = new SeqVertex("GG"); + SeqVertex b3 = new SeqVertex("TT"); + SeqVertex b4 = new SeqVertex("AAA"); + SeqVertex b5 = new SeqVertex("CCC"); + SeqVertex b6 = new SeqVertex("GGG"); + SeqVertex b7 = new SeqVertex("AAAA"); + SeqVertex b8 = new SeqVertex("GGGG"); + SeqVertex b9 = new SeqVertex("CCCC"); + + graph.addVertices(src, end, g1, g2, g3, g4, g5, g6, g7, g8); + graph.addEdges(new BaseEdge(true, 1), src, g1, g2, g4, end); + graph.addEdges(src, g1, g5, g6, g7, end); + graph.addEdges(src, g1, g5, g8, g7, end); + graph.addEdges(src, g1, g3, end); + + // the current state of the graph is the good one + final SeqGraph good = (SeqGraph)graph.clone(); + + // now add the bads to the graph + graph.addVertices(b1, b2, b3, b4, b5, b6, b7, b8, b9); + graph.addEdges(src, b1); // source -> b1 is dead + graph.addEdges(b6, src); // x -> source is bad + graph.addEdges(g4, b2); // off random vertex 
is bad + graph.addEdges(g3, b3, b4); // two vertices that don't connect to end are bad + graph.addEdges(end, b5); // vertex off end is bad + graph.addEdges(g3, b7, b8, b7); // cycle is bad + graph.addEdges(g3, b9, b9); // self-cycle is bad + + final boolean debug = false; + if ( debug ) good.printGraph(new File("expected.dot"), 0); + if ( debug ) graph.printGraph(new File("bad.dot"), 0); + graph.removePathsNotConnectedToRef(); + if ( debug ) graph.printGraph(new File("actual.dot"), 0); + + Assert.assertTrue(BaseGraph.graphEquals(graph, good), "Failed to remove exactly the bad nodes"); + } + + @Test + public void testRemoveVerticesNotConnectedToRefRegardlessOfEdgeDirection() throws Exception { + final SeqGraph graph = new SeqGraph(11); + + SeqVertex src = new SeqVertex("A"); + SeqVertex end = new SeqVertex("A"); + SeqVertex g1 = new SeqVertex("C"); + SeqVertex g2 = new SeqVertex("G"); + SeqVertex g3 = new SeqVertex("T"); + SeqVertex g4 = new SeqVertex("AA"); + SeqVertex g5 = new SeqVertex("AA"); + SeqVertex g6 = new SeqVertex("AA"); + SeqVertex g8 = new SeqVertex("AA"); + SeqVertex g7 = new SeqVertex("AA"); + SeqVertex gPrev = new SeqVertex("AA"); + SeqVertex gPrev1 = new SeqVertex("AA"); + SeqVertex gPrev2 = new SeqVertex("AA"); + SeqVertex gAfter = new SeqVertex("AA"); + SeqVertex gAfter1 = new SeqVertex("AA"); + SeqVertex gAfter2 = new SeqVertex("AA"); + SeqVertex b1 = new SeqVertex("CC"); + SeqVertex b2 = new SeqVertex("GG"); + SeqVertex b3 = new SeqVertex("TT"); + SeqVertex b4 = new SeqVertex("AAA"); + SeqVertex b5 = new SeqVertex("CCC"); + SeqVertex b6 = new SeqVertex("GGG"); + + graph.addVertices(src, end, g1, g2, g3, g4, g5, g6, g7, g8, gPrev, gPrev1, gPrev2, gAfter, gAfter1, gAfter2); + graph.addEdges(new BaseEdge(true, 1), src, g1, g2, g4, end); + graph.addEdges(src, g1, g5, g6, g7, end); + graph.addEdges(src, g1, g5, g8, g7, end); + graph.addEdges(src, g1, g3, end); + + // these should be kept, but are in the wrong direction + graph.addEdges(gPrev, src); 
+ graph.addEdges(gPrev1, gPrev2, src); + graph.addEdges(end, gAfter); + graph.addEdges(end, gAfter1, gAfter2); + + // the current state of the graph is the good one + final SeqGraph good = (SeqGraph)graph.clone(); + + // now add the bads to the graph + graph.addVertices(b1, b2, b3, b4, b5, b6); + graph.addEdges(b2, b3); // b2 -> b3 + graph.addEdges(b4, b5, b4); // cycle + graph.addEdges(b6, b6); // isolated self cycle + + final boolean debug = false; + if ( debug ) good.printGraph(new File("expected.dot"), 0); + if ( debug ) graph.printGraph(new File("bad.dot"), 0); + graph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); + if ( debug ) graph.printGraph(new File("actual.dot"), 0); + + Assert.assertTrue(BaseGraph.graphEquals(graph, good), "Failed to remove exactly the bad nodes"); + } + + @Test + public void testPrintEmptyGraph() throws Exception { + final File tmp = File.createTempFile("tmp", "dot"); + tmp.deleteOnExit(); + new SeqGraph(11).printGraph(tmp, 10); + new TestGraph().printGraph(tmp, 10); + } + + @Test + public void testComplexGraph() throws Exception { + final File tmp = File.createTempFile("tmp", "dot"); + tmp.deleteOnExit(); + graph.printGraph(tmp, 10); + } + + private void assertVertexSetEquals(final Collection actual, final SeqVertex ... expected) { + final Set actualSet = new HashSet(actual); + Assert.assertEquals(actualSet.size(), actual.size(), "Duplicate elements found in vertex list"); + final Set expectedSet = expected == null ? 
Collections.emptySet() : new HashSet(Arrays.asList(expected)); + Assert.assertEquals(actualSet, expectedSet); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java new file mode 100644 index 000000000..0ddf7544d --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java @@ -0,0 +1,186 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +public class CommonSuffixMergerUnitTest extends BaseTest { + private final static boolean PRINT_GRAPHS = true; + + @DataProvider(name = "CompleteCycleData") + public Object[][] makeCompleteCycleData() { + return makeSplitMergeData(-1); + } + + public static class SplitMergeData { + final SeqGraph graph; + final SeqVertex v; + final String commonSuffix; + + public SplitMergeData(SeqGraph graph, SeqVertex v, String commonSuffix) { + this.graph = graph; + this.v = v; + this.commonSuffix = commonSuffix; + } + + @Override + public String toString() { + return "SplitMergeData{" + + "graph=" + graph + + ", v=" + v + + ", commonSuffix='" + commonSuffix + '\'' + + '}'; + } + } + + public static Object[][] makeSplitMergeData(final int maxTests) { + List tests = new ArrayList(); + + final List bases = Arrays.asList("A", "C", "G", "T"); + for ( final String commonSuffix : Arrays.asList("", "A", "AT") ) { + for ( final int nBots : Arrays.asList(0, 1, 2) ) { + for ( final int nMids : Arrays.asList(1, 2, 3) ) { + for ( int nTops = 0; nTops < nMids; nTops++ ) { + for ( int nTopConnections = 1; nTopConnections <= nMids; nTopConnections++ ) { + int multi = 1; + final SeqGraph graph = new SeqGraph(11); + final SeqVertex v = new SeqVertex("GGGG"); + graph.addVertex(v); + + final LinkedList tops = new LinkedList(); + final LinkedList mids = new LinkedList(); + + for ( int i = 0; i < nMids; i++) { + final SeqVertex mid = new SeqVertex(bases.get(i) + commonSuffix); + graph.addVertex(mid); + graph.addEdge(mid, v, new BaseEdge(i == 0, multi++)); + mids.add(mid); + + tops.add(new SeqVertex(bases.get(i))); + } + + graph.addVertices(tops); + for ( final SeqVertex t : tops ) { + for ( int i = 0; i < nTopConnections; i++ 
) { + graph.addEdge(t, mids.get(i), new BaseEdge(i == 0, multi++)); + } + } + + for ( int i = 0; i < nBots; i++ ) { + final SeqVertex bot = new SeqVertex(bases.get(i)); + graph.addVertex(bot); + graph.addEdge(v, bot, new BaseEdge(i == 0, multi++)); + + } + + tests.add(new Object[]{new SplitMergeData(graph, v, commonSuffix)}); + } + } + } + } + } + + final List toUse = maxTests == -1 ? tests : tests.subList(0, Math.min(tests.size(), maxTests)); + return toUse.toArray(new Object[][]{}); + } + + public static void assertSameHaplotypes(final String name, final SeqGraph actual, final SeqGraph original) { + try { + final Set haplotypes = new HashSet(); + final List originalKBestHaplotypes = new KBestHaplotypeFinder(original,original.getSources(),original.getSinks()); + final List actualKBestHaplotypes = new KBestHaplotypeFinder(actual,actual.getSources(),actual.getSinks()); + + for (final KBestHaplotype kbh : originalKBestHaplotypes) + haplotypes.add(new String(kbh.bases())); + + for ( final KBestHaplotype kbh : actualKBestHaplotypes ) { + final String h = new String(kbh.bases()); + Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h); + } + + if ( actualKBestHaplotypes.size() == originalKBestHaplotypes.size() ) { + for ( int i = 0; i < originalKBestHaplotypes.size(); i++ ) { + Assert.assertTrue(actualKBestHaplotypes.get(i).haplotype().getBaseString().equals(originalKBestHaplotypes.get(i).haplotype().getBaseString()), "Paths not equal " + actualKBestHaplotypes.get(i).haplotype() + " vs. 
original " + originalKBestHaplotypes.get(i).haplotype()); + } + } + } catch ( AssertionError e ) { + if ( PRINT_GRAPHS ) original.printGraph(new File(String.format("%s.original.dot", name, actual.vertexSet().size())), 0); + if ( PRINT_GRAPHS ) actual.printGraph(new File(String.format("%s.actual.dot", name, actual.vertexSet().size())), 0); + throw e; + } + } + + @Test(dataProvider = "CompleteCycleData") + public void testMerging(final SplitMergeData data) { + final SeqGraph original = (SeqGraph)data.graph.clone(); + final SharedSequenceMerger splitter = new SharedSequenceMerger(); + splitter.merge(data.graph, data.v); + assertSameHaplotypes(String.format("suffixMerge.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original); + } + + @Test + public void testDoesntMergeSourceNodes() { + final SeqGraph g = new SeqGraph(11); + final SeqVertex v1 = new SeqVertex("A"); + final SeqVertex v2 = new SeqVertex("A"); + final SeqVertex v3 = new SeqVertex("A"); + final SeqVertex top = new SeqVertex("T"); + final SeqVertex b = new SeqVertex("C"); + g.addVertices(top, v1, v2, v3, top, b); + g.addEdges(top, v1, b); + g.addEdges(v2, b); // v2 doesn't have previous node, cannot be merged + g.addEdges(top, v3, b); + final SharedSequenceMerger merger = new SharedSequenceMerger(); + Assert.assertFalse(merger.merge(g, b), "Shouldn't be able to merge shared vertices, when one is a source"); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtilsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtilsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtilsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtilsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/HaplotypeGraphUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/HaplotypeGraphUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/HaplotypeGraphUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/HaplotypeGraphUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinderUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinderUnitTest.java new file mode 100644 index 000000000..6dc3d5d67 --- /dev/null +++ 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinderUnitTest.java @@ -0,0 +1,518 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.TextCigarCodec; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.CigarUtils; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Created with IntelliJ IDEA. 
+ * User: rpoplin + * Date: 1/31/13 + */ + +public class KBestHaplotypeFinderUnitTest extends BaseTest { + + @DataProvider(name = "BasicPathFindingData") + public Object[][] makeBasicPathFindingData() { + final List tests = new ArrayList<>(); + for ( final int nStartNodes : Arrays.asList(1, 2, 3) ) { + for ( final int nBranchesPerBubble : Arrays.asList(2, 3) ) { + for ( final int nEndNodes : Arrays.asList(1, 2, 3) ) { + tests.add(new Object[]{nStartNodes, nBranchesPerBubble, nEndNodes}); + } + } + } + return tests.toArray(new Object[][]{}); + } + + private static int weight = 1; + final Set createVertices(final SeqGraph graph, final int n, final SeqVertex source, final SeqVertex target) { + final List seqs = Arrays.asList("A", "C", "G", "T"); + final Set vertices = new LinkedHashSet<>(); + for ( int i = 0; i < n; i++ ) { + final SeqVertex v = new SeqVertex(seqs.get(i)); + graph.addVertex(v); + vertices.add(v); + if ( source != null ) graph.addEdge(source, v, new BaseEdge(false, weight++)); + if ( target != null ) graph.addEdge(v, target, new BaseEdge(false, weight++)); + } + return vertices; + } + + @Test(dataProvider = "BasicPathFindingData") + public void testBasicPathFinding(final int nStartNodes, final int nBranchesPerBubble, final int nEndNodes) { + final SeqGraph graph = new SeqGraph(11); + + final SeqVertex middleTop = new SeqVertex("GTAC"); + final SeqVertex middleBottom = new SeqVertex("ACTG"); + graph.addVertices(middleTop, middleBottom); + final Set starts = createVertices(graph, nStartNodes, null, middleTop); + @SuppressWarnings("unused") + final Set bubbles = createVertices(graph, nBranchesPerBubble, middleTop, middleBottom); + final Set ends = createVertices(graph, nEndNodes, middleBottom, null); + + // enumerate all possible paths + final List paths = new KBestHaplotypeFinder(graph, starts, ends); + + final int expectedNumOfPaths = nStartNodes * nBranchesPerBubble * nEndNodes; + Assert.assertEquals(paths.size(), expectedNumOfPaths, "Didn't find the 
expected number of paths"); + + int lastScore = Integer.MAX_VALUE; + for ( final KBestHaplotype kbh : paths ) { + final Path path = kbh.path(); + Assert.assertTrue(path.getScore() <= lastScore, "Paths out of order. Path " + path + " has score above previous " + lastScore); + lastScore = path.getScore(); + } + + // get the best path, and make sure it's the same as our optimal path overall + final Path best = paths.get(0).path(); + final List justOne = new KBestHaplotypeFinder(graph,starts, ends).subList(0,1); + Assert.assertEquals(justOne.size(), 1); + + Assert.assertTrue(justOne.get(0).path().pathsAreTheSame(best), "Best path from complete enumerate " + best + " not the same as from k = 1 search " + justOne.get(0)); + } + + @DataProvider(name = "BasicBubbleDataProvider") + public Object[][] makeBasicBubbleDataProvider() { + final List tests = new ArrayList<>(); + for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) { + for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) { + tests.add(new Object[]{refBubbleLength, altBubbleLength}); + } + } + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "BasicBubbleDataProvider") + public void testBasicBubbleData(final int refBubbleLength, final int altBubbleLength) { + // Construct the assembly graph + SeqGraph graph = new SeqGraph(3); + final String preRef = "ATGG"; + final String postRef = "GGGGC"; + + SeqVertex v = new SeqVertex(preRef); + SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength)); + SeqVertex v2Alt = new SeqVertex(Utils.dupString('A', altBubbleLength-1) + "T"); + SeqVertex v3 = new SeqVertex(postRef); + + graph.addVertex(v); + graph.addVertex(v2Ref); + graph.addVertex(v2Alt); + graph.addVertex(v3); + graph.addEdge(v, v2Ref, new BaseEdge(true, 10)); + graph.addEdge(v2Ref, v3, new BaseEdge(true, 10)); + graph.addEdge(v, v2Alt, new BaseEdge(false, 5)); + graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); + + // Construct the test path + Path path = new Path<>(v, 
graph); + path = new Path<>(path, graph.getEdge(v, v2Alt)); + path = new Path<>(path, graph.getEdge(v2Alt, v3)); + + // Construct the actual cigar string implied by the test path + Cigar expectedCigar = new Cigar(); + expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M)); + if( refBubbleLength > altBubbleLength ) { + expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); + expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M)); + } else if ( refBubbleLength < altBubbleLength ) { + expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); + expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I)); + } else { + expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); + } + expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M)); + + final String ref = preRef + v2Ref.getSequenceString() + postRef; + Assert.assertEquals(path.calculateCigar(ref.getBytes()).toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch"); + } + + @DataProvider(name = "GetBasesData") + public Object[][] makeGetBasesData() { + List tests = new ArrayList<>(); + + final List frags = Arrays.asList("ACT", "GAC", "CAT"); + + for ( int n = 1; n <= frags.size(); n++ ) { + for ( final List comb : Utils.makePermutations(frags, n, false) ) { + tests.add(new Object[]{comb}); + } + } + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "GetBasesData") + public void testGetBases(final List frags) { + // Construct the assembly graph + SeqGraph graph = new SeqGraph(3); + + SeqVertex prev = null; + for (final String s : frags) { + SeqVertex v = new SeqVertex(s); + graph.addVertex(v); + if ( prev != null ) + graph.addEdge(prev, v); + prev = v; + } + + // enumerate all possible paths + final List paths = new KBestHaplotypeFinder(graph,graph.getSources(),graph.getSinks()); + Assert.assertEquals(paths.size(), 1); + final Path 
path = paths.get(0).path(); + Assert.assertEquals(new String(path.getBases()), Utils.join("", frags), "Path doesn't have the expected sequence"); + } + + @DataProvider(name = "TripleBubbleDataProvider") + public Object[][] makeTripleBubbleDataProvider() { + final List tests = new ArrayList<>(); + for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) { + for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) { + for ( final boolean offRefEnding : Arrays.asList(true, false) ) { + for ( final boolean offRefBeginning : Arrays.asList(false) ) { + tests.add(new Object[]{refBubbleLength, altBubbleLength, offRefBeginning, offRefEnding}); + } + } + } + } + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "TripleBubbleDataProvider") + public void testTripleBubbleData(final int refBubbleLength, final int altBubbleLength, final boolean offRefBeginning, final boolean offRefEnding) { + // Construct the assembly graph + SeqGraph graph = new SeqGraph(11); + final String preAltOption = "ATCGATCGATCGATCGATCG"; + final String postAltOption = "CCCC"; + final String preRef = "ATGG"; + final String postRef = "GGCCG"; + final String midRef1 = "TTCCT"; + final String midRef2 = "CCCAAAAAAAAAAAA"; + + SeqVertex preV = new SeqVertex(preAltOption); + SeqVertex v = new SeqVertex(preRef); + SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength)); + SeqVertex v2Alt = new SeqVertex(Utils.dupString('A', altBubbleLength-1) + "T"); + SeqVertex v4Ref = new SeqVertex(Utils.dupString('C', refBubbleLength)); + SeqVertex v4Alt = new SeqVertex(Utils.dupString('C', altBubbleLength-1) + "T"); + SeqVertex v6Ref = new SeqVertex(Utils.dupString('G', refBubbleLength)); + SeqVertex v6Alt = new SeqVertex(Utils.dupString('G', altBubbleLength-1) + "T"); + SeqVertex v3 = new SeqVertex(midRef1); + SeqVertex v5 = new SeqVertex(midRef2); + SeqVertex v7 = new SeqVertex(postRef); + SeqVertex postV = new SeqVertex(postAltOption); + + final String ref = preRef + 
v2Ref.getSequenceString() + midRef1 + v4Ref.getSequenceString() + midRef2 + v6Ref.getSequenceString() + postRef; + + graph.addVertex(preV); + graph.addVertex(v); + graph.addVertex(v2Ref); + graph.addVertex(v2Alt); + graph.addVertex(v3); + graph.addVertex(v4Ref); + graph.addVertex(v4Alt); + graph.addVertex(v5); + graph.addVertex(v6Ref); + graph.addVertex(v6Alt); + graph.addVertex(v7); + graph.addVertex(postV); + graph.addEdge(preV, v, new BaseEdge(false, 1)); + graph.addEdge(v, v2Ref, new BaseEdge(true, 10)); + graph.addEdge(v2Ref, v3, new BaseEdge(true, 10)); + graph.addEdge(v, v2Alt, new BaseEdge(false, 5)); + graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); + graph.addEdge(v3, v4Ref, new BaseEdge(true, 10)); + graph.addEdge(v4Ref, v5, new BaseEdge(true, 10)); + graph.addEdge(v3, v4Alt, new BaseEdge(false, 5)); + graph.addEdge(v4Alt, v5, new BaseEdge(false, 5)); + graph.addEdge(v5, v6Ref, new BaseEdge(true, 11)); + graph.addEdge(v6Ref, v7, new BaseEdge(true, 11)); + graph.addEdge(v5, v6Alt, new BaseEdge(false, 55)); + graph.addEdge(v6Alt, v7, new BaseEdge(false, 55)); + graph.addEdge(v7, postV, new BaseEdge(false, 1)); + + // Construct the test path + Path path = new Path<>( (offRefBeginning ? 
preV : v), graph); + if( offRefBeginning ) + path = new Path<>(path, graph.getEdge(preV, v)); + path = new Path<>(path, graph.getEdge(v, v2Alt)); + path = new Path<>(path, graph.getEdge(v2Alt, v3)); + path = new Path<>(path, graph.getEdge(v3, v4Ref)); + path = new Path<>(path, graph.getEdge(v4Ref, v5)); + path = new Path<>(path, graph.getEdge(v5, v6Alt)); + path = new Path<>(path, graph.getEdge(v6Alt, v7)); + if( offRefEnding ) + path = new Path<>(path, graph.getEdge(v7,postV)); + + // Construct the actual cigar string implied by the test path + Cigar expectedCigar = new Cigar(); + if( offRefBeginning ) { + expectedCigar.add(new CigarElement(preAltOption.length(), CigarOperator.I)); + } + expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M)); + // first bubble + if( refBubbleLength > altBubbleLength ) { + expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); + expectedCigar.add(new CigarElement(altBubbleLength,CigarOperator.M)); + } else if ( refBubbleLength < altBubbleLength ) { + expectedCigar.add(new CigarElement(refBubbleLength,CigarOperator.M)); + expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I)); + } else { + expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); + } + expectedCigar.add(new CigarElement(midRef1.length(), CigarOperator.M)); + // second bubble is ref path + expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); + expectedCigar.add(new CigarElement(midRef2.length(), CigarOperator.M)); + // third bubble + if( refBubbleLength > altBubbleLength ) { + expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); + expectedCigar.add(new CigarElement(altBubbleLength,CigarOperator.M)); + } else if ( refBubbleLength < altBubbleLength ) { + expectedCigar.add(new CigarElement(refBubbleLength,CigarOperator.M)); + expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I)); + } else { + 
expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); + } + expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M)); + if( offRefEnding ) { + expectedCigar.add(new CigarElement(postAltOption.length(), CigarOperator.I)); + } + + Assert.assertEquals(path.calculateCigar(ref.getBytes()).toString(), + AlignmentUtils.consolidateCigar(expectedCigar).toString(), + "Cigar string mismatch: ref = " + ref + " alt " + new String(path.getBases())); + } + + @Test + public void testIntraNodeInsertionDeletion() { + // Construct the assembly graph + final SeqGraph graph = new SeqGraph(11); + final SeqVertex top = new SeqVertex("T"); + final SeqVertex bot = new SeqVertex("T"); + final SeqVertex alt = new SeqVertex("AAACCCCC"); + final SeqVertex ref = new SeqVertex("CCCCCGGG"); + + graph.addVertices(top, bot, alt, ref); + graph.addEdges(new BaseEdge(true, 1), top, ref, bot); + graph.addEdges(new BaseEdge(false, 1), top, alt, bot); + + @SuppressWarnings("all") + final KBestHaplotypeFinder bestPathFinder = new KBestHaplotypeFinder(graph,top,bot); + + Assert.assertEquals(bestPathFinder.size(), 2); + + final Path refPath = bestPathFinder.get(0).path(); + final Path altPath = bestPathFinder.get(1).path(); + + final String refString = top.getSequenceString() + ref.getSequenceString() + bot.getSequenceString(); + Assert.assertEquals(refPath.calculateCigar(refString.getBytes()).toString(), "10M"); + Assert.assertEquals(altPath.calculateCigar(refString.getBytes()).toString(), "1M3I5M3D1M"); + } + + @Test + public void testHardSWPath() { + // Construct the assembly graph + final SeqGraph graph = new SeqGraph(11); + final SeqVertex top = new SeqVertex( "NNN" ); + final SeqVertex bot = new SeqVertex( "NNN" ); + final SeqVertex alt = new SeqVertex( "ACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" ); + final SeqVertex ref = new SeqVertex( "TGTGTGTGTGTGTGACAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" ); + graph.addVertices(top, bot, alt, ref); + graph.addEdges(new 
BaseEdge(true, 1), top, ref, bot); + graph.addEdges(new BaseEdge(false, 1), top, alt, bot); + + @SuppressWarnings("all") + final List paths = new KBestHaplotypeFinder(graph, top, bot); + + Assert.assertEquals(paths.size(), 2); + + final Path refPath = paths.get(0).path(); + final Path altPath = paths.get(1).path(); + + final String refString = top.getSequenceString() + ref.getSequenceString() + bot.getSequenceString(); + + logger.warn("RefPath : " + refPath + " cigar " + refPath.calculateCigar(refString.getBytes())); + logger.warn("AltPath : " + altPath + " cigar " + altPath.calculateCigar(refString.getBytes())); + + Assert.assertEquals(refPath.calculateCigar(refString.getBytes()).toString(), "51M"); + Assert.assertEquals(altPath.calculateCigar(refString.getBytes()).toString(), "3M6I48M"); + } + + // ----------------------------------------------------------------- + // + // Systematic tests to ensure that we get the correct SW result for + // a variety of variants in the ref vs alt bubble + // + // ----------------------------------------------------------------- + + @DataProvider(name = "SystematicRefAltSWTestData") + public Object[][] makeSystematicRefAltSWTestData() { + final List tests = new ArrayList<>(); + + final List> allDiffs = Arrays.asList( + Arrays.asList("G", "C", "1M"), + Arrays.asList("G", "", "1D"), + Arrays.asList("", "C", "1I"), + Arrays.asList("AAA", "CGT", "3M"), + Arrays.asList("TAT", "CAC", "3M"), + Arrays.asList("GCTG", "GTCG", "4M"), + Arrays.asList("AAAAA", "", "5D"), + Arrays.asList("", "AAAAA", "5I"), + Arrays.asList("AAAAACC", "CCGGGGGG", "5D2M6I") + ); + + for ( final String prefix : Arrays.asList("", "X", "XXXXXXXXXXXXX")) { + for ( final String end : Arrays.asList("", "X", "XXXXXXXXXXXXX")) { + for ( final List diffs : allDiffs ) + tests.add(new Object[]{prefix, end, diffs.get(0), diffs.get(1), diffs.get(2)}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "SystematicRefAltSWTestData") + public void 
testRefAltSW(final String prefix, final String end, final String refMid, final String altMid, final String midCigar) { + // Construct the assembly graph + SeqGraph graph = new SeqGraph(11); + + final int padSize = 0; + SeqVertex top = new SeqVertex(Utils.dupString("N", padSize)); + SeqVertex ref = new SeqVertex(prefix + refMid + end); + SeqVertex alt = new SeqVertex(prefix + altMid + end); + SeqVertex bot = new SeqVertex(Utils.dupString("N", padSize)); + + graph.addVertices(top, ref, alt, bot); + graph.addEdges(new BaseEdge(true, 1), top, ref, bot); + graph.addEdges(new BaseEdge(false, 1), top, alt, bot); + + // Construct the test path + Path path = Path.makePath(Arrays.asList(top, alt, bot), graph); + + Cigar expected = new Cigar(); + expected.add(new CigarElement(padSize, CigarOperator.M)); + if ( ! prefix.equals("") ) expected.add(new CigarElement(prefix.length(), CigarOperator.M)); + for ( final CigarElement elt : TextCigarCodec.getSingleton().decode(midCigar).getCigarElements() ) expected.add(elt); + if ( ! end.equals("") ) expected.add(new CigarElement(end.length(), CigarOperator.M)); + expected.add(new CigarElement(padSize, CigarOperator.M)); + expected = AlignmentUtils.consolidateCigar(expected); + + final String refString = top.getSequenceString() + ref.getSequenceString() + bot.getSequenceString(); + final Cigar pathCigar = path.calculateCigar(refString.getBytes()); + + logger.warn("diffs: " + ref + " vs. 
" + alt + " cigar " + midCigar); + logger.warn("Path " + path + " with cigar " + pathCigar); + logger.warn("Expected cigar " + expected); + + Assert.assertEquals(pathCigar, expected, "Cigar mismatch: ref = " + refString + " vs alt = " + new String(path.getBases())); + } + + @Test + public void testLeftAlignCigarSequentially() { + String preRefString = "GATCGATCGATC"; + String postRefString = "TTT"; + String refString = "ATCGAGGAGAGCGCCCCG"; + String indelString1 = "X"; + String indelString2 = "YZ"; + int refIndel1 = 10; + int refIndel2 = 12; + + for ( final int indelSize1 : Arrays.asList(1, 2, 3, 4) ) { + for ( final int indelOp1 : Arrays.asList(1, -1) ) { + for ( final int indelSize2 : Arrays.asList(1, 2, 3, 4) ) { + for ( final int indelOp2 : Arrays.asList(1, -1) ) { + + Cigar expectedCigar = new Cigar(); + expectedCigar.add(new CigarElement(refString.length(), CigarOperator.M)); + expectedCigar.add(new CigarElement(indelSize1, (indelOp1 > 0 ? CigarOperator.I : CigarOperator.D))); + expectedCigar.add(new CigarElement((indelOp1 < 0 ? refIndel1 - indelSize1 : refIndel1), CigarOperator.M)); + expectedCigar.add(new CigarElement(refString.length(), CigarOperator.M)); + expectedCigar.add(new CigarElement(indelSize2 * 2, (indelOp2 > 0 ? CigarOperator.I : CigarOperator.D))); + expectedCigar.add(new CigarElement((indelOp2 < 0 ? (refIndel2 - indelSize2) * 2 : refIndel2 * 2), CigarOperator.M)); + expectedCigar.add(new CigarElement(refString.length(), CigarOperator.M)); + + Cigar givenCigar = new Cigar(); + givenCigar.add(new CigarElement(refString.length() + refIndel1/2, CigarOperator.M)); + givenCigar.add(new CigarElement(indelSize1, (indelOp1 > 0 ? CigarOperator.I : CigarOperator.D))); + givenCigar.add(new CigarElement((indelOp1 < 0 ? (refIndel1/2 - indelSize1) : refIndel1/2) + refString.length() + refIndel2/2 * 2, CigarOperator.M)); + givenCigar.add(new CigarElement(indelSize2 * 2, (indelOp2 > 0 ? 
CigarOperator.I : CigarOperator.D))); + givenCigar.add(new CigarElement((indelOp2 < 0 ? (refIndel2/2 - indelSize2) * 2 : refIndel2/2 * 2) + refString.length(), CigarOperator.M)); + + String theRef = preRefString + refString + Utils.dupString(indelString1, refIndel1) + refString + Utils.dupString(indelString2, refIndel2) + refString + postRefString; + String theRead = refString + Utils.dupString(indelString1, refIndel1 + indelOp1 * indelSize1) + refString + Utils.dupString(indelString2, refIndel2 + indelOp2 * indelSize2) + refString; + + Cigar calculatedCigar = CigarUtils.leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(givenCigar), theRef.getBytes(), theRead.getBytes(), preRefString.length(), 0); + Assert.assertEquals(AlignmentUtils.consolidateCigar(calculatedCigar).toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar strings do not match!"); + } + } + } + } + } + + @Test(enabled = true) + public void testLeftAlignCigarSequentiallyAdjacentID() { + final String ref = "GTCTCTCTCTCTCTCTCTATATATATATATATATTT"; + final String hap = "GTCTCTCTCTCTCTCTCTCTCTATATATATATATTT"; + final Cigar originalCigar = TextCigarCodec.getSingleton().decode("18M4I12M4D2M"); + + final Cigar result = CigarUtils.leftAlignCigarSequentially(originalCigar, ref.getBytes(), hap.getBytes(), 0, 0); + logger.warn("Result is " + result); + Assert.assertEquals(originalCigar.getReferenceLength(), result.getReferenceLength(), "Reference lengths are different"); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPrunerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPrunerUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPrunerUnitTest.java rename to 
protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPrunerUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdgeUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdgeUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdgeUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdgeUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/PathUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/PathUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/PathUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/PathUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteUnitTest.java new file mode 100644 index 000000000..aeb617b18 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteUnitTest.java @@ -0,0 +1,285 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.BaseTest; +import org.jgrapht.EdgeFactory; +import org.testng.Assert; +import org.testng.Reporter; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Created with IntelliJ IDEA. + * User: valentin + * Date: 9/5/13 + * Time: 11:04 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class RouteUnitTest extends BaseTest { + + @Test(dataProvider="slicePrefixTestData") + public void testSplicePrefix(final Route route) { + final int routeLength = route.length(); + for (int i = 0; i < routeLength; i++) { + final Route spliced = route.splicePrefix(i); + Assert.assertEquals(spliced.length(),route.length() - i); + final List routeEdges = route.getEdges(); + final List expectedSlicedEdges = routeEdges.subList(i,routeLength); + Assert.assertEquals(spliced.getEdges(),expectedSlicedEdges); + } + } + + @Test(dataProvider="isSuffixTestData") + public void testIsSuffix(final Route route, final Path path, final boolean expectedResult) { + Assert.assertEquals(route.isSuffix(path), expectedResult); + } + + @DataProvider(name="isSuffixTestData") + public Iterator isSuffixTestData() { + return IS_SUFFIX_TEST_DATA.iterator(); + } + + @DataProvider(name="slicePrefixTestData") + public Iterator slicePrefixTestData() { + return Arrays.asList(SLICE_PREFIX_TEST_DATA).iterator(); + } + + private static final int[] TEST_EDGE_PAIRS1 = new int[] { + 3 , 4, + 4 , 5, + 5, 7, + 7, 8, + 8, 9, + 4 , 6, + 6, 9, + 9, 11, + 11, 12, + }; + + private static final int[] TEST_EDGE_PAIRS = new int[] { + 1 , 2, + 2 , 3, + 3 , 4, + 4 , 5, + 5, 7, + 7, 8, + 8, 9, + 4 , 6, + 6, 9, + 9, 10, + 10, 11, + 11, 12, + 2, 5, + 5, 12, + + 3, 13, + 13, 14, + 14, 15 + }; + + public static final EdgeFactory TEST_GRAPH_EDGE_FACTORY = new EdgeFactory() { + @Override + public BaseEdge createEdge(final BaseVertex baseVertex, final BaseVertex baseVertex2) { + return new BaseEdge(false, 0); + } + }; + + + private static Map vertexByInteger = new HashMap<>(); + private static final BaseGraph TEST_GRAPH = new BaseGraph<>(1, TEST_GRAPH_EDGE_FACTORY); + private static final List IS_SUFFIX_TEST_DATA; + + private static final Object[][] SLICE_PREFIX_TEST_DATA; + + static { + for (int i = 0; i < TEST_EDGE_PAIRS.length; i += 2) { + final int sourceInteger = TEST_EDGE_PAIRS[i]; + final int targetInteger 
= TEST_EDGE_PAIRS[i + 1]; + final BaseVertex sourceVertex = resolveVertexByInteger(sourceInteger); + final BaseVertex targetVertex = resolveVertexByInteger(targetInteger); + TEST_GRAPH.addEdge(sourceVertex, targetVertex); + } + Assert.assertEquals(1,TEST_GRAPH.getSources().size()); + final Deque> pendingPaths = new LinkedList<>(); + final Deque> pendingRoutes = new LinkedList<>(); + final List> allPossiblePaths = new LinkedList<>(); + final List> allPossibleRoutes = new LinkedList<>(); + for (final BaseVertex vertex : TEST_GRAPH.vertexSet()) { + pendingPaths.add(new Path(vertex, TEST_GRAPH)); + pendingRoutes.add(new Route(vertex,TEST_GRAPH)); + } + while (!pendingPaths.isEmpty()) { // !pendingRoutes.isEmpty(); + final Path path = pendingPaths.remove(); + final Route route = pendingRoutes.remove(); + final BaseVertex lastVertex = path.getLastVertex(); + allPossiblePaths.add(path); + allPossibleRoutes.add(route); + + if (allPossiblePaths.size() % 100 == 0) + Reporter.log("" + allPossiblePaths.size(), true); + for (final BaseEdge edge : TEST_GRAPH.outgoingEdgesOf(lastVertex)) + pendingPaths.add(new Path<>(path,edge)); + for (final BaseEdge edge : TEST_GRAPH.outgoingEdgesOf(lastVertex)) + pendingRoutes.add(new Route<>(route,edge)); + } + + final int numberOfPaths = allPossiblePaths.size(); + final boolean[][] isSuffix = buildIsSuffixMatrix(allPossiblePaths, numberOfPaths); + IS_SUFFIX_TEST_DATA = createTestData(allPossiblePaths,allPossibleRoutes,isSuffix); + SLICE_PREFIX_TEST_DATA = createSlicePrefixTestData(allPossibleRoutes); + } + + private static Object[][] createSlicePrefixTestData(List> allPossibleRoutes) { + final Object[][] result = new Object[allPossibleRoutes.size()][1]; + final Object[] routes = allPossibleRoutes.toArray(); + for (int i = 0; i < result.length; i++) + result[i][0] = routes[i]; + return result; + } + + private static boolean[][] buildIsSuffixMatrix(final List> allPossiblePaths, final int numberOfPaths) { + final boolean[][] isSuffix = new 
boolean[numberOfPaths][numberOfPaths]; + final ListIterator> iIterator = allPossiblePaths.listIterator(); + for (int i = 0; i < numberOfPaths; i++) { + isSuffix[i][i] = true; + final ListIterator> jIterator = allPossiblePaths.listIterator(i + 1); + final Path iPath = iIterator.next(); + for (int j = i + 1; j < numberOfPaths; j++) { + final Path jPath = jIterator.next(); + if (iPath.getLastVertex() != jPath.getLastVertex()) { + isSuffix[i][j] = isSuffix[j][i] = false; + } else { + isSuffix[i][j] = isSuffix[j][i] = true; // let assume they are suffix of each other by default. + final Path shortPath; + final Path longPath; + if (iPath.getEdges().size() <= jPath.getEdges().size()) { + shortPath = iPath; + longPath = jPath; + } else { + longPath = iPath; + shortPath = jPath; + } + final ListIterator longPathEdgesIterator = longPath.getEdges().listIterator(longPath.getEdges().size()); + final ListIterator shortPathEdgesIterator = shortPath.getEdges().listIterator(shortPath.getEdges().size()); + + while (shortPathEdgesIterator.hasPrevious()) { + final BaseEdge shortEdge = shortPathEdgesIterator.previous(); + final BaseEdge longEdge = longPathEdgesIterator.previous(); + if (shortEdge != longEdge) { + isSuffix[i][j] = isSuffix[j][i] = false; + break; + } + } + if (isSuffix[i][j]) { + if (longPathEdgesIterator.hasPrevious()) { + if (longPath == iPath) + isSuffix[j][i] = false; + else + isSuffix[i][j] = false; + } + } + } + + } + } + return isSuffix; + } + + private static List createTestData(final List> allPossiblePaths, final List> allPossibleRoutes, final boolean[][] isSuffix) { + final List result = new ArrayList<>(allPossiblePaths.size() * allPossiblePaths.size() * 2 ); + final Path[] allPaths = allPossiblePaths.toArray(new Path[allPossiblePaths.size()]); + final Route[] allRoutes = allPossibleRoutes.toArray(new Route[allPossibleRoutes.size()]); + final int numberOfPaths = allPaths.length; + for (int i = 0; i < numberOfPaths; i++) + for (int j = 0; j < numberOfPaths; 
j++) { + result.add(new Object[] { allRoutes[i], allPaths[j], isSuffix[i][j] }); + result.add(new Object[] { allRoutes[i], allRoutes[j], isSuffix[i][j] }); + result.add(new Object[] { allRoutes[i], inverseRebuild(allRoutes[j]), isSuffix[i][j]}); + } + + return result; + } + + private static Route inverseRebuild(final Route original) { + final ListIterator it = original.getEdges().listIterator(original.length()); + Route result = new Route<>(original.getLastVertex(),original.getGraph()); + while (it.hasPrevious()) { + result = new Route<>(it.previous(),result); + } + return result; + } + + private static BaseVertex resolveVertexByInteger(final int targetInteger) { + if (vertexByInteger.containsKey(targetInteger)) + return vertexByInteger.get(targetInteger); + else { + int value = targetInteger; + final StringBuffer stringBuffer = new StringBuffer(); + while (value > 0) { + int c = value % 4; + switch (c) { + case 0: stringBuffer.append('A'); break; + case 1: stringBuffer.append('C'); break; + case 2: stringBuffer.append('G'); break; + case 3: stringBuffer.append('T'); break; + } + value = value / 4; + } + if (stringBuffer.length() == 0) stringBuffer.append('A'); + final byte[] sequence = stringBuffer.reverse().toString().getBytes(); + final BaseVertex result = new BaseVertex(sequence); + vertexByInteger.put(targetInteger, result); + TEST_GRAPH.addVertex(result); + return result; + } + + } + + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java new file mode 100644 index 000000000..2f44129d8 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java @@ -0,0 +1,293 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +public class SharedVertexSequenceSplitterUnitTest extends BaseTest { + private final static boolean PRINT_GRAPHS = false; + + @DataProvider(name = "PrefixSuffixData") + public Object[][] makePrefixSuffixData() { + final List tests = new ArrayList<>(); + + tests.add(new Object[]{Arrays.asList("A", "C"), 0, 0}); + tests.add(new Object[]{Arrays.asList("C", "C"), 1, 0}); + tests.add(new Object[]{Arrays.asList("ACT", "AGT"), 1, 1}); + tests.add(new Object[]{Arrays.asList("ACCT", "AGT"), 1, 1}); + tests.add(new Object[]{Arrays.asList("ACT", "ACT"), 3, 0}); + tests.add(new Object[]{Arrays.asList("ACTA", "ACT"), 3, 0}); + tests.add(new Object[]{Arrays.asList("ACTA", "ACTG"), 3, 0}); + tests.add(new Object[]{Arrays.asList("ACTA", "ACTGA"), 3, 1}); + tests.add(new Object[]{Arrays.asList("GCTGA", "ACTGA"), 0, 4}); + + tests.add(new Object[]{Arrays.asList("A", "C", "A"), 0, 0}); + tests.add(new Object[]{Arrays.asList("A", "A", "A"), 1, 0}); + tests.add(new Object[]{Arrays.asList("A", "AA", "A"), 1, 0}); + tests.add(new Object[]{Arrays.asList("A", "ACA", "A"), 1, 0}); + tests.add(new Object[]{Arrays.asList("ACT", "ACAT", "ACT"), 2, 1}); + tests.add(new Object[]{Arrays.asList("ACT", "ACAT", "ACGT"), 2, 1}); + tests.add(new Object[]{Arrays.asList("AAAT", "AAA", "CAAA"), 0, 0}); + tests.add(new Object[]{Arrays.asList("AACTTT", "AAGTTT", "AAGCTTT"), 2, 3}); + tests.add(new Object[]{Arrays.asList("AAA", "AAA", "CAAA"), 0, 3}); + tests.add(new Object[]{Arrays.asList("AAA", "AAA", "AAA"), 3, 0}); + + tests.add(new Object[]{Arrays.asList("AC", "ACA", "AC"), 2, 0}); + + return tests.toArray(new Object[][]{}); + 
} + + @Test(dataProvider = "PrefixSuffixData") + public void testPrefixSuffix(final List strings, int expectedPrefixLen, int expectedSuffixLen) { + final List bytes = new ArrayList<>(); + int min = Integer.MAX_VALUE; + for ( final String s : strings ) { + bytes.add(s.getBytes()); + min = Math.min(min, s.length()); + } + + final int actualPrefixLen = GraphUtils.compPrefixLen(bytes, min); + Assert.assertEquals(actualPrefixLen, expectedPrefixLen, "Failed prefix test"); + + final int actualSuffixLen = GraphUtils.compSuffixLen(bytes, min - actualPrefixLen); + Assert.assertEquals(actualSuffixLen, expectedSuffixLen, "Failed suffix test"); + } + + @Test(dataProvider = "PrefixSuffixData") + public void testPrefixSuffixVertices(final List strings, int expectedPrefixLen, int expectedSuffixLen) { + final List v = new ArrayList<>(); + for ( final String s : strings ) { + v.add(new SeqVertex(s)); + } + + final String expectedPrefix = strings.get(0).substring(0, expectedPrefixLen); + final String expectedSuffix = strings.get(0).substring(strings.get(0).length() - expectedSuffixLen); + + final Pair result = SharedVertexSequenceSplitter.commonPrefixAndSuffixOfVertices(v); + Assert.assertEquals(result.getFirst().getSequenceString(), expectedPrefix, "Failed suffix test"); + Assert.assertEquals(result.getSecond().getSequenceString(), expectedSuffix, "Failed suffix test"); + + Assert.assertEquals(result.getFirst().isEmpty(), expectedPrefix.isEmpty()); + Assert.assertEquals(result.getSecond().isEmpty(), expectedSuffix.isEmpty()); + } + + @Test(dataProvider = "PrefixSuffixData") + public void testSplitter(final List strings, int expectedPrefixLen, int expectedSuffixLen) { + final SeqGraph graph = new SeqGraph(11); + + final List v = new ArrayList<>(); + for ( final String s : strings ) { + v.add(new SeqVertex(s)); + } + + graph.addVertices(v.toArray(new SeqVertex[v.size()])); + + final String expectedPrefix = strings.get(0).substring(0, expectedPrefixLen); + final String expectedSuffix = 
strings.get(0).substring(strings.get(0).length() - expectedSuffixLen); + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); + splitter.split(); + + Assert.assertEquals(splitter.prefixV.getSequenceString(), expectedPrefix); + Assert.assertEquals(splitter.suffixV.getSequenceString(), expectedSuffix); + + Assert.assertTrue(splitter.splitGraph.outDegreeOf(splitter.prefixV) <= strings.size()); + Assert.assertEquals(splitter.splitGraph.inDegreeOf(splitter.prefixV), 0); + + Assert.assertTrue(splitter.splitGraph.inDegreeOf(splitter.suffixV) <= strings.size()); + Assert.assertEquals(splitter.splitGraph.outDegreeOf(splitter.suffixV), 0); + + for ( final SeqVertex mid : splitter.newMiddles ) { + Assert.assertNotNull(splitter.splitGraph.getEdge(splitter.prefixV, mid)); + Assert.assertNotNull(splitter.splitGraph.getEdge(mid, splitter.suffixV)); + } + } + + @DataProvider(name = "CompleteCycleData") + public Object[][] makeCompleteCycleData() { + List tests = new ArrayList<>(); + + for ( final boolean hasTop : Arrays.asList(true, false) ) { + for ( final boolean hasBot : Arrays.asList(true, false) ) { + if ( ! hasTop && ! 
hasBot ) continue; + tests.add(new Object[]{Arrays.asList("A", "A"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "C"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "AC"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "CA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("AC", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("AT", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("ATA", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("ATAA", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("ATAACA", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("CCCAAA", "AAA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("CCCAAAAAA", "AAA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("CCCAAAAAA", "CCCAAA"), hasTop, hasBot}); + + tests.add(new Object[]{Arrays.asList("A", "A", "A"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "A", "C"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "C", "C"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("AC", "C", "C"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("CA", "C", "C"), hasTop, hasBot}); + // all merged + tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGA"), hasTop, hasBot}); + // prefix and suffix + tests.add(new Object[]{Arrays.asList("AGA", "AGA", "ACA"), hasTop, hasBot}); + // 2 -> prefix, leave C + tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGAC"), hasTop, hasBot}); + // 2 -> prefix, leave CCC + tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGACCC"), hasTop, hasBot}); + // 2 -> suffix, leave A/T + tests.add(new Object[]{Arrays.asList("TAGA", "TAGA", "AAGA"), hasTop, hasBot}); + // 2 -> suffix, leave T, delete 1 + tests.add(new Object[]{Arrays.asList("TAGA", "TAGA", "AGA"), hasTop, hasBot}); + } + } + + return 
tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "CompleteCycleData") + public void testSplitterCompleteCycle(final List strings, final boolean hasTop, final boolean hasBot) { + final SeqGraph graph = new SeqGraph(11); + + int edgeWeight = 1; + final SeqVertex top = hasTop ? new SeqVertex("AAAAAAAA") : null; + final SeqVertex bot = hasBot ? new SeqVertex("GGGGGGGG") : null; + final List v = new ArrayList<>(); + for ( final String s : strings ) { + v.add(new SeqVertex(s)); + } + graph.addVertices(v.toArray(new SeqVertex[v.size()])); + final SeqVertex first = v.get(0); + + if ( hasTop ) { + graph.addVertex(top); + for ( final SeqVertex vi : v ) + graph.addEdge(top, vi, new BaseEdge(vi == first, edgeWeight++)); + } + + if ( hasBot ) { + graph.addVertex(bot); + for ( final SeqVertex vi : v ) + graph.addEdge(vi, bot, new BaseEdge(vi == first, edgeWeight++)); + } + + final Set haplotypes = new HashSet<>(); + final List originalPaths = new KBestHaplotypeFinder((SeqGraph) graph.clone(),graph.getSources(),graph.getSinks()); + for ( final KBestHaplotype path : originalPaths ) + haplotypes.add(new String(path.bases())); + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); + splitter.split(); + if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".original.dot"), 0); + if ( PRINT_GRAPHS ) splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".split.dot"), 0); + splitter.updateGraph(top, bot); + if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".updated.dot"), 0); + + final List splitPaths = new KBestHaplotypeFinder(graph,graph.getSources(),graph.getSinks()); + for ( final KBestHaplotype path : splitPaths ) { + final String h = new String(path.bases()); + Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h); + } + + if ( splitPaths.size() == originalPaths.size() ) { + for ( int i = 0; i < originalPaths.size(); i++ ) { + 
Assert.assertTrue(splitPaths.get(i).path().equalScoreAndSequence(originalPaths.get(i).path()), "Paths not equal " + splitPaths.get(i) + " vs. original " + originalPaths.get(i)); + } + } + } + + @DataProvider(name = "MeetsMinSequenceData") + public Object[][] makeMeetsMinSequenceData() { + final List tests = new ArrayList<>(); + + final boolean prefixBiased = SharedVertexSequenceSplitter.prefersPrefixMerging(); + tests.add(new Object[]{Arrays.asList("AC", "AC"), 0, true, true}); + tests.add(new Object[]{Arrays.asList("AC", "AC"), 1, prefixBiased, ! prefixBiased}); + tests.add(new Object[]{Arrays.asList("AC", "AC"), 2, prefixBiased, ! prefixBiased}); + tests.add(new Object[]{Arrays.asList("AC", "AC"), 3, false, false}); + tests.add(new Object[]{Arrays.asList("A", "AC"), 1, true, false}); + tests.add(new Object[]{Arrays.asList("A", "AC"), 2, false, false}); + tests.add(new Object[]{Arrays.asList("AT", "AC"), 1, true, false}); + tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 1, true, false}); + tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 2, true, false}); + tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 3, false, false}); + tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 1, true, true}); + tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 2, true, true}); + tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 3, false, true}); + tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 4, false, false}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MeetsMinSequenceData") + public void testSplitterCompleteCycle(final List mids, final int minSeqLength, final boolean prefixMeets, final boolean suffixMeets) { + final SeqGraph graph = new SeqGraph(11); + + final SeqVertex top = new SeqVertex("AAAAAAAA"); + final SeqVertex bot = new SeqVertex("GGGGGGGG"); + final List v = new ArrayList<>(); + for ( final String s : mids ) { v.add(new SeqVertex(s)); } + graph.addVertices(v.toArray(new SeqVertex[v.size()])); 
+ graph.addVertices(top, bot); + for ( final SeqVertex vi : v ) { graph.addEdge(top, vi); graph.addEdge(vi, bot); } + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); + Assert.assertEquals(splitter.meetsMinMergableSequenceForPrefix(minSeqLength), prefixMeets, "Prefix failed"); + Assert.assertEquals(splitter.meetsMinMergableSequenceForSuffix(minSeqLength), suffixMeets, "Suffix failed"); + Assert.assertEquals(splitter.meetsMinMergableSequenceForEitherPrefixOrSuffix(minSeqLength), suffixMeets || prefixMeets, "Either prefix or suffix failed"); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraphUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraphUnitTest.java new file mode 100644 index 000000000..a13bc4754 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraphUnitTest.java @@ -0,0 +1,243 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +import net.sf.samtools.TextCigarCodec; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.KBestHaplotype; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.KBestHaplotypeFinder; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + +public class DanglingChainMergingGraphUnitTest extends BaseTest { + + public static byte[] getBytes(final String alignment) { + return alignment.replace("-","").getBytes(); + } + + @DataProvider(name = "DanglingTails") + public Object[][] makeDanglingTailsData() { + List tests = new ArrayList<>(); + + // add 1M to the expected CIGAR because it includes the previous (common) base too + tests.add(new Object[]{"AAAAAAAAAA", "CAAA", "5M", true, 3}); // incomplete haplotype + tests.add(new Object[]{"AAAAAAAAAA", "CAAAAAAAAAA", "1M1I10M", true, 10}); // insertion + tests.add(new Object[]{"CCAAAAAAAAAA", "AAAAAAAAAA", "1M2D10M", true, 10}); // deletion + tests.add(new Object[]{"AAAAAAAA", "CAAAAAAA", "9M", true, 7}); // 1 snp + tests.add(new Object[]{"AAAAAAAA", "CAAGATAA", "9M", true, 2}); // several snps + tests.add(new Object[]{"AAAAA", "C", "1M4D1M", false, -1}); // funky SW alignment + tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", false, 1}); // very little data + tests.add(new Object[]{"AAAAAAA", "CAAAAAC", "8M", true, -1}); // ends in mismatch + tests.add(new Object[]{"AAAAAA", "CGAAAACGAA", "1M2I4M2I2M", false, 0}); // alignment is too complex + tests.add(new Object[]{"AAAAA", "XXXXX", "1M5I", false, -1}); // insertion 
+ + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "DanglingTails") + public void testDanglingTails(final String refEnd, + final String altEnd, + final String cigar, + final boolean cigarIsGood, + final int mergePointDistanceFromSink) { + + final int kmerSize = 15; + + // construct the haplotypes + final String commonPrefix = "AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT"; + final String ref = commonPrefix + refEnd; + final String alt = commonPrefix + altEnd; + + // create the graph and populate it + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize); + rtgraph.addSequence("ref", ref.getBytes(), true); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M"); + rtgraph.addRead(read); + rtgraph.buildGraphIfNecessary(); + + // confirm that we have just a single dangling tail + MultiDeBruijnVertex altSink = null; + for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) { + if ( rtgraph.isSink(v) && !rtgraph.isReferenceNode(v) ) { + Assert.assertTrue(altSink == null, "We found more than one non-reference sink"); + altSink = v; + } + } + + Assert.assertTrue(altSink != null, "We did not find a non-reference sink"); + + // confirm that the SW alignment agrees with our expectations + final ReadThreadingGraph.DanglingChainMergeHelper result = rtgraph.generateCigarAgainstDownwardsReferencePath(altSink, 0); + + if ( result == null ) { + Assert.assertFalse(cigarIsGood); + return; + } + + Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString()); + + // confirm that the goodness of the cigar agrees with our expectations + Assert.assertEquals(rtgraph.cigarIsOkayToMerge(result.cigar, false, true), cigarIsGood); + + // confirm that the tail merging works as expected + if ( cigarIsGood ) { + final int mergeResult = rtgraph.mergeDanglingTail(result); + Assert.assertTrue(mergeResult == 1 || 
mergePointDistanceFromSink == -1); + + // confirm that we created the appropriate edge + if ( mergePointDistanceFromSink >= 0 ) { + MultiDeBruijnVertex v = altSink; + for ( int i = 0; i < mergePointDistanceFromSink; i++ ) { + if ( rtgraph.inDegreeOf(v) != 1 ) + Assert.fail("Encountered vertex with multiple edges"); + v = rtgraph.getEdgeSource(rtgraph.incomingEdgeOf(v)); + } + Assert.assertTrue(rtgraph.outDegreeOf(v) > 1); + } + } + } + + @Test + public void testWholeTailIsInsertion() { + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(10); + final ReadThreadingGraph.DanglingChainMergeHelper result = new ReadThreadingGraph.DanglingChainMergeHelper(null, null, "AXXXXX".getBytes(), "AAAAAA".getBytes(), new TextCigarCodec().decode("5I1M")); + final int mergeResult = rtgraph.mergeDanglingTail(result); + Assert.assertEquals(mergeResult, 0); + } + + @Test + public void testGetBasesForPath() { + + final int kmerSize = 4; + final String testString = "AATGGGGCAATACTA"; + + final ReadThreadingGraph graph = new ReadThreadingGraph(kmerSize); + graph.addSequence(testString.getBytes(), true); + graph.buildGraphIfNecessary(); + + final List vertexes = new ArrayList<>(); + MultiDeBruijnVertex v = graph.getReferenceSourceVertex(); + while ( v != null ) { + vertexes.add(v); + v = graph.getNextReferenceVertex(v); + } + + final String result = new String(graph.getBasesForPath(vertexes, false)); + Assert.assertEquals(result, testString); + } + + @DataProvider(name = "DanglingHeads") + public Object[][] makeDanglingHeadsData() { + List tests = new ArrayList<>(); + + // add 1M to the expected CIGAR because it includes the last (common) base too + tests.add(new Object[]{"XXXXXXXAACCGGTTACGT", "AAYCGGTTACGT", "8M", true}); // 1 snp + tests.add(new Object[]{"XXXAACCGGTTACGT", "XAAACCGGTTACGT", "7M", false}); // 1 snp + tests.add(new Object[]{"XXXXXXXAACCGGTTACGT", "XAACGGTTACGT", "4M1D4M", false}); // deletion + tests.add(new Object[]{"XXXXXXXAACCGGTTACGT", "AYYCGGTTACGT", "8M", 
true}); // 2 snps + tests.add(new Object[]{"XXXXXXXAACCGGTTACGTAA", "AYCYGGTTACGTAA", "9M", true}); // 2 snps + tests.add(new Object[]{"XXXXXXXAACCGGTTACGT", "AYCGGTTACGT", "7M", true}); // very little data + tests.add(new Object[]{"XXXXXXXAACCGGTTACGT", "YCCGGTTACGT", "6M", true}); // begins in mismatch + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "DanglingHeads") + public void testDanglingHeads(final String ref, + final String alt, + final String cigar, + final boolean shouldBeMerged) { + + final int kmerSize = 5; + + // create the graph and populate it + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize); + rtgraph.addSequence("ref", ref.getBytes(), true); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M"); + rtgraph.addRead(read); + rtgraph.buildGraphIfNecessary(); + + // confirm that we have just a single dangling head + MultiDeBruijnVertex altSource = null; + for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) { + if ( rtgraph.isSource(v) && !rtgraph.isReferenceNode(v) ) { + Assert.assertTrue(altSource == null, "We found more than one non-reference source"); + altSource = v; + } + } + + Assert.assertTrue(altSource != null, "We did not find a non-reference source"); + + // confirm that the SW alignment agrees with our expectations + final ReadThreadingGraph.DanglingChainMergeHelper result = rtgraph.generateCigarAgainstUpwardsReferencePath(altSource, 0); + + if ( result == null ) { + Assert.assertFalse(shouldBeMerged); + return; + } + + Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString()); + + // confirm that the tail merging works as expected + final int mergeResult = rtgraph.mergeDanglingHead(result); + Assert.assertTrue(mergeResult > 0 || !shouldBeMerged); + + // confirm that we created the appropriate bubble in the graph only if expected + 
rtgraph.cleanNonRefPaths(); + final SeqGraph seqGraph = rtgraph.convertToSequenceGraph(); + final List paths = new KBestHaplotypeFinder(seqGraph, seqGraph.getReferenceSourceVertex(), seqGraph.getReferenceSinkVertex()); + Assert.assertEquals(paths.size(), shouldBeMerged ? 2 : 1); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java new file mode 100644 index 000000000..769026f2b --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java @@ -0,0 +1,237 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +public class ReadThreadingAssemblerUnitTest extends BaseTest { + + private final static boolean DEBUG = false; + + private static class TestAssembler { + final ReadThreadingAssembler assembler; + + Haplotype refHaplotype; + final List reads = new LinkedList<>(); + + private TestAssembler(final int kmerSize) { + this.assembler = new ReadThreadingAssembler(100000, Arrays.asList(kmerSize)); + assembler.setJustReturnRawGraph(true); + assembler.setPruneFactor(0); + } + + public void addSequence(final byte[] bases, final boolean isRef) { + if ( isRef ) { + refHaplotype = new Haplotype(bases, true); + } else { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, Utils.dupBytes((byte)30,bases.length), bases.length + "M"); + reads.add(read); + } + } + + public SeqGraph assemble() { + assembler.removePathsNotConnectedToRef = false; // needed to pass some of the tests + assembler.setRecoverDanglingTails(false); // needed to pass some of the tests + assembler.setDebugGraphTransformations(true); + final SeqGraph graph = assembler.assemble(reads, refHaplotype, Collections.emptyList()).get(0).getGraph(); + if ( DEBUG ) graph.printGraph(new File("test.dot"), 0); + return graph; + } + } + + private void assertLinearGraph(final TestAssembler assembler, final String seq) { + final SeqGraph graph = assembler.assemble(); + graph.simplifyGraph(); + Assert.assertEquals(graph.vertexSet().size(), 1); + 
Assert.assertEquals(graph.vertexSet().iterator().next().getSequenceString(), seq); + } + + private void assertSingleBubble(final TestAssembler assembler, final String one, final String two) { + final SeqGraph graph = assembler.assemble(); + graph.simplifyGraph(); + final List paths = new KBestHaplotypeFinder(graph); + Assert.assertEquals(paths.size(), 2); + final Set expected = new HashSet<>(Arrays.asList(one, two)); + for ( final KBestHaplotype path : paths ) { + final String seq = new String(path.bases()); + Assert.assertTrue(expected.contains(seq)); + expected.remove(seq); + } + } + + @Test(enabled = ! DEBUG) + public void testRefCreation() { + final String ref = "ACGTAACCGGTT"; + final TestAssembler assembler = new TestAssembler(3); + assembler.addSequence(ref.getBytes(), true); + assertLinearGraph(assembler, ref); + } + + @Test(enabled = ! DEBUG) + public void testRefNonUniqueCreation() { + final String ref = "GAAAAT"; + final TestAssembler assembler = new TestAssembler(3); + assembler.addSequence(ref.getBytes(), true); + assertLinearGraph(assembler, ref); + } + + @Test(enabled = ! DEBUG) + public void testRefAltCreation() { + final TestAssembler assembler = new TestAssembler(3); + final String ref = "ACAACTGA"; + final String alt = "ACAGCTGA"; + assembler.addSequence(ref.getBytes(), true); + assembler.addSequence(alt.getBytes(), false); + assertSingleBubble(assembler, ref, alt); + } + + @Test(enabled = ! DEBUG) + public void testPartialReadsCreation() { + final TestAssembler assembler = new TestAssembler(3); + final String ref = "ACAACTGA"; + final String alt1 = "ACAGCT"; + final String alt2 = "GCTGA"; + assembler.addSequence(ref.getBytes(), true); + assembler.addSequence(alt1.getBytes(), false); + assembler.addSequence(alt2.getBytes(), false); + assertSingleBubble(assembler, ref, "ACAGCTGA"); + } + + @Test(enabled = ! 
DEBUG) + public void testMismatchInFirstKmer() { + final TestAssembler assembler = new TestAssembler(3); + final String ref = "ACAACTGA"; + final String alt = "AGCTGA"; + assembler.addSequence(ref.getBytes(), true); + assembler.addSequence(alt.getBytes(), false); + + final SeqGraph graph = assembler.assemble(); + graph.simplifyGraph(); + graph.removeSingletonOrphanVertices(); + final Set sources = graph.getSources(); + final Set sinks = graph.getSinks(); + + Assert.assertEquals(sources.size(), 1); + Assert.assertEquals(sinks.size(), 1); + Assert.assertNotNull(graph.getReferenceSourceVertex()); + Assert.assertNotNull(graph.getReferenceSinkVertex()); + + final List paths = new KBestHaplotypeFinder(graph); + Assert.assertEquals(paths.size(), 2); + } + + @Test(enabled = ! DEBUG) + public void testStartInMiddle() { + final TestAssembler assembler = new TestAssembler(3); + final String ref = "CAAAATG"; + final String read = "AAATG"; + assembler.addSequence(ref.getBytes(), true); + assembler.addSequence(read.getBytes(), false); + assertLinearGraph(assembler, ref); + } + + @Test(enabled = ! DEBUG) + public void testStartInMiddleWithBubble() { + final TestAssembler assembler = new TestAssembler(3); + final String ref = "CAAAATGGGG"; + final String read = "AAATCGGG"; + assembler.addSequence(ref.getBytes(), true); + assembler.addSequence(read.getBytes(), false); + assertSingleBubble(assembler, ref, "CAAAATCGGG"); + } + + @Test(enabled = ! 
DEBUG) + public void testNoGoodStarts() { + final TestAssembler assembler = new TestAssembler(3); + final String ref = "CAAAATGGGG"; + final String read = "AAATCGGG"; + assembler.addSequence(ref.getBytes(), true); + assembler.addSequence(read.getBytes(), false); + assertSingleBubble(assembler, ref, "CAAAATCGGG"); + } + + + @Test(enabled = !DEBUG) + public void testCreateWithBasesBeforeRefSource() { + final TestAssembler assembler = new TestAssembler(3); + final String ref = "ACTG"; + final String read = "CTGGGACT"; + assembler.addSequence(ReadThreadingGraphUnitTest.getBytes(ref), true); + assembler.addSequence(ReadThreadingGraphUnitTest.getBytes(read), false); + assertLinearGraph(assembler, "ACTGGGACT"); + } + + @Test(enabled = !DEBUG) + public void testSingleIndelAsDoubleIndel3Reads() { + final TestAssembler assembler = new TestAssembler(25); + // The single indel spans two repetitive structures + final String ref = "GTTTTTCCTAGGCAAATGGTTTCTATAAAATTATGTGTGTGTGTCTCTCTCTGTGTGTGTGTGTGTGTGTGTGTGTATACCTAATCTCACACTCTTTTTTCTGG"; + final String read1 = "GTTTTTCCTAGGCAAATGGTTTCTATAAAATTATGTGTGTGTGTCTCT----------GTGTGTGTGTGTGTGTGTATACCTAATCTCACACTCTTTTTTCTGG"; + final String read2 = "GTTTTTCCTAGGCAAATGGTTTCTATAAAATTATGTGTGTGTGTCTCT----------GTGTGTGTGTGTGTGTGTATACCTAATCTCACACTCTTTTTTCTGG"; + assembler.addSequence(ReadThreadingGraphUnitTest.getBytes(ref), true); + assembler.addSequence(ReadThreadingGraphUnitTest.getBytes(read1), false); + assembler.addSequence(ReadThreadingGraphUnitTest.getBytes(read2), false); + + final SeqGraph graph = assembler.assemble(); + final List paths = new KBestHaplotypeFinder(graph); + Assert.assertEquals(paths.size(), 2); + final byte[] refPath = paths.get(0).bases().length == ref.length() ? paths.get(0).bases() : paths.get(1).bases(); + final byte[] altPath = paths.get(0).bases().length == ref.length() ? 
paths.get(1).bases() : paths.get(0).bases(); + Assert.assertEquals(refPath, ReadThreadingGraphUnitTest.getBytes(ref)); + Assert.assertEquals(altPath, ReadThreadingGraphUnitTest.getBytes(read1)); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java new file mode 100644 index 000000000..c95f4002e --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java @@ -0,0 +1,260 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +public class ReadThreadingGraphUnitTest extends BaseTest { + private final static boolean DEBUG = false; + + public static byte[] getBytes(final String alignment) { + return alignment.replace("-","").getBytes(); + } + + private void assertNonUniques(final ReadThreadingGraph assembler, String ... nonUniques) { + final Set actual = new HashSet<>(); + assembler.buildGraphIfNecessary(); + for ( final Kmer kmer : assembler.getNonUniqueKmers() ) actual.add(kmer.baseString()); + final Set expected = new HashSet<>(Arrays.asList(nonUniques)); + Assert.assertEquals(actual, expected); + } + + @Test + public void testSimpleHaplotypeRethreading() { + final ReadThreadingGraph assembler = new ReadThreadingGraph(11); + final String ref = "CATGCACTTTAAAACTTGCCTTTTTAACAAGACTTCCAGATG"; + final String alt = "CATGCACTTTAAAACTTGCCGTTTTAACAAGACTTCCAGATG"; + assembler.addSequence("anonymous", getBytes(ref), true); + assembler.addSequence("anonymous", getBytes(alt), false); + assembler.buildGraphIfNecessary(); + Assert.assertNotEquals(ref.length() - 11 + 1,assembler.vertexSet().size(),"the number of vertex in the graph is the same as if there was no alternative sequence"); + Assert.assertEquals(ref.length() - 11 + 1 + 11,assembler.vertexSet().size(),"the number of vertex in the graph is not the same as if there is an alternative sequence"); + MultiDeBruijnVertex startAlt = assembler.findKmer(new 
Kmer(alt.getBytes(),20,11)); + Assert.assertNotNull(startAlt); + } + + @Test(enabled = ! DEBUG) + public void testNonUniqueMiddle() { + final ReadThreadingGraph assembler = new ReadThreadingGraph(3); + final String ref = "GACACACAGTCA"; + final String read1 = "GACAC---GTCA"; + final String read2 = "CAC---GTCA"; + assembler.addSequence(getBytes(ref), true); + assembler.addSequence(getBytes(read1), false); + assembler.addSequence(getBytes(read2), false); + assertNonUniques(assembler, "ACA", "CAC"); + } + + @Test(enabled = ! DEBUG) + public void testReadsCreateNonUnique() { + final ReadThreadingGraph assembler = new ReadThreadingGraph(3); + final String ref = "GCAC--GTCA"; // CAC is unique + final String read1 = "GCACACGTCA"; // makes CAC non unique because it has a duplication + final String read2 = "CACGTCA"; // shouldn't be allowed to match CAC as start + assembler.addSequence(getBytes(ref), true); + assembler.addSequence(getBytes(read1), false); + assembler.addSequence(getBytes(read2), false); +// assembler.convertToSequenceGraph().printGraph(new File("test.dot"), 0); + + assertNonUniques(assembler, "CAC"); + //assertSingleBubble(assembler, ref, "CAAAATCGGG"); + } + + @Test(enabled = ! 
DEBUG) + public void testCountingOfStartEdges() { + final ReadThreadingGraph assembler = new ReadThreadingGraph(3); + final String ref = "NNNGTCAAA"; // ref has some bases before start + final String read1 = "GTCAAA"; // starts at first non N base + + assembler.addSequence(getBytes(ref), true); + assembler.addSequence(getBytes(read1), false); + assembler.buildGraphIfNecessary(); +// assembler.printGraph(new File("test.dot"), 0); + + for ( final MultiSampleEdge edge : assembler.edgeSet() ) { + final MultiDeBruijnVertex source = assembler.getEdgeSource(edge); + final MultiDeBruijnVertex target = assembler.getEdgeTarget(edge); + final boolean headerVertex = source.getSuffix() == 'N' || target.getSuffix() == 'N'; + if ( headerVertex ) { + Assert.assertEquals(edge.getMultiplicity(), 1, "Bases in the unique reference header should have multiplicity of 1"); + } else { + Assert.assertEquals(edge.getMultiplicity(), 2, "Should have multiplicity of 2 for any edge outside the ref header but got " + edge + " " + source + " -> " + target); + } + } + } + + @Test(enabled = !DEBUG) + public void testCountingOfStartEdgesWithMultiplePrefixes() { + final ReadThreadingGraph assembler = new ReadThreadingGraph(3); + assembler.increaseCountsThroughBranches = true; + final String ref = "NNNGTCAXX"; // ref has some bases before start + final String alt1 = "NNNCTCAXX"; // alt1 has SNP right after N + final String read = "TCAXX"; // starts right after SNP, but merges right before branch + + assembler.addSequence(getBytes(ref), true); + assembler.addSequence(getBytes(alt1), false); + assembler.addSequence(getBytes(read), false); + assembler.buildGraphIfNecessary(); + assembler.printGraph(new File("test.dot"), 0); + + final List oneCountVertices = Arrays.asList("NNN", "NNG", "NNC", "NGT", "NCT"); + final List threeCountVertices = Arrays.asList("CAX", "AXX"); + + for ( final MultiSampleEdge edge : assembler.edgeSet() ) { + final MultiDeBruijnVertex source = assembler.getEdgeSource(edge); + final 
MultiDeBruijnVertex target = assembler.getEdgeTarget(edge); + final int expected = oneCountVertices.contains(target.getSequenceString()) ? 1 : (threeCountVertices.contains(target.getSequenceString()) ? 3 : 2); + Assert.assertEquals(edge.getMultiplicity(), expected, "Bases at edge " + edge + " from " + source + " to " + target + " has bad multiplicity"); + } + } + + @Test(enabled = !DEBUG) + public void testCyclesInGraph() { + + // b37 20:12655200-12655850 + final String ref = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTATACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG"; + + // SNP at 20:12655528 creates a cycle for small kmers + final String alt = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTGTACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG"; + + final List reads = new ArrayList<>(); + for ( int index = 0; index < alt.length() - 100; index += 20 ) + 
reads.add(ArtificialSAMUtils.createArtificialRead(Arrays.copyOfRange(alt.getBytes(), index, index + 100), Utils.dupBytes((byte) 30, 100), 100 + "M")); + + // test that there are cycles detected for small kmer + final ReadThreadingGraph rtgraph25 = new ReadThreadingGraph(25); + rtgraph25.addSequence("ref", ref.getBytes(), true); + for ( final GATKSAMRecord read : reads ) + rtgraph25.addRead(read); + rtgraph25.buildGraphIfNecessary(); + Assert.assertTrue(rtgraph25.hasCycles()); + + // test that there are no cycles detected for large kmer + final ReadThreadingGraph rtgraph75 = new ReadThreadingGraph(75); + rtgraph75.addSequence("ref", ref.getBytes(), true); + for ( final GATKSAMRecord read : reads ) + rtgraph75.addRead(read); + rtgraph75.buildGraphIfNecessary(); + Assert.assertFalse(rtgraph75.hasCycles()); + } + + @Test(enabled = !DEBUG) + public void testNsInReadsAreNotUsedForGraph() { + + final int length = 100; + final byte[] ref = Utils.dupBytes((byte)'A', length); + + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(25); + rtgraph.addSequence("ref", ref, true); + + // add reads with Ns at any position + for ( int i = 0; i < length; i++ ) { + final byte[] bases = ref.clone(); + bases[i] = 'N'; + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, Utils.dupBytes((byte) 30, length), length + "M"); + rtgraph.addRead(read); + } + rtgraph.buildGraphIfNecessary(); + + final SeqGraph graph = rtgraph.convertToSequenceGraph(); + Assert.assertEquals(new KBestHaplotypeFinder(graph, graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex()).size(), 1); + } + +// TODO -- update to use determineKmerSizeAndNonUniques directly +// @DataProvider(name = "KmerSizeData") +// public Object[][] makeKmerSizeDataProvider() { +// List tests = new ArrayList(); +// +// // this functionality can be adapted to provide input data for whatever you might want in your data +// tests.add(new Object[]{3, 3, 3, Arrays.asList("ACG"), Arrays.asList()}); +// 
tests.add(new Object[]{3, 4, 3, Arrays.asList("CAGACG"), Arrays.asList()}); +// +// tests.add(new Object[]{3, 3, 3, Arrays.asList("AAAAC"), Arrays.asList("AAA")}); +// tests.add(new Object[]{3, 4, 4, Arrays.asList("AAAAC"), Arrays.asList()}); +// tests.add(new Object[]{3, 5, 4, Arrays.asList("AAAAC"), Arrays.asList()}); +// tests.add(new Object[]{3, 4, 3, Arrays.asList("CAAA"), Arrays.asList()}); +// tests.add(new Object[]{3, 4, 4, Arrays.asList("CAAAA"), Arrays.asList()}); +// tests.add(new Object[]{3, 5, 4, Arrays.asList("CAAAA"), Arrays.asList()}); +// tests.add(new Object[]{3, 5, 5, Arrays.asList("ACGAAAAACG"), Arrays.asList()}); +// +// for ( int maxSize = 3; maxSize < 20; maxSize++ ) { +// for ( int dupSize = 3; dupSize < 20; dupSize++ ) { +// final int expectedSize = Math.min(maxSize, dupSize); +// final String dup = Utils.dupString("C", dupSize); +// final List nonUnique = dupSize > maxSize ? Arrays.asList(Utils.dupString("C", maxSize)) : Collections.emptyList(); +// tests.add(new Object[]{3, maxSize, expectedSize, Arrays.asList("ACGT", "A" + dup + "GT"), nonUnique}); +// tests.add(new Object[]{3, maxSize, expectedSize, Arrays.asList("A" + dup + "GT", "ACGT"), nonUnique}); +// } +// } +// +// return tests.toArray(new Object[][]{}); +// } +// +// /** +// * Example testng test using MyDataProvider +// */ +// @Test(dataProvider = "KmerSizeData") +// public void testDynamicKmerSizing(final int min, final int max, final int expectKmer, final List seqs, final List expectedNonUniques) { +// final ReadThreadingGraph assembler = new ReadThreadingGraph(min, max); +// for ( String seq : seqs ) assembler.addSequence(seq.getBytes(), false); +// assembler.buildGraphIfNecessary(); +// Assert.assertEquals(assembler.getKmerSize(), expectKmer); +// assertNonUniques(assembler, expectedNonUniques.toArray(new String[]{})); +// } + +} diff --git 
a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java new file mode 100644 index 000000000..fe381513e --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java @@ -0,0 +1,66 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class SequenceForKmersUnitTest extends BaseTest { + @Test + public void testNoCount() { + final byte[] seq = "ACGT".getBytes(); + final SequenceForKmers sk = new SequenceForKmers("foo", seq, 0, seq.length, 1, true); + Assert.assertEquals(sk.name, "foo"); + Assert.assertEquals(sk.sequence, seq); + Assert.assertEquals(sk.start, 0); + Assert.assertEquals(sk.stop, seq.length); + Assert.assertEquals(sk.count, 1); + Assert.assertEquals(sk.isRef, true); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManagerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManagerUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManagerUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManagerUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java similarity index 100% rename from 
protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerUnitTest.java new file mode 100644 index 000000000..509bf7465 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerUnitTest.java @@ -0,0 +1,82 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.indels; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; + +public class IndelRealignerUnitTest extends BaseTest { + + private SAMFileHeader header; + + @BeforeClass + public void setup() throws FileNotFoundException { + final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + } + + @Test + public void realignAtContigBorderTest() { + final int contigEnd = header.getSequence(0).getSequenceLength(); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "goodRead", 0, contigEnd - 1, 2); + read.setCigarString("2M"); + Assert.assertEquals(IndelRealigner.realignmentProducesBadAlignment(read, contigEnd), false); + read.setCigarString("1M1D1M"); + Assert.assertEquals(IndelRealigner.realignmentProducesBadAlignment(read, contigEnd), true); + } + +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java new file mode 100644 index 000000000..3480b6775 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java @@ -0,0 +1,156 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE 
AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.indels; + + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +public class PairHMMIndelErrorModelUnitTest extends BaseTest { + + private SAMFileHeader header; + + @BeforeClass + public void setup() throws FileNotFoundException { + final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + } + + private static final int refWindowStart = 1000; + private static final int refWindowEnd = 1100; + + 
@DataProvider(name = "ClipUpstreamProvider") + public Object[][] ClipUpstreamTestData() { + List tests = new ArrayList(); + + for ( final int readStart : Arrays.asList(900, 950, 990, 1000) ) { + for ( final int readLength : Arrays.asList(10, 50, 100) ) { + for ( final int delLength : Arrays.asList(0, 5, 10) ) { + tests.add(new Object[]{readStart, readLength, delLength}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ClipUpstreamProvider", enabled = true) + public void clipUpstreamTest(final int readStart, final int readLength, final int delLength) { + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, readStart, readLength); + if ( delLength == 0 ) + read.setCigarString(readLength + "M"); + else + read.setCigarString((readLength / 2) + "M" + delLength + "D" + (readLength / 2) + "M"); + + final boolean result = PairHMMIndelErrorModel.mustClipUpstream(read, refWindowStart); + Assert.assertEquals(result, read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart); + } + + @DataProvider(name = "ClipDownstreamProvider") + public Object[][] ClipDownstreamTestData() { + List tests = new ArrayList(); + + for ( final int readStart : Arrays.asList(1000, 1050, 1090, 1100) ) { + for ( final int readLength : Arrays.asList(10, 50, 100) ) { + for ( final int delLength : Arrays.asList(0, 5, 10) ) { + tests.add(new Object[]{readStart, readLength, delLength}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ClipDownstreamProvider", enabled = true) + public void clipDownstreamTest(final int readStart, final int readLength, final int delLength) { + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, readStart, readLength); + if ( delLength == 0 ) + read.setCigarString(readLength + "M"); + else + read.setCigarString((readLength / 2) + "M" + delLength + "D" + (readLength / 2) + "M"); + + final boolean result = 
PairHMMIndelErrorModel.mustClipDownstream(read, refWindowEnd); + Assert.assertEquals(result, read.getSoftStart() < refWindowEnd && read.getSoftStart() + readLength > refWindowEnd); + } + + @Test + public void clipDownstreamAtBorderTest() { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, 5, 10); + read.setCigarString("10M"); + Assert.assertEquals(PairHMMIndelErrorModel.mustClipDownstream(read, 13), true); + Assert.assertEquals(PairHMMIndelErrorModel.mustClipDownstream(read, 14), false); + } + + @Test + public void trimHaplotypesToNullAlleleTest() { + // we need a case where start and stop > haplotype coordinates + final int start = 100, stop = 100; + final Haplotype h = new Haplotype(new byte[]{(byte)'A'}, new UnvalidatingGenomeLoc("1", 0, 10, 10)); + final Map input = new HashMap(1); + input.put(Allele.create("A"), h); + + final Map output = PairHMMIndelErrorModel.trimHaplotypes(input, start, stop, null); + Assert.assertTrue(output.isEmpty()); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/ReadBinUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/ReadBinUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/ReadBinUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/ReadBinUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java new file mode 100644 index 000000000..8c8817fe6 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java @@ -0,0 +1,139 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.phasing; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class ReadBackedPhasingIntegrationTest extends WalkerTest { + + public static String baseTestString(String reference, String reads, String VCF, int cacheWindowSize, int maxPhaseSites, double phaseQualityThresh) { + return "-T ReadBackedPhasing" + + " -R " + reference + + " -I " + validationDataLocation + reads + + " --variant " + ( VCF.contains("phasing_test") ? 
privateTestDir : validationDataLocation) + VCF + + " --cacheWindowSize " + cacheWindowSize + + " --maxPhaseSites " + maxPhaseSites + + " --phaseQualityThresh " + phaseQualityThresh + + " -o %s" + + " --no_cmdline_in_header"; + } + + + @Test + public void test1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + + " -L chr20:332341-382503", + 1, + Arrays.asList("1bb034bd54421fe4884e3142ed92d47e")); + executeTest("MAX 10 het sites [TEST ONE]; require PQ >= 10", spec); + } + + @Test + public void test2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + + " -L chr20:1232503-1332503", + 1, + Arrays.asList("c12954252d4c8659b5ecf7517b277496")); + executeTest("MAX 10 het sites [TEST TWO]; require PQ >= 10", spec); + } + + @Test + public void test3() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 2, 30) + + " -L chr20:332341-382503", + 1, + Arrays.asList("0b945e30504d04e9c6fa659ca5c25ed5")); + executeTest("MAX 2 het sites [TEST THREE]; require PQ >= 30", spec); + } + + @Test + public void test4() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 5, 100) + + " -L chr20:332341-382503", + 1, + Arrays.asList("e9e8ef92d694ca71f29737fba26282f5")); + executeTest("MAX 5 het sites [TEST FOUR]; require PQ >= 100", spec); + } + + @Test + public void test5() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 1000, 7, 10) + + " -L chr20:332341-482503", + 1, + 
Arrays.asList("b9c9347c760a06db635952bf4920fb48")); + executeTest("MAX 7 het sites [TEST FIVE]; require PQ >= 10; cacheWindow = 1000", spec); + } + + @Test + public void test6() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + + " -L chr20:652810-681757", + 1, + Arrays.asList("02c3a903842aa035ae379f16bc3d64ae")); + executeTest("MAX 10 het sites [TEST SIX]; require PQ >= 10; cacheWindow = 20000; has inconsistent sites", spec); + } + + @Test + public void test7() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10) + + " -L chr20:332341-802503", + 1, + Arrays.asList("ac41d1aa9c9a67c07d894f485c29c574")); + executeTest("Use trio-phased VCF, adding read-backed phasing infomration in HP tag (as is now standard for RBP) [TEST SEVEN]", spec); + } + +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/OverhangFixingManagerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/OverhangFixingManagerUnitTest.java new file mode 100644 index 000000000..62f6bcfbd --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/OverhangFixingManagerUnitTest.java @@ -0,0 +1,172 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.rnaseq; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class OverhangFixingManagerUnitTest extends BaseTest { + + private CachingIndexedFastaSequenceFile referenceReader; + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void setup() throws FileNotFoundException { + referenceReader = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + genomeLocParser = new GenomeLocParser(referenceReader.getSequenceDictionary()); + } + + @Test + public void testCleanSplices() { + + final OverhangFixingManager manager = new OverhangFixingManager(null, genomeLocParser, referenceReader, 10000, 1, 40, false); + + final int offset = 10; + for ( int i = 0; i < OverhangFixingManager.MAX_SPLICES_TO_KEEP + 1; i++ ) + 
manager.addSplicePosition("20", offset + i, offset + 1 + i); + + final List splices = manager.getSplicesForTesting(); + + Assert.assertEquals(splices.size(), (OverhangFixingManager.MAX_SPLICES_TO_KEEP / 2) + 1); + + final int minStartPos = (OverhangFixingManager.MAX_SPLICES_TO_KEEP / 2) + offset; + + for ( final OverhangFixingManager.Splice splice : splices ) + Assert.assertTrue(splice.loc.getStart() >= minStartPos); + } + + @DataProvider(name = "OverhangTest") + public Object[][] makeOverhangData() { + final List tests = new ArrayList<>(); + for ( int leftRead : Arrays.asList(10, 20, 30, 40) ) { + for ( int rightRead : Arrays.asList(20, 30, 40, 50) ) { + if ( leftRead >= rightRead ) + continue; + for ( int leftSplice : Arrays.asList(10, 20, 30) ) { + for ( int rightSplice : Arrays.asList(20, 30, 40) ) { + if ( leftSplice >= rightSplice ) + continue; + + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc("1", leftRead, rightRead); + final GenomeLoc spliceLoc = genomeLocParser.createGenomeLoc("1", leftSplice, rightSplice); + tests.add(new Object[]{readLoc, spliceLoc}); + } + } + } + } + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "OverhangTest") + public void testLeftOverhangs(final GenomeLoc readLoc, final GenomeLoc spliceLoc) { + final boolean isValidOverhang = readLoc.getStart() <= spliceLoc.getStop() && + readLoc.getStop() > spliceLoc.getStop() && + readLoc.getStart() > spliceLoc.getStart(); + Assert.assertEquals(OverhangFixingManager.isLeftOverhang(readLoc, spliceLoc), isValidOverhang, readLoc + " vs. " + spliceLoc); + } + + @Test(dataProvider = "OverhangTest") + public void testRightOverhangs(final GenomeLoc readLoc, final GenomeLoc spliceLoc) { + final boolean isValidOverhang = readLoc.getStop() >= spliceLoc.getStart() && + readLoc.getStop() < spliceLoc.getStop() && + readLoc.getStart() < spliceLoc.getStart(); + Assert.assertEquals(OverhangFixingManager.isRightOverhang(readLoc, spliceLoc), isValidOverhang, readLoc + " vs. 
" + spliceLoc); + } + + @DataProvider(name = "MismatchEdgeConditionTest") + public Object[][] makeMismatchEdgeConditionData() { + final List tests = new ArrayList<>(); + tests.add(new Object[]{null, 1, null, 1, 0}); + tests.add(new Object[]{null, 1, null, 1, 100}); + tests.add(new Object[]{new byte[4], 1, null, 1, 3}); + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MismatchEdgeConditionTest") + public void testMismatchEdgeCondition(final byte[] read, final int readStart, final byte[] ref, final int refStart, final int overhang) { + final OverhangFixingManager manager = new OverhangFixingManager(null, genomeLocParser, referenceReader, 10000, 1, 40, false); + Assert.assertFalse(manager.overhangingBasesMismatch(read, readStart, ref, refStart, overhang)); + } + + @DataProvider(name = "MismatchTest") + public Object[][] makeMismatchData() { + final List tests = new ArrayList<>(); + + final byte[] AAAA = new byte[]{(byte)'A', (byte)'A', (byte)'A', (byte)'A'}; + final byte[] AAAC = new byte[]{(byte)'A', (byte)'A', (byte)'A', (byte)'C'}; + final byte[] AAAAAA = new byte[]{(byte)'A', (byte)'A', (byte)'A', (byte)'A', (byte)'A', (byte)'A'}; + final byte[] AAAACA = new byte[]{(byte)'A', (byte)'A', (byte)'A', (byte)'A', (byte)'C', (byte)'A'}; + final byte[] AAAACC = new byte[]{(byte)'A', (byte)'A', (byte)'A', (byte)'A', (byte)'C', (byte)'C'}; + + tests.add(new Object[]{AAAA, 2, AAAA, 2, 2, false}); + tests.add(new Object[]{AAAA, 2, AAAC, 2, 2, true}); + tests.add(new Object[]{AAAAAA, 3, AAAACA, 3, 3, false}); + tests.add(new Object[]{AAAAAA, 3, AAAACC, 3, 3, true}); + tests.add(new Object[]{AAAAAA, 4, AAAACC, 4, 2, true}); + tests.add(new Object[]{AAAAAA, 2, AAAACC, 2, 3, false}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MismatchTest") + public void testMismatch(final byte[] read, final int readStart, final byte[] ref, final int refStart, final int overhang, final boolean expected) { + final OverhangFixingManager manager 
= new OverhangFixingManager(null, genomeLocParser, referenceReader, 10000, 1, 40, false); + Assert.assertEquals(manager.overhangingBasesMismatch(read, readStart, ref, refStart, overhang), expected, new String(read) + " vs. " + new String(ref) + " @" + overhang); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java new file mode 100644 index 000000000..87af68fc4 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java @@ -0,0 +1,112 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.rnaseq; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. 
+ * User: ami + * Date: 12/5/13 + * Time: 1:04 PM + */ +public class SplitNCigarReadsIntegrationTest extends WalkerTest { + + @Test(enabled = false) + // contain reads without N's, with N's and with N's and I's + // TODO -- Ami: please put the bam file in the repo + public void testSplitWithInsertions() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T SplitNCigarReads -R " + b37KGReference + " -I " + privateTestDir + "SplitNCigarReads.integrationTest.unsplitReads.withI.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS", 1, + Arrays.asList("037c72fe1572efb63cccbe0a8dda3cb1")); + executeTest("test split N cigar reads with insertions", spec); + } + + @Test(enabled = false) + // contain reads without N's, with N's and with N's and D's, and also with more then one N element in the cigar. + // TODO -- Ami: please put the bam file in the repo + public void testSplitWithDeletions() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T SplitNCigarReads -R " + b37KGReference + " -I " + privateTestDir + "SplitNCigarReads.integrationTest.unsplitReads.withD.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS", 1, + Arrays.asList("8472005c16353715025353d6d453faf4")); + executeTest("test split N cigar reads with deletions", spec); + } + + @Test + public void testSplitsWithOverhangs() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T SplitNCigarReads -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS", 1, + Arrays.asList("2832abc680c6b5a0219702ad5bf22f01")); + executeTest("test splits with overhangs", spec); + } + + @Test + public void testSplitsWithOverhangsNotClipping() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T SplitNCigarReads --doNotFixOverhangs -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS", 1, + Arrays.asList("59783610006bf7a1ccae57ee2016123b")); + 
executeTest("test splits with overhangs not clipping", spec); + } + + @Test + public void testSplitsWithOverhangs0Mismatches() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T SplitNCigarReads --maxMismatchesInOverhang 0 -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS", 1, + Arrays.asList("7547a5fc41ebfd1bbe62ce854b37b6ef")); + executeTest("test splits with overhangs 0 mismatches", spec); + } + + @Test + public void testSplitsWithOverhangs5BasesInOverhang() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T SplitNCigarReads --maxBasesInOverhang 5 -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS", 1, + Arrays.asList("f222eb02b003c08d4a606ab1bcb7931b")); + executeTest("test splits with overhangs 5 bases in overhang", spec); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsUnitTest.java new file mode 100644 index 000000000..d0f8280af --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsUnitTest.java @@ -0,0 +1,203 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.rnaseq; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.clipping.ReadClipperTestUtils; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +/** + * + * Tests all possible (and valid) cigar strings that might contain any cigar elements. It uses a code that were written to test the ReadClipper walker. + * For valid cigar sting in length 8 there are few thousands options, with N in every possible option and with more than one N (for example 1M1N1M1N1M1N2M). + * The cigarElements array is used to provide all the possible cigar element that might be included. 
+ * + * User: ami + * Date: 11/14/13 + * Time: 6:49 PM + */ +public class SplitNCigarReadsUnitTest extends BaseTest { + final static CigarElement[] cigarElements = { + new CigarElement(1, CigarOperator.HARD_CLIP), + new CigarElement(1, CigarOperator.SOFT_CLIP), + new CigarElement(1, CigarOperator.INSERTION), + new CigarElement(1, CigarOperator.DELETION), + new CigarElement(1, CigarOperator.MATCH_OR_MISMATCH), + new CigarElement(1, CigarOperator.SKIPPED_REGION) + }; + + private CachingIndexedFastaSequenceFile referenceReader; + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void setup() throws FileNotFoundException { + referenceReader = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); + genomeLocParser = new GenomeLocParser(referenceReader.getSequenceDictionary()); + } + + private final class TestManager extends OverhangFixingManager { + public TestManager() { + super(null, genomeLocParser, referenceReader, 10000, 1, 40, false); + } + } + + @Test(enabled = true) + public void splitReadAtN() { + final int cigarStringLength = 10; + final List cigarList = ReadClipperTestUtils.generateCigarList(cigarStringLength,cigarElements); + + // For Debugging use those lines (instead of above cigarList) to create specific read: + //------------------------------------------------------------------------------------ + // final GATKSAMRecord tmpRead = GATKSAMRecord.createRandomRead(6); + // tmpRead.setCigarString("1M1N1M"); + + // final List cigarList = new ArrayList<>(); + // cigarList.add(tmpRead.getCigar()); + + for(Cigar cigar: cigarList){ + + final int numOfSplits = numOfNElements(cigar.getCigarElements()); + + if(numOfSplits != 0 && isCigarDoesNotHaveEmptyRegionsBetweenNs(cigar)){ + + final TestManager manager = new TestManager(); + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + SplitNCigarReads.splitNCigarRead(read, manager); + List splitReads = manager.getReadsInQueueForTesting(); + final int expectedReads = 
numOfSplits+1; + Assert.assertEquals(splitReads.size(),expectedReads,"wrong number of reads after split read with cigar: "+cigar+" at Ns [expected]: "+expectedReads+" [actual value]: "+splitReads.size()); + final List readLengths = consecutiveNonNElements(read.getCigar().getCigarElements()); + int index = 0; + int offsetFromStart = 0; + for(final OverhangFixingManager.SplitRead splitRead: splitReads){ + int expectedLength = readLengths.get(index); + Assert.assertTrue(splitRead.read.getReadLength() == expectedLength, + "the "+index+" (starting with 0) split read has a wrong length.\n" + + "cigar of original read: "+cigar+"\n"+ + "expected length: "+expectedLength+"\n"+ + "actual length: "+splitRead.read.getReadLength()+"\n"); + assertBases(splitRead.read.getReadBases(), read.getReadBases(), offsetFromStart); + index++; + offsetFromStart += expectedLength; + } + } + } + } + + private int numOfNElements(final List cigarElements){ + int numOfNElements = 0; + for (CigarElement element: cigarElements){ + if (element.getOperator() == CigarOperator.SKIPPED_REGION) + numOfNElements++; + } + return numOfNElements; + } + + private static boolean isCigarDoesNotHaveEmptyRegionsBetweenNs(final Cigar cigar) { + boolean sawM = false; + boolean sawS = false; + + for (CigarElement cigarElement : cigar.getCigarElements()) { + if (cigarElement.getOperator().equals(CigarOperator.SKIPPED_REGION)) { + if(!sawM && !sawS) + return false; + sawM = false; + sawS = false; + } + if (cigarElement.getOperator().equals(CigarOperator.MATCH_OR_MISMATCH)) + sawM = true; + if (cigarElement.getOperator().equals(CigarOperator.SOFT_CLIP)) + sawS = true; + + } + if(!sawS && !sawM) + return false; + return true; + } + + private List consecutiveNonNElements(final List cigarElements){ + final LinkedList results = new LinkedList<>(); + int consecutiveLength = 0; + for(CigarElement element: cigarElements){ + final CigarOperator op = element.getOperator(); + if(op.equals(CigarOperator.MATCH_OR_MISMATCH) || 
op.equals(CigarOperator.SOFT_CLIP) || op.equals(CigarOperator.INSERTION)){ + consecutiveLength += element.getLength(); + } + else if(op.equals(CigarOperator.SKIPPED_REGION)) + { + if(consecutiveLength != 0){ + results.addLast(consecutiveLength); + consecutiveLength = 0; + } + } + } + if(consecutiveLength != 0) + results.addLast(consecutiveLength); + return results; + } + + private void assertBases(final byte[] actualBase, final byte[] expectedBase, final int startIndex) { + for (int i = 0; i < actualBase.length; i++) { + Assert.assertEquals(actualBase[i], expectedBase[startIndex + i],"unmatched bases between: "+ Arrays.toString(actualBase)+"\nand:\n"+Arrays.toString(expectedBase)+"\nat position: "+i); + } + } + +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java new file mode 100644 index 000000000..2ae904e65 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java @@ -0,0 +1,95 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.simulatereads; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class SimulateReadsForVariantsIntegrationTest extends WalkerTest { + + @Test + public void testDefaults() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forSimulation.vcf -o %s", + 1, + Arrays.asList("dd9e17a9c268578e903ecd4ca0a4a335")); + executeTest("testVariants", spec); + } + + @Test + public void testReadLength() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-RL 70 -T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forSimulation.vcf -o %s", + 1, + Arrays.asList("d7388376ffd4d3826d48a5be0be70632")); + executeTest("testReadLength", spec); + } + + @Test + public void testErrorRate() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-ER 40 -T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forSimulation.vcf -o %s", + 1, + Arrays.asList("6c9bf583f4b2708d6b82f54516474b7b")); + executeTest("testErrorRate", spec); + } + + @Test + public void testPlatformTag() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-RGPL SOLID -T 
SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forSimulation.vcf -o %s", + 1, + Arrays.asList("26db391f223ead74d786006a502029d8")); + executeTest("testPlatformTag", spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManagerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManagerUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManagerUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManagerUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibrationUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibrationUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibrationUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibrationUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java new file mode 100644 index 000000000..9a1422608 --- /dev/null +++ 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java @@ -0,0 +1,180 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.BaseTest; +import org.junit.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + +/** + * Created with IntelliJ IDEA. 
+ * User: rpoplin + * Date: 7/25/13 + */ + +public class VariantDataManagerUnitTest extends BaseTest { + + @Test + public final void testCalculateSortOrder() { + final double passingQual = 400.0; + final VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); + + VariantDataManager vdm = new VariantDataManager(new ArrayList(), VRAC); + + final List theData = new ArrayList<>(); + final VariantDatum datum1 = new VariantDatum(); + datum1.atTrainingSite = true; + datum1.failingSTDThreshold = false; + datum1.originalQual = passingQual; + datum1.annotations = new double[]{0.0,-10.0,10.0}; + datum1.isNull = new boolean[]{false, false, false}; + theData.add(datum1); + + final VariantDatum datum2 = new VariantDatum(); + datum2.atTrainingSite = true; + datum2.failingSTDThreshold = false; + datum2.originalQual = passingQual; + datum2.annotations = new double[]{0.0,-9.0,15.0}; + datum2.isNull = new boolean[]{false, false, false}; + theData.add(datum2); + + final VariantDatum datum3 = new VariantDatum(); + datum3.atTrainingSite = false; + datum3.failingSTDThreshold = false; + datum3.originalQual = passingQual; + datum3.annotations = new double[]{0.0,1.0,999.0}; + datum3.isNull = new boolean[]{false, false, false}; + theData.add(datum3); + + final VariantDatum datum4 = new VariantDatum(); + datum4.atTrainingSite = false; + datum4.failingSTDThreshold = false; + datum4.originalQual = passingQual; + datum4.annotations = new double[]{0.015,2.0,1001.11}; + datum4.isNull = new boolean[]{false, false, false}; + theData.add(datum4); + + vdm.setData(theData); + + final double[] meanVector = new double[3]; + for( int iii = 0; iii < meanVector.length; iii++ ) { + meanVector[iii] = vdm.mean(iii, true); + } + final List order = vdm.calculateSortOrder(meanVector); + Assert.assertArrayEquals(new int[]{2,1,0}, ArrayUtils.toPrimitive(order.toArray(new Integer[order.size()]))); + } + + @Test + public final void testDownSamplingTrainingData() { + final int 
MAX_NUM_TRAINING_DATA = 5000; + final double passingQual = 400.0; + final VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); + VRAC.MAX_NUM_TRAINING_DATA = MAX_NUM_TRAINING_DATA; + + VariantDataManager vdm = new VariantDataManager(new ArrayList(), VRAC); + final List theData = new ArrayList<>(); + for( int iii = 0; iii < MAX_NUM_TRAINING_DATA * 10; iii++) { + final VariantDatum datum = new VariantDatum(); + datum.atTrainingSite = true; + datum.failingSTDThreshold = false; + datum.originalQual = passingQual; + theData.add(datum); + } + + for( int iii = 0; iii < MAX_NUM_TRAINING_DATA * 2; iii++) { + final VariantDatum datum = new VariantDatum(); + datum.atTrainingSite = false; + datum.failingSTDThreshold = false; + datum.originalQual = passingQual; + theData.add(datum); + } + + vdm.setData(theData); + final List trainingData = vdm.getTrainingData(); + + Assert.assertTrue( trainingData.size() == MAX_NUM_TRAINING_DATA ); + } + + @Test + public final void testDropAggregateData() { + final int MAX_NUM_TRAINING_DATA = 5000; + final double passingQual = 400.0; + final VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); + VRAC.MAX_NUM_TRAINING_DATA = MAX_NUM_TRAINING_DATA; + + VariantDataManager vdm = new VariantDataManager(new ArrayList(), VRAC); + final List theData = new ArrayList<>(); + for( int iii = 0; iii < MAX_NUM_TRAINING_DATA * 10; iii++) { + final VariantDatum datum = new VariantDatum(); + datum.atTrainingSite = true; + datum.isAggregate = false; + datum.failingSTDThreshold = false; + datum.originalQual = passingQual; + theData.add(datum); + } + + for( int iii = 0; iii < MAX_NUM_TRAINING_DATA * 2; iii++) { + final VariantDatum datum = new VariantDatum(); + datum.atTrainingSite = false; + datum.isAggregate = true; + datum.failingSTDThreshold = false; + datum.originalQual = passingQual; + theData.add(datum); + } + + vdm.setData(theData); + vdm.dropAggregateData(); + + for( final 
VariantDatum datum : vdm.getData() ) { + Assert.assertFalse( datum.isAggregate ); + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java new file mode 100644 index 000000000..c4de50b25 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -0,0 +1,329 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.List; + +public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { + private static class VRTest { + String inVCF; + String aggregateVCF; + String tranchesMD5; + String recalMD5; + String cutVCFMD5; + + public VRTest(String inVCF, String tranchesMD5, String recalMD5, String cutVCFMD5) { + this.inVCF = inVCF; + this.tranchesMD5 = tranchesMD5; + this.recalMD5 = recalMD5; + this.cutVCFMD5 = cutVCFMD5; + } + + public VRTest(String inVCF, String aggregateVCF, String tranchesMD5, String recalMD5, String cutVCFMD5) { + this.inVCF = inVCF; + this.aggregateVCF = aggregateVCF; + this.tranchesMD5 = tranchesMD5; + this.recalMD5 = recalMD5; + this.cutVCFMD5 = cutVCFMD5; + } + + @Override + public String toString() { + return "VRTest{inVCF='" + inVCF 
+"'}"; + } + } + + VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", + "41e2d951a17de433fe378bb3d9ec75d4", // tranches + "04336b2453202f286da05b69e57f66ed", // recal file + "d29fd0bdc1c8c3a171e10d29f7ffeaec"); // cut VCF + + VRTest lowPassPlusExomes = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf", + "ce4bfc6619147fe7ce1f8331bbeb86ce", // tranches + "1b33c10be7d8bf8e9accd11113835262", // recal file + "4700d52a06f2ef3a5882719b86911e51"); // cut VCF + + @DataProvider(name = "VRTest") + public Object[][] createData1() { + return new Object[][]{ {lowPass} }; + } + + @DataProvider(name = "VRAggregateTest") + public Object[][] createData2() { + return new Object[][]{ {lowPassPlusExomes} }; + } + + @Test(dataProvider = "VRTest") + public void testVariantRecalibrator(VRTest params) { + //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + + " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + + " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + + " -T VariantRecalibrator" + + " -input " + params.inVCF + + " -L 20:1,000,000-40,000,000" + + " --no_cmdline_in_header" + + " -an QD -an HaplotypeScore -an HRun" + + " --trustAllPolymorphic" + // for speed + " -recalFile %s" + + " -tranchesFile %s", + Arrays.asList(params.recalMD5, params.tranchesMD5)); + executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst(); + } + + @Test(dataProvider = "VRTest",dependsOnMethods="testVariantRecalibrator") + public void testApplyRecalibration(VRTest params) { + 
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:12,000,000-30,000,000" + + " --no_cmdline_in_header" + + " -input " + params.inVCF + + " -U LENIENT_VCF_PROCESSING -o %s" + + " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), + Arrays.asList(params.cutVCFMD5)); + spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles + executeTest("testApplyRecalibration-"+params.inVCF, spec); + } + + @Test(dataProvider = "VRAggregateTest") + public void testVariantRecalibratorAggregate(VRTest params) { + //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + + " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + + " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + + " -T VariantRecalibrator" + + " -input " + params.inVCF + + " -aggregate " + params.aggregateVCF + + " -L 20:1,000,000-40,000,000" + + " --no_cmdline_in_header" + + " -an QD -an HaplotypeScore -an MQ" + + " --trustAllPolymorphic" + // for speed + " -recalFile %s" + + " -tranchesFile %s", + Arrays.asList(params.recalMD5, params.tranchesMD5)); + executeTest("testVariantRecalibratorAggregate-"+params.inVCF, spec).getFirst(); + } + + @Test(dataProvider = "VRAggregateTest",dependsOnMethods="testVariantRecalibratorAggregate") + public void testApplyRecalibrationAggregate(VRTest params) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:12,000,000-30,000,000" + + " --no_cmdline_in_header" + + " -input " + 
params.inVCF + + " -U LENIENT_VCF_PROCESSING -o %s" + + " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), + Arrays.asList(params.cutVCFMD5)); + spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles + executeTest("testApplyRecalibrationAggregate-"+params.inVCF, spec); + } + + VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf", + "3ad7f55fb3b072f373cbce0b32b66df4", // tranches + "e747c08131d58d9a4800720f6ca80e0c", // recal file + "e5808af3af0f2611ba5a3d172ab2557b"); // cut VCF + + @DataProvider(name = "VRBCFTest") + public Object[][] createVRBCFTest() { + return new Object[][]{ {bcfTest} }; + } + + @Test(dataProvider = "VRBCFTest") + public void testVariantRecalibratorWithBCF(VRTest params) { + //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + + " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + + " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + + " -T VariantRecalibrator" + + " -input " + params.inVCF + + " -L 20:10,000,000-20,000,000" + + " --no_cmdline_in_header" + + " -an AC " + // integer value + " -an QD -an ReadPosRankSum -an FS -an InbreedingCoeff " + // floats value + " -mG 2 "+ + " -recalFile %s" + + " -tranchesFile %s", + 2, + Arrays.asList("bcf", "txt"), + Arrays.asList(params.recalMD5, params.tranchesMD5)); + executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst(); + } + + @Test(dataProvider = "VRBCFTest", dependsOnMethods="testVariantRecalibratorWithBCF") + public void testApplyRecalibrationWithBCF(VRTest params) { + 
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:10,000,000-20,000,000" + + " --no_cmdline_in_header" + + " -input " + params.inVCF + + " -U LENIENT_VCF_PROCESSING -o %s" + + " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), + Arrays.asList(params.cutVCFMD5)); + spec.disableShadowBCF(); + executeTest("testApplyRecalibration-"+params.inVCF, spec); + } + + + VRTest indelUnfiltered = new VRTest( + validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as . + "9a331328370889168a7aa3a625f73620", // tranches + "2cbbd146d68c40200b782e0226f71976", // recal file + "64dd98a5ab80cf5fd9a36eb66b38268e"); // cut VCF + + VRTest indelFiltered = new VRTest( + validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS + "9a331328370889168a7aa3a625f73620", // tranches + "2cbbd146d68c40200b782e0226f71976", // recal file + "c0ec662001e829f5779a9d13b1d77d80"); // cut VCF + + @DataProvider(name = "VRIndelTest") + public Object[][] createTestVariantRecalibratorIndel() { + return new Object[][]{ {indelUnfiltered}, {indelFiltered} }; + } + + @Test(dataProvider = "VRIndelTest") + public void testVariantRecalibratorIndel(VRTest params) { + //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + + " -resource:training=true,truth=true,prior=15.0 " + comparisonDataLocation + "Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf" + + " -T VariantRecalibrator" + + " -input " + params.inVCF + + " -L 20:1,000,000-40,000,000" + + " --no_cmdline_in_header" + + " -an QD -an ReadPosRankSum -an 
HaplotypeScore" + + " -mode INDEL -mG 3" + + " --trustAllPolymorphic" + // for speed + " -recalFile %s" + + " -tranchesFile %s", + Arrays.asList(params.recalMD5, params.tranchesMD5)); + executeTest("testVariantRecalibratorIndel-"+params.inVCF, spec).getFirst(); + } + + @Test(dataProvider = "VRIndelTest",dependsOnMethods="testVariantRecalibratorIndel") + public void testApplyRecalibrationIndel(VRTest params) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:12,000,000-30,000,000" + + " -mode INDEL" + + " -U LENIENT_VCF_PROCESSING --no_cmdline_in_header" + + " -input " + params.inVCF + + " -o %s" + + " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), + Arrays.asList(params.cutVCFMD5)); + spec.disableShadowBCF(); // has to be disabled because the input VCF is missing LowQual annotation + executeTest("testApplyRecalibrationIndel-" + params.inVCF, spec); + } + + @Test + public void testApplyRecalibrationSnpAndIndelTogether() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:1000100-1000500" + + " -mode BOTH" + + " --no_cmdline_in_header" + + " -input " + privateTestDir + "VQSR.mixedTest.input" + + " -o %s" + + " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" + + " -recalFile " + privateTestDir + "VQSR.mixedTest.recal", + Arrays.asList("03a0ed00af6aac76d39e569f90594a02")); + executeTest("testApplyRecalibrationSnpAndIndelTogether", spec); + } + + @Test(enabled = true) + public void testApplyRecalibrationSnpAndIndelTogetherExcludeFiltered() throws Exception { + final String base = "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:1000100-1000500" + + " -mode BOTH" + + " --excludeFiltered -ts_filter_level 90.0" + + " --no_cmdline_in_header" + + " -input " + privateTestDir + 
"VQSR.mixedTest.input" + + " -o %s" + + " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" + + " -recalFile " + privateTestDir + "VQSR.mixedTest.recal"; + + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File VCF = executeTest("testApplyRecalibrationSnpAndIndelTogether", spec).first.get(0); + + for( final VariantContext VC : GATKVCFUtils.readAllVCs(VCF, new VCFCodec()).getSecond() ) { + if( VC != null ) { + Assert.assertTrue(VC.isNotFiltered()); // there should only be unfiltered records in the output VCF file + } + } + } +} + diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriorsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriorsIntegrationTest.java new file mode 100644 index 000000000..14341e401 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriorsIntegrationTest.java @@ -0,0 +1,69 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class CalculateGenotypePosteriorsIntegrationTest extends WalkerTest { + + @Test(enabled = true) + public void testUsingDiscoveredAF() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CalculateGenotypePosteriors --no_cmdline_in_header" + + " -o %s" + + " -R " + b37KGReference + + " -L 20:10,000,000-10,100,000" + + " -V " + validationDataLocation + "1000G.phase3.broad.withGenotypes.chr20.1MB.vcf", + 1, + Arrays.asList("e1adedc7e1d63e384187b24b7ded4410")); + executeTest("testUsingDiscoveredAF", spec); + } + +} \ No newline at end of file diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFsIntegrationTest.java new file mode 100644 index 000000000..03d136290 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFsIntegrationTest.java @@ -0,0 +1,168 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.List; + +public class CombineGVCFsIntegrationTest extends WalkerTest { + public static String baseTestString(String args) { + return "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -V " + + privateTestDir + "gvcfExample1.vcf -V " + privateTestDir + "gvcfExample2.vcf" + args; + } + + @Test + public void testOneStartsBeforeTwoAndEndsAfterwards() throws Exception { + final String cmd = baseTestString(" -L 1:69485-69509"); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File gVCF = executeTest("testOneStartsBeforeTwoAndEndsAfterwards", spec).first.get(0); + final List allVCs = GATKVCFUtils.readVCF(gVCF).getSecond(); + + Assert.assertEquals(allVCs.size(), 2, "Observed: " + allVCs); + + final VariantContext first = allVCs.get(0); + Assert.assertEquals(first.getStart(), 69491); + Assert.assertEquals(first.getEnd(), 69497); + 
Assert.assertEquals(first.getGenotypes().size(), 2); + Assert.assertTrue(first.getGenotype("NA1").isCalled()); + Assert.assertTrue(first.getGenotype("NA2").isNoCall()); + + final VariantContext second = allVCs.get(1); + Assert.assertEquals(second.getStart(), 69498); + Assert.assertEquals(second.getEnd(), 69506); + Assert.assertEquals(second.getGenotypes().size(), 2); + Assert.assertTrue(second.getGenotype("NA1").isCalled()); + Assert.assertTrue(second.getGenotype("NA2").isCalled()); + } + + @Test + public void testTwoSpansManyBlocksInOne() throws Exception { + final String cmd = baseTestString(" -L 1:69512-69634"); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File gVCF = executeTest("testTwoSpansManyBlocksInOne", spec).first.get(0); + final List allVCs = GATKVCFUtils.readVCF(gVCF).getSecond(); + + Assert.assertEquals(allVCs.size(), 5); + } + + @Test + public void testOneHasAltAndTwoHasNothing() throws Exception { + final String cmd = baseTestString(" -L 1:69511"); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File gVCF = executeTest("testOneHasAltAndTwoHasNothing", spec).first.get(0); + final List allVCs = GATKVCFUtils.readVCF(gVCF).getSecond(); + + Assert.assertEquals(allVCs.size(), 1); + + final VariantContext first = allVCs.get(0); + Assert.assertEquals(first.getStart(), 69511); + Assert.assertEquals(first.getEnd(), 69511); + Assert.assertEquals(first.getGenotypes().size(), 2); + } + + @Test + public void testOneHasAltAndTwoHasRefBlock() throws Exception { + final String cmd = baseTestString(" -L 1:69635"); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File gVCF = executeTest("testOneHasAltAndTwoHasRefBlock", spec).first.get(0); + final List allVCs = GATKVCFUtils.readVCF(gVCF).getSecond(); + + Assert.assertEquals(allVCs.size(), 1); + + final VariantContext first = 
allVCs.get(0); + Assert.assertEquals(first.getStart(), 69635); + Assert.assertEquals(first.getEnd(), 69635); + Assert.assertEquals(first.getNAlleles(), 3); + Assert.assertEquals(first.getGenotypes().size(), 2); + } + + @Test + public void testOneHasDeletionAndTwoHasRefBlock() throws Exception { + final String cmd = baseTestString(" -L 1:69772-69783"); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File gVCF = executeTest("testOneHasDeletionAndTwoHasRefBlock", spec).first.get(0); + final List allVCs = GATKVCFUtils.readVCF(gVCF).getSecond(); + + Assert.assertEquals(allVCs.size(), 3); + + final VariantContext first = allVCs.get(0); + Assert.assertEquals(first.getStart(), 69772); + Assert.assertEquals(first.getEnd(), 69776); + Assert.assertEquals(first.getNAlleles(), 3); + Assert.assertEquals(first.getGenotypes().size(), 2); + + final VariantContext second = allVCs.get(1); + Assert.assertEquals(second.getStart(), 69773); + Assert.assertEquals(second.getEnd(), 69774); + Assert.assertEquals(second.getGenotypes().size(), 2); + + final VariantContext third = allVCs.get(2); + Assert.assertEquals(third.getStart(), 69775); + Assert.assertEquals(third.getEnd(), 69783); + Assert.assertEquals(third.getGenotypes().size(), 2); + } + + @Test + public void testMD5s() throws Exception { + final String cmd = baseTestString(" -L 1:69485-69791"); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("aecdfa9eb32b802cd629e9f811ef15fd")); + spec.disableShadowBCF(); + executeTest("testMD5s", spec); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java new file mode 100644 index 000000000..fb54ab400 --- /dev/null +++ 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -0,0 +1,207 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; + +/** + * Tests CombineVariants + */ +public class CombineVariantsIntegrationTest extends WalkerTest { + // + // TODO TODO TODO TODO TODO TODO TODO TODO + // TODO TODO TODO TODO TODO TODO TODO TODO + // + // TODO WHEN THE HC EMITS VALID VCF HEADERS ENABLE BCF AND REMOVE lenientVCFProcessing ARGUMENTS + // + // TODO TODO TODO TODO TODO TODO TODO TODO + // TODO TODO TODO TODO TODO TODO TODO TODO + // TODO TODO TODO TODO TODO TODO TODO TODO + // + private static String baseTestString(String args) { + return baseTestString(args, b36KGReference); + } + + private static String baseTestString(String args, String ref) { + return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -R " + ref + args; + //return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -U LENIENT_VCF_PROCESSING -R " + b36KGReference + args; + } + + private void 
cvExecuteTest(final String name, final WalkerTestSpec spec, final boolean parallel) { + spec.disableShadowBCF(); + if ( parallel ) + executeTestParallel(name, spec); + else + executeTest(name, spec); + } + + public void test1InOut(String file, String md5) { + test1InOut(file, md5, ""); + } + + public void test1InOut(String file, String md5, String args) { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -priority v1 -V:v1 " + validationDataLocation + file + args), + 1, + Arrays.asList(md5)); + cvExecuteTest("testInOut1--" + file, spec, true); + } + + public void combine2(String file1, String file2, String args, String md5) { + combine2(file1, file2, args, md5, true); + } + + public void combine2(String file1, String file2, String args, String md5, final boolean parallel) { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -priority v1,v2 -V:v1 " + validationDataLocation + file1 + " -V:v2 "+ validationDataLocation + file2 + args), + 1, + Arrays.asList(md5)); + cvExecuteTest("combine2 1:" + new File(file1).getName() + " 2:" + new File(file2).getName(), spec, parallel); + } + + public void combineSites(String args, String md5) { + String file1 = "1000G_omni2.5.b37.sites.vcf"; + String file2 = "hapmap_3.3.b37.sites.vcf"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineVariants --no_cmdline_in_header -o %s -R " + b37KGReference + + " -L 1:1-10,000,000 -V:omni " + validationDataLocation + file1 + + " -V:hm3 " + validationDataLocation + file2 + args, + 1, + Arrays.asList(md5)); + cvExecuteTest("combineSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec, true); + } + + public void combinePLs(String file1, String file2, String md5) { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineVariants --no_cmdline_in_header -o %s -R " + b36KGReference + " -priority v1,v2 -V:v1 " + privateTestDir + file1 + " -V:v2 " + privateTestDir + file2, + 1, + Arrays.asList(md5)); + cvExecuteTest("combine 
PLs 1:" + new File(file1).getName() + " 2:" + new File(file2).getName(), spec, true); + } + + @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "6469fce8a5cd5a0f77e5ac5d9e9e192b", " -U LENIENT_VCF_PROCESSING"); } + @Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "a4cedaa83d54e34cafc3ac4b80acf5b4", " -setKey foo -U LENIENT_VCF_PROCESSING"); } + @Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "ac58a5fde17661e2a19004ca954d9781", " -setKey null -U LENIENT_VCF_PROCESSING"); } + @Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "67a8076e30b4bca0ea5acdc9cd26a4e0"); } // official project VCF files in tabix format + + @Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "909c6dc74eeb5ab86f8e74073eb0c1d6"); } + @Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "381875b3280ba56eef0152e56f64f68d"); } + + @Test public void combineWithPLs() { combinePLs("combine.3.vcf", "combine.4.vcf", "f0ce3fb83d4ad9ba402d7cb11cd000c3"); } + + @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "4efdf983918db822e4ac13d911509576"); } // official project VCF files in tabix format + @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "848d4408ee953053d2307cefebc6bd6d"); } // official project VCF files in tabix format + @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "629656bfef7713c23f3a593523503b2f"); } + + @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e54d0dcf14f90d5c8e58b45191dd0219"); } + + 
@Test public void uniqueSNPs() { + // parallelism must be disabled because the input VCF is malformed (DB=0) and parallelism actually fixes this which breaks the md5s + combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "e5ea6ac3905bd9eeea1a2ef5d2cb5af7", true); + } + + @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "def52bcd3942bbe39cd7ebe845c4f206"); } + @Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "5f61145949180bf2a0cd342d8e064860"); } + + @Test public void threeWayWithRefs() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V:NA19240_BGI "+validationDataLocation+"NA19240.BGI.RG.vcf" + + " -V:NA19240_ILLUMINA "+validationDataLocation+"NA19240.ILLUMINA.RG.vcf" + + " -V:NA19240_WUGSC "+validationDataLocation+"NA19240.WUGSC.RG.vcf" + + " -V:denovoInfo "+validationDataLocation+"yri_merged_validation_data_240610.annotated.b36.vcf" + + " -setKey centerSet" + + " -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED" + + " -U LENIENT_VCF_PROCESSING" + + " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + + " -genotypeMergeOptions UNIQUIFY -L 1"), + 1, + Arrays.asList("58e6281df108c361e99673a501ee4749")); + cvExecuteTest("threeWayWithRefs", spec, true); + } + + // complex examples with filtering, indels, and multiple alleles + public void combineComplexSites(String args, String md5) { + String file1 = "combine.1.vcf"; + String file2 = "combine.2.vcf"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineVariants --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V:one " + privateTestDir + file1 + + " -V:two " + privateTestDir + file2 + args, + 1, + Arrays.asList(md5)); + cvExecuteTest("combineComplexSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec, true); + } + + @Test public void complexTestFull() { 
combineComplexSites("", "9d989053826ffe5bef7c4e05ac51bcca"); } + @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "4f38d9fd30a7ae83e2a7dec265a28772"); } + @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); } + @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); } + + @Test + public void combineDBSNPDuplicateSites() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineVariants --no_cmdline_in_header -L 1:902000-903000 -o %s -R " + b37KGReference + " -V:v1 " + b37dbSNP132, + 1, + Arrays.asList("aa926eae333208dc1f41fe69dc95d7a6")); + cvExecuteTest("combineDBSNPDuplicateSites:", spec, true); + } + + @Test + public void combineLeavesUnfilteredRecordsUnfiltered() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineVariants --no_cmdline_in_header -o %s " + + " -R " + b37KGReference + + " -V " + privateTestDir + "combineVariantsLeavesRecordsUnfiltered.vcf", + 1, + Arrays.asList("f8c014d0af7e014475a2a448dc1f9cef")); + cvExecuteTest("combineLeavesUnfilteredRecordsUnfiltered: ", spec, false); + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java new file mode 100755 
index 000000000..6ece527ce --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java @@ -0,0 +1,740 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import net.sf.picard.reference.ReferenceSequenceFile; +import org.broad.tribble.readers.PositionalBufferedStream; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.broadinstitute.variant.vcf.VCFHeader; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.StringBufferInputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class ConcordanceMetricsUnitTest extends BaseTest { + + private static ReferenceSequenceFile seq; + 
private GenomeLocParser genomeLocParser; + + @BeforeClass + public void init() throws FileNotFoundException { + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); + genomeLocParser = new GenomeLocParser(seq); + } + public static String HEADER_BASE = "##fileformat=VCFv4.0\n" + + "##filedate=2010-06-21\n"+ + "##reference=NCBI36\n"+ + "##INFO=\n"+ + "##INFO=\n"+ + "##INFO=\n"+ + "##FILTER=\n"+ + "##FORMAT=\n"+ + "##FORMAT=\n"+ + "##FORMAT=\n" + + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t"; + public static String TEST_1_HEADER = HEADER_BASE + "test1_sample1\ttest1_sample2\ttest1_sample3\n"; + public static String TEST_2_HEADER = HEADER_BASE + "test2_sample1\ttest2_sample2\n"; + public static String TEST_3_HEADER_1 = HEADER_BASE + "test3_sample1\ttest3_sample2\ttest3_sample3\ttest3_sample4\ttest3_sample5\n"; + public static String TEST_3_HEADER_2 = HEADER_BASE + "test3_sample6\ttest3_sample7\ttest3_sample8\ttest3_sample9\ttest3_sample10\n"; + public static String TEST_3_HEADER_3 = HEADER_BASE + "test3_sample3\ttest3_sample6\ttest3_sample7\ttest3_sample8\ttest3_sample9\ttest3_sample10\n"; + + + private Pair getData1() { + + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + + Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); + Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_C)); + Genotype sam_1_3_eval = GenotypeBuilder.create("test1_sample3", Arrays.asList(reference_A,alt_C)); + + Genotype sam_1_1_truth = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); + Genotype sam_1_2_truth = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,reference_A)); + Genotype sam_1_3_truth = GenotypeBuilder.create("test1_sample3", Arrays.asList(alt_C,alt_C)); + + GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 3, 3); + 
VariantContextBuilder eval_1_builder = new VariantContextBuilder(); + VariantContextBuilder truth_1_builder = new VariantContextBuilder(); + + eval_1_builder.alleles(Arrays.asList(reference_A,alt_C)); + truth_1_builder.alleles(Arrays.asList(reference_A,alt_C)); + eval_1_builder.genotypes(Arrays.asList(sam_1_1_eval,sam_1_2_eval,sam_1_3_eval)); + truth_1_builder.genotypes(Arrays.asList(sam_1_1_truth,sam_1_2_truth,sam_1_3_truth)); + + eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + + Pair testData = new Pair(eval_1_builder.make(),truth_1_builder.make()); + + return testData; + } + + @Test(enabled=true) + public void testSimpleComparison() { + Pair data = getData1(); + VariantContext eval = data.getFirst(); + VariantContext truth = data.getSecond(); + VCFCodec codec = new VCFCodec(); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false); + metrics.update(eval,truth); + Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2); + Assert.assertEquals(truth.getGenotype("test1_sample2").getType().ordinal(),1); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[2][1],1); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][1],0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][3],1); + Assert.assertEquals(metrics.getOverallGenotypeConcordance().getTable()[1][1],1); + } + + private Pair getData2() { + + Allele reference_A = 
Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); + + Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); + Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_T)); + Genotype sam_1_3_eval = GenotypeBuilder.create("test1_sample3", Arrays.asList(reference_A,alt_C)); + + Genotype sam_1_1_truth = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); + Genotype sam_1_2_truth = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_C)); + Genotype sam_1_3_truth = GenotypeBuilder.create("test1_sample3", Arrays.asList(alt_C,alt_C)); + + GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 3, 3); + VariantContextBuilder eval_1_builder = new VariantContextBuilder(); + VariantContextBuilder truth_1_builder = new VariantContextBuilder(); + + eval_1_builder.alleles(Arrays.asList(reference_A,alt_C,alt_T)); + truth_1_builder.alleles(Arrays.asList(reference_A,alt_C)); + eval_1_builder.genotypes(Arrays.asList(sam_1_1_eval,sam_1_2_eval,sam_1_3_eval)); + truth_1_builder.genotypes(Arrays.asList(sam_1_1_truth,sam_1_2_truth,sam_1_3_truth)); + + eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + + Pair testData = new Pair(eval_1_builder.make(),truth_1_builder.make()); + + return testData; + } + + @Test(enabled=true) + public void testMismatchingAlleleInAlleleSubset() { + Pair data = getData2(); + VariantContext eval = data.getFirst(); + VariantContext truth = data.getSecond(); + VCFCodec codec = new VCFCodec(); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new 
PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false); + metrics.update(eval,truth); + Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2); + Assert.assertEquals(truth.getGenotype("test1_sample2").getType().ordinal(),2); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),1); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[2][1],0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][1],0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][3],1); + Assert.assertEquals(metrics.getOverallGenotypeConcordance().getTable()[1][1],1); + Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.EVAL_SUPERSET_TRUTH.ordinal()],1); + Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_DO_NOT_MATCH.ordinal()],0); + Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_MATCH.ordinal()],0); + + // now flip them around + + eval = data.getSecond(); + truth = data.getFirst(); + codec = new VCFCodec(); + evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + metrics = new ConcordanceMetrics(evalHeader,compHeader,false); + metrics.update(eval,truth); + Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2); + Assert.assertEquals(truth.getGenotype("test1_sample2").getType().ordinal(),2); + 
Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),1); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[1][2],0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[1][2],0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[3][2],1); + Assert.assertEquals(metrics.getOverallGenotypeConcordance().getTable()[1][1],1); + Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.EVAL_SUPERSET_TRUTH.ordinal()],0); + Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.EVAL_SUBSET_TRUTH.ordinal()],1); + Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_DO_NOT_MATCH.ordinal()],0); + Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_MATCH.ordinal()],0); + } + + private Pair getData3() { + + Allele reference_ACT = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.C.base,BaseUtils.Base.T.base},true); + Allele alt_AC = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.C.base}); + Allele alt_A = Allele.create(BaseUtils.Base.A.base); + Allele alt_ATT = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.T.base,BaseUtils.Base.T.base}); + + Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_ACT,alt_ATT)); + Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(alt_A,alt_A)); + Genotype sam_1_3_eval = GenotypeBuilder.create("test1_sample3", Arrays.asList(reference_ACT,alt_A)); + + Genotype sam_1_1_truth = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_ACT,alt_AC)); + Genotype sam_1_2_truth = GenotypeBuilder.create("test1_sample2", Arrays.asList(alt_A,alt_A)); + Genotype 
sam_1_3_truth = GenotypeBuilder.create("test1_sample3", Arrays.asList(reference_ACT,alt_A)); + + GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 3, 5); + VariantContextBuilder eval_1_builder = new VariantContextBuilder(); + VariantContextBuilder truth_1_builder = new VariantContextBuilder(); + + eval_1_builder.alleles(Arrays.asList(reference_ACT,alt_ATT,alt_A)); + truth_1_builder.alleles(Arrays.asList(reference_ACT,alt_AC,alt_A)); + eval_1_builder.genotypes(Arrays.asList(sam_1_1_eval,sam_1_2_eval,sam_1_3_eval)); + truth_1_builder.genotypes(Arrays.asList(sam_1_1_truth,sam_1_2_truth,sam_1_3_truth)); + + eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + + Pair testData = new Pair(eval_1_builder.make(),truth_1_builder.make()); + + return testData; + } + + @Test(enabled=true) + public void testComplex() { + Pair data = getData3(); + VariantContext eval = data.getFirst(); + VariantContext truth = data.getSecond(); + VCFCodec codec = new VCFCodec(); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false); + metrics.update(eval,truth); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample1").getnMismatchingAlt(),1); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[2][1],0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[3][3],1); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[1][1],0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][1],0); + 
Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][2],1); + Assert.assertEquals(metrics.getOverallGenotypeConcordance().getTable()[3][3],1); + Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.EVAL_SUPERSET_TRUTH.ordinal()],0); + Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_DO_NOT_MATCH.ordinal()],1); + Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_MATCH.ordinal()],0); + } + + private Pair getData4() { + + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); + + Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); + Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(Allele.NO_CALL,Allele.NO_CALL)); + Genotype sam_1_3_eval = GenotypeBuilder.create("test1_sample3", Arrays.asList(reference_A,alt_C)); + + Genotype sam_1_1_truth = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); + Genotype sam_1_2_truth = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_C)); + Genotype sam_1_3_truth = GenotypeBuilder.create("test1_sample3", Arrays.asList(Allele.NO_CALL,Allele.NO_CALL)); + + GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 3, 3); + VariantContextBuilder eval_1_builder = new VariantContextBuilder(); + VariantContextBuilder truth_1_builder = new VariantContextBuilder(); + + eval_1_builder.alleles(Arrays.asList(reference_A,alt_C,alt_T)); + truth_1_builder.alleles(Arrays.asList(reference_A,alt_C)); + eval_1_builder.genotypes(Arrays.asList(sam_1_1_eval,sam_1_2_eval,sam_1_3_eval)); + truth_1_builder.genotypes(Arrays.asList(sam_1_1_truth,sam_1_2_truth,sam_1_3_truth)); + + 
eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + + Pair testData = new Pair(eval_1_builder.make(),truth_1_builder.make()); + + return testData; + } + + @Test(enabled=true) + public void testNoCalls() { + Pair data = getData4(); + VariantContext eval = data.getFirst(); + VariantContext truth = data.getSecond(); + VCFCodec codec = new VCFCodec(); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false); + metrics.update(eval,truth); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[2][1],0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[0][2],1); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][1],0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][3],0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][0],1); + } + + private Pair getData5() { + + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); + + Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); + Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", new ArrayList(0)); + Genotype sam_1_3_eval = GenotypeBuilder.create("test1_sample3", Arrays.asList(reference_A,alt_C)); + + Genotype sam_1_1_truth = GenotypeBuilder.create("test1_sample1", 
Arrays.asList(reference_A,reference_A)); + Genotype sam_1_2_truth = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_C)); + Genotype sam_1_3_truth = GenotypeBuilder.create("test1_sample3", new ArrayList(0)); + + GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 3, 3); + VariantContextBuilder eval_1_builder = new VariantContextBuilder(); + VariantContextBuilder truth_1_builder = new VariantContextBuilder(); + + eval_1_builder.alleles(Arrays.asList(reference_A,alt_C,alt_T)); + truth_1_builder.alleles(Arrays.asList(reference_A,alt_C)); + eval_1_builder.genotypes(Arrays.asList(sam_1_1_eval,sam_1_2_eval,sam_1_3_eval)); + truth_1_builder.genotypes(Arrays.asList(sam_1_1_truth,sam_1_2_truth,sam_1_3_truth)); + + eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + + Pair testData = new Pair(eval_1_builder.make(),truth_1_builder.make()); + + return testData; + } + + @Test(enabled=true) + public void testMissing() { + Pair data = getData5(); + VariantContext eval = data.getFirst(); + VariantContext truth = data.getSecond(); + VCFCodec codec = new VCFCodec(); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false); + metrics.update(eval,truth); + Assert.assertTrue(eval.getGenotype("test1_sample2").getType().equals(GenotypeType.UNAVAILABLE)); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[2][1],0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[0][2],0); + 
Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[4][2],1); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][1],0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][3],0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][0],0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][4],1); + } + + private List> getData6() { + + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + + + // site 1 - + // sample 1: hom-ref/hom-ref + // sample 2: het/hom-ref + + Genotype sam_2_1_1_eval = GenotypeBuilder.create("test2_sample1", Arrays.asList(reference_A,reference_A)); + Genotype sam_2_2_1_eval = GenotypeBuilder.create("test2_sample2", Arrays.asList(reference_A,alt_C)); + + Genotype sam_2_1_1_truth = GenotypeBuilder.create("test2_sample1", Arrays.asList(reference_A,reference_A)); + Genotype sam_2_2_1_truth = GenotypeBuilder.create("test2_sample2", Arrays.asList(reference_A,reference_A)); + + GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 3, 3); + VariantContextBuilder eval_1_builder = new VariantContextBuilder(); + VariantContextBuilder truth_1_builder = new VariantContextBuilder(); + + eval_1_builder.alleles(Arrays.asList(reference_A,alt_C)); + truth_1_builder.alleles(Arrays.asList(reference_A,alt_C)); + eval_1_builder.genotypes(Arrays.asList(sam_2_1_1_eval,sam_2_2_1_eval)); + truth_1_builder.genotypes(Arrays.asList(sam_2_1_1_truth,sam_2_2_1_truth)); + + eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + + Pair testDataSite1 = new Pair(eval_1_builder.make(),truth_1_builder.make()); + + reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); + + // site 2 - + // sample 1: no-call/hom-ref + // sample 
2: hom-var/hom-var + + Genotype sam_2_1_2_eval = GenotypeBuilder.create("test2_sample1",Arrays.asList(Allele.NO_CALL,Allele.NO_CALL)); + Genotype sam_2_2_2_eval = GenotypeBuilder.create("test2_sample2",Arrays.asList(alt_T,alt_T)); + Genotype sam_2_1_2_truth = GenotypeBuilder.create("test2_sample1",Arrays.asList(reference_A,reference_A)); + Genotype sam_2_2_2_truth = GenotypeBuilder.create("test2_sample2",Arrays.asList(alt_T,alt_T)); + + loc = genomeLocParser.createGenomeLoc("chr1", 4, 4); + eval_1_builder = new VariantContextBuilder(); + truth_1_builder = new VariantContextBuilder(); + + eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + eval_1_builder.alleles(Arrays.asList(reference_A,alt_T)); + truth_1_builder.alleles(Arrays.asList(reference_A,alt_T)); + eval_1_builder.genotypes(Arrays.asList(sam_2_1_2_eval,sam_2_2_2_eval)); + truth_1_builder.genotypes(Arrays.asList(sam_2_1_2_truth,sam_2_2_2_truth)); + + Pair testDataSite2 = new Pair(eval_1_builder.make(),truth_1_builder.make()); + + Allele alt_G = Allele.create(BaseUtils.Base.G.base); + + // site 3 - + // sample 1: alleles do not match + // sample 2: het/het + Genotype sam_2_1_3_eval = GenotypeBuilder.create("test2_sample1",Arrays.asList(alt_G,alt_T)); + Genotype sam_2_2_3_eval = GenotypeBuilder.create("test2_sample2",Arrays.asList(reference_A,alt_T)); + Genotype sam_2_1_3_truth = GenotypeBuilder.create("test2_sample1",Arrays.asList(alt_T,alt_T)); + Genotype sam_2_2_3_truth = GenotypeBuilder.create("test2_sample2",Arrays.asList(reference_A,alt_T)); + + loc = genomeLocParser.createGenomeLoc("chr1",5,5); + eval_1_builder = new VariantContextBuilder(); + truth_1_builder = new VariantContextBuilder(); + eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + eval_1_builder.alleles(Arrays.asList(reference_A,alt_T,alt_G)); + 
truth_1_builder.alleles(Arrays.asList(reference_A,alt_T)); + eval_1_builder.genotypes(Arrays.asList(sam_2_1_3_eval,sam_2_2_3_eval)); + truth_1_builder.genotypes(Arrays.asList(sam_2_1_3_truth,sam_2_2_3_truth)); + + Pair testDataSite3 = new Pair(eval_1_builder.make(),truth_1_builder.make()); + + // site 4 - + // sample 1: unavailable/het + // sample 2: unavailable/ref + Genotype sam_2_1_4_eval = GenotypeBuilder.create("test2_sample1",new ArrayList(0)); + Genotype sam_2_2_4_eval = GenotypeBuilder.create("test2_sample2",new ArrayList(0)); + Genotype sam_2_1_4_truth = GenotypeBuilder.create("test2_sample1",Arrays.asList(reference_A,alt_T)); + Genotype sam_2_2_4_truth = GenotypeBuilder.create("test2_sample2",Arrays.asList(reference_A,reference_A)); + + loc = genomeLocParser.createGenomeLoc("chr1",6,6); + eval_1_builder = new VariantContextBuilder(); + truth_1_builder = new VariantContextBuilder(); + eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + eval_1_builder.alleles(Arrays.asList(reference_A,alt_T)); + truth_1_builder.alleles(Arrays.asList(reference_A,alt_T)); + eval_1_builder.genotypes(Arrays.asList(sam_2_1_4_eval,sam_2_2_4_eval)); + truth_1_builder.genotypes(Arrays.asList(sam_2_1_4_truth,sam_2_2_4_truth)); + + Pair testDataSite4 = new Pair(eval_1_builder.make(),truth_1_builder.make()); + + // site 5 - + // sample 1: hom-var/no-call + // sample 2: het/het + Genotype sam_2_1_5_eval = GenotypeBuilder.create("test2_sample1",Arrays.asList(alt_C,alt_C)); + Genotype sam_2_2_5_eval = GenotypeBuilder.create("test2_sample2",Arrays.asList(reference_A,alt_C)); + Genotype sam_2_1_5_truth = GenotypeBuilder.create("test2_sample1",Arrays.asList(Allele.NO_CALL,Allele.NO_CALL)); + Genotype sam_2_2_5_truth = GenotypeBuilder.create("test2_sample2",Arrays.asList(reference_A,alt_C)); + + loc = genomeLocParser.createGenomeLoc("chr1",7,7); + eval_1_builder = new VariantContextBuilder(); + 
truth_1_builder = new VariantContextBuilder(); + eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); + eval_1_builder.alleles(Arrays.asList(reference_A,alt_C)); + truth_1_builder.alleles(Arrays.asList(reference_A,alt_C)); + eval_1_builder.genotypes(Arrays.asList(sam_2_1_5_eval,sam_2_2_5_eval)); + truth_1_builder.genotypes(Arrays.asList(sam_2_1_5_truth,sam_2_2_5_truth)); + + Pair testDataSite5 = new Pair(eval_1_builder.make(),truth_1_builder.make()); + + return Arrays.asList(testDataSite1,testDataSite2,testDataSite3,testDataSite4,testDataSite5); + } + + @Test(enabled=true) + public void testMultiSite() { + int[][] sample1_expected = new int[GenotypeType.values().length][GenotypeType.values().length]; + int[][] sample2_expected = new int[GenotypeType.values().length][GenotypeType.values().length]; + // order: no-call,ref,het,hom-var,unavailable,mixed + sample1_expected[0] = new int[]{0,1,0,0,0,0}; + sample2_expected[0] = new int[]{0,0,0,0,0,0}; + sample1_expected[1] = new int[]{0,1,0,0,0,0}; + sample2_expected[1] = new int[]{0,0,0,0,0,0}; + sample1_expected[2] = new int[]{0,0,0,0,0,0}; + sample2_expected[2] = new int[]{0,1,2,0,0,0}; + sample1_expected[3] = new int[]{1,0,0,0,0,0}; + sample2_expected[3] = new int[]{0,0,0,1,0,0}; + sample1_expected[4] = new int[]{0,0,1,0,0,0}; + sample2_expected[4] = new int[]{0,1,0,0,0,0}; + + List> data = getData6(); + + VCFCodec codec = new VCFCodec(); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); + VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); + ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false); + + for ( Pair contextPair : data ) { + VariantContext eval = contextPair.getFirst(); + 
VariantContext comp = contextPair.getSecond(); + logger.warn(eval.toString()); + logger.warn(comp.toString()); + Assert.assertTrue(eval != null); + Assert.assertTrue(comp != null); + Assert.assertTrue(eval.getGenotype("test2_sample1") != null); + Assert.assertTrue(comp.getGenotype("test2_sample1") != null); + Assert.assertTrue(eval.getGenotype("test2_sample2") != null); + Assert.assertTrue(comp.getGenotype("test2_sample2") != null); + metrics.update(eval,comp); + } + + int[][] sample1_observed = metrics.getGenotypeConcordance("test2_sample1").getTable(); + int[][] sample2_observed = metrics.getGenotypeConcordance("test2_sample2").getTable(); + for ( GenotypeType eType : GenotypeType.values() ) { + for ( GenotypeType cType : GenotypeType.values() ) { + Assert.assertEquals(sample1_expected[eType.ordinal()][cType.ordinal()],sample1_observed[eType.ordinal()][cType.ordinal()]); + Assert.assertEquals(sample2_expected[eType.ordinal()][cType.ordinal()],sample2_observed[eType.ordinal()][cType.ordinal()]); + } + } + } + + @Test(enabled=true) + public void testNRD_testNRS_testMargins() { + Pair data = getData3(); + VariantContext eval = data.getFirst(); + VariantContext truth = data.getSecond(); + VCFCodec codec = new VCFCodec(); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false); + int[][] table = metrics.getOverallGenotypeConcordance().getTable(); + // set up the table + table[0] = new int[] {30, 12, 7, 5, 6, 0}; + table[1] = new int[] {10, 100, 5, 1, 7, 1}; + table[2] = new int[] {5, 7, 150, 3, 3, 1}; + table[3] = new int[] {3, 2, 6, 50, 1, 0}; + table[4] = new int[] {10, 6, 3, 3, 2, 0}; + table[5] = new int[] {12, 0, 34, 
20, 10, 0}; + double EXPEC_NRS = 0.8969957; + double EXPEC_NRD = 0.1071429; + double EXPEC_OGC = 0.92592592; // (100+150+50)/(100+5+1+150+7+3+50+2+6) + Assert.assertEquals(EXPEC_NRS,metrics.getOverallNRS(),1e-7); + Assert.assertEquals(EXPEC_NRD,metrics.getOverallNRD(),1e-7); + Assert.assertEquals(EXPEC_OGC,metrics.getOverallOGC(),1e-7); + int EXPEC_EVAL_REF = 124; + int EXPEC_EVAL_HET = 169; + int EXPEC_EVAL_VAR = 62; + int EXPEC_COMP_REF = 127; + int EXPEC_COMP_HET = 205; + int EXPEC_COMP_VAR = 82; + Assert.assertEquals(metrics.getOverallGenotypeConcordance().getnEvalGenotypes(GenotypeType.HOM_REF),EXPEC_EVAL_REF); + Assert.assertEquals(metrics.getOverallGenotypeConcordance().getnEvalGenotypes(GenotypeType.HET),EXPEC_EVAL_HET); + Assert.assertEquals(metrics.getOverallGenotypeConcordance().getnEvalGenotypes(GenotypeType.HOM_VAR),EXPEC_EVAL_VAR); + Assert.assertEquals(metrics.getOverallGenotypeConcordance().getnCompGenotypes(GenotypeType.HOM_REF),EXPEC_COMP_REF); + Assert.assertEquals(metrics.getOverallGenotypeConcordance().getnCompGenotypes(GenotypeType.HET),EXPEC_COMP_HET); + Assert.assertEquals(metrics.getOverallGenotypeConcordance().getnCompGenotypes(GenotypeType.HOM_VAR),EXPEC_COMP_VAR); + } + + @Test(enabled=true) + public void testRobustness() { + VCFCodec codec = new VCFCodec(); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_1)))); + VCFHeader disjointCompHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_2)))); + VCFHeader overlapCompHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_3)))); + ConcordanceMetrics disjointMetrics = new ConcordanceMetrics(evalHeader,disjointCompHeader,false); + ConcordanceMetrics overlapMetrics = new 
ConcordanceMetrics(evalHeader,overlapCompHeader,false); + + // test what happens if you put in disjoint sets and start making requests + Assert.assertEquals(0,disjointMetrics.getPerSampleGenotypeConcordance().size()); + String msg = "No Exception Thrown"; + try { + disjointMetrics.getGenotypeConcordance("test3_sample4"); + } catch ( Exception e) { + msg = e.getMessage(); + } + Assert.assertEquals("Attempted to request the concordance table for sample test3_sample4 on which it was not calculated",msg); + + // test that the overlapping sample is in the overlapping table (basically do this without throwing an exception) + overlapMetrics.getGenotypeConcordance("test3_sample3"); + + String msg2 = "No Exception Thrown"; + try { + disjointMetrics.getGenotypeConcordance("test3_sample4"); + } catch ( Exception e) { + msg2 = e.getMessage(); + } + Assert.assertEquals("Attempted to request the concordance table for sample test3_sample4 on which it was not calculated",msg2); + + // test what happens if you try to calculate NRS and NRD on an empty table + Assert.assertEquals(disjointMetrics.getOverallNRD(), 1.0, 1e-16); + Assert.assertEquals(disjointMetrics.getOverallNRS(), 0.0, 1e-16); + } + + public List> getData7() { + + Allele ref1 = Allele.create(BaseUtils.Base.T.base,true); + Allele alt1 = Allele.create(BaseUtils.Base.C.base); + Allele alt2 = Allele.create(BaseUtils.Base.G.base); + Allele alt3 = Allele.create(BaseUtils.Base.A.base); + + GenomeLoc loc1 = genomeLocParser.createGenomeLoc("chr1",1,1); + VariantContextBuilder site1Eval = new VariantContextBuilder(); + VariantContextBuilder site1Comp = new VariantContextBuilder(); + + + // site 1: eval superset comp + site1Eval.loc(loc1.getContig(),loc1.getStart(),loc1.getStop()); + site1Comp.loc(loc1.getContig(),loc1.getStart(),loc1.getStop()); + site1Eval.alleles(Arrays.asList(ref1,alt1,alt2)); + site1Comp.alleles(Arrays.asList(ref1,alt2)); + 
site1Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt2))); + site1Comp.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt2)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt2))); + + // site 2: eval subset comp + GenomeLoc loc2 = genomeLocParser.createGenomeLoc("chr1",2,2); + VariantContextBuilder site2Eval = new VariantContextBuilder(); + VariantContextBuilder site2Comp = new VariantContextBuilder(); + site2Eval.loc(loc2.getContig(),loc2.getStart(),loc2.getStop()); + site2Comp.loc(loc2.getContig(),loc2.getStart(),loc2.getStop()); + site2Eval.alleles(Arrays.asList(ref1,alt1)); + site2Comp.alleles(Arrays.asList(ref1,alt1,alt3)); + site2Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt1))); + site2Comp.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt3)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt1))); + + // site 3: eval only + GenomeLoc loc3 = genomeLocParser.createGenomeLoc("chr1",3,3); + VariantContextBuilder site3Eval = new VariantContextBuilder(); + VariantContextBuilder site3Comp = new VariantContextBuilder(); + site3Eval.loc(loc3.getContig(),loc3.getStart(),loc3.getStop()); + site3Comp.loc(loc3.getContig(),loc3.getStart(),loc3.getStop()); + site3Eval.alleles(Arrays.asList(ref1,alt1)); + site3Comp.alleles(Arrays.asList(ref1,alt1)); + site3Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt1))); + site3Comp.genotypes(GenotypeBuilder.create("test2_sample1",new ArrayList(0)),GenotypeBuilder.create("test2_sample2",new ArrayList(0))); + + // site 4: comp only - monomorphic + GenomeLoc loc4 = genomeLocParser.createGenomeLoc("chr1",4,4); + VariantContextBuilder site4Eval = new VariantContextBuilder(); + VariantContextBuilder 
site4Comp = new VariantContextBuilder(); + site4Eval.loc(loc4.getContig(),loc4.getStart(),loc4.getStop()); + site4Comp.loc(loc4.getContig(),loc4.getStart(),loc4.getStop()); + site4Eval.alleles(Arrays.asList(ref1,alt1)); + site4Comp.alleles(Arrays.asList(ref1,alt1)); + site4Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,ref1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,ref1))); + site4Comp.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt1))); + + // site 5: overlapping + GenomeLoc loc5 = genomeLocParser.createGenomeLoc("chr1",5,5); + VariantContextBuilder site5Eval = new VariantContextBuilder(); + VariantContextBuilder site5Comp = new VariantContextBuilder(); + site5Eval.loc(loc5.getContig(),loc5.getStart(),loc5.getStop()); + site5Comp.loc(loc5.getContig(),loc5.getStart(),loc5.getStop()); + site5Eval.alleles(Arrays.asList(ref1,alt1,alt3)); + site5Comp.alleles(Arrays.asList(ref1,alt1,alt3)); + site5Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(alt1,alt3))); + site5Comp.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(alt1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(alt3,alt3))); + + // site 6: some non-matching alts + GenomeLoc loc6 = genomeLocParser.createGenomeLoc("chr1",6,6); + VariantContextBuilder site6Eval = new VariantContextBuilder(); + VariantContextBuilder site6Comp = new VariantContextBuilder(); + site6Eval.loc(loc6.getContig(),loc6.getStart(),loc6.getStop()); + site6Comp.loc(loc6.getContig(),loc6.getStart(),loc6.getStop()); + site6Eval.alleles(Arrays.asList(ref1,alt1,alt2)); + site6Comp.alleles(Arrays.asList(ref1,alt1,alt3)); + site6Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt2))); + 
site6Comp.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt3))); + + // site 7: matching with no-calls + GenomeLoc loc7 = genomeLocParser.createGenomeLoc("chr1",7,7); + VariantContextBuilder site7Eval = new VariantContextBuilder(); + VariantContextBuilder site7Comp = new VariantContextBuilder(); + site7Eval.loc(loc7.getContig(),loc7.getStart(),loc7.getStop()); + site7Comp.loc(loc7.getContig(),loc7.getStart(),loc7.getStop()); + site7Eval.alleles(Arrays.asList(ref1,alt1)); + site7Comp.alleles(Arrays.asList(ref1,alt1)); + site7Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(Allele.NO_CALL,Allele.NO_CALL))); + site7Comp.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt1))); + + Pair site1 = new Pair(site1Eval.make(),site1Comp.make()); + Pair site2 = new Pair(site2Eval.make(),site2Comp.make()); + Pair site3 = new Pair(site3Eval.make(),site3Comp.make()); + Pair site4 = new Pair(site4Eval.make(),site4Comp.make()); + Pair site5 = new Pair(site5Eval.make(),site5Comp.make()); + Pair site6 = new Pair(site6Eval.make(),site6Comp.make()); + Pair site7 = new Pair(site7Eval.make(),site7Comp.make()); + + return Arrays.asList(site1,site2,site3,site4,site5,site6,site7); + } + + @Test(enabled = true) + public void testSites() { + VCFCodec codec = new VCFCodec(); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); + VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); + ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false); + + List> data = getData7(); + + int idx = 0; + int[] expecNotMatch = 
new int[]{0,0,0,0,0,1,1}; + for ( Pair varPair : data ) { + metrics.update(varPair.getFirst(),varPair.getSecond()); + Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.ALLELES_DO_NOT_MATCH),expecNotMatch[idx]); + logger.info(idx); + idx++; + } + + Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.ALLELES_DO_NOT_MATCH),1); + Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.ALLELES_MATCH),2); + Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.EVAL_ONLY),1); + Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.TRUTH_ONLY),1); + Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.EVAL_SUBSET_TRUTH),1); + Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.EVAL_SUPERSET_TRUTH),1); + + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFsIntegrationTest.java new file mode 100644 index 000000000..1ca23caba --- /dev/null +++ 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -0,0 +1,116 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class GenotypeGVCFsIntegrationTest extends WalkerTest { + + private static String baseTestString(String args, String ref) { + return "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + ref + args; + } + + @Test(enabled = true) + public void combineSingleSamplePipelineGVCF() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + + " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + + " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + + " -L 20:10,000,000-20,000,000", b37KGReference), + 1, + Arrays.asList("2be5f6f7e7f79841108906555d548683")); + executeTest("combineSingleSamplePipelineGVCF", spec); + } + + @Test(enabled = false) // TODO -- reenable when this option works + public void combineSingleSamplePipelineGVCF_includeNonVariants() { + 
WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + + " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + + " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + + " -inv -L 20:10,000,000-10,010,000", b37KGReference), + 1, + Arrays.asList("de957075796512cb9f333f77515e97d5")); + executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec); + } + + @Test(enabled = true) + public void combineSingleSamplePipelineGVCF_addDbsnp() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + + " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + + " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + + " -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference), + 1, + Arrays.asList("e3c7452277898fece54bf60af9588666")); + executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec); + } + + @Test(enabled = true) + public void testJustOneSample() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference + + " -V " + privateTestDir + "gvcfExample1.vcf", + 1, + Arrays.asList("bee009201ec3ad7b4f42f913e7ef1367")); + executeTest("testJustOneSample", spec); + } + + @Test(enabled = true) + public void testSamplesWithDifferentLs() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference + + " -V " + privateTestDir + "gvcfExample1.vcf" + + " -V " + privateTestDir + "gvcfExample2.vcf", + 1, + Arrays.asList("67410d8ac490e3c9d19ba7a4cceaf8fb")); + executeTest("testSamplesWithDifferentLs", spec); + } +} \ No newline at end of file diff --git 
a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java new file mode 100644 index 000000000..e2f17a65f --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java @@ -0,0 +1,77 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Tests LeftAlignAndTrimVariants + */ +public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest { + + @Test + public void testLeftAlignment() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forLeftAlignVariantsTest.vcf --no_cmdline_in_header", + 1, + Arrays.asList("bcf05f56adbb32a47b6d6b27b327d5c2")); + executeTest("test left alignment", spec); + } + + @Test + public void testLeftAlignmentWithTrimmingAndMultialleliecs() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --no_cmdline_in_header -trim -split", + 1, + Arrays.asList("d12468cf08cfd14354f781d5f42b279f")); + executeTest("test left alignment with trimming and hard multiple alleles", spec); + + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java similarity index 100% rename from 
protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtilsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtilsUnitTest.java new file mode 100644 index 000000000..87f664905 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtilsUnitTest.java @@ -0,0 +1,487 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +/** + * Created by IntelliJ IDEA. 
+ * User: ebanks + * Date: 12/8/13 + */ + +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + + +public class PosteriorLikelihoodsUtilsUnitTest extends BaseTest { + + Allele Aref, T, C, G, Cref, ATC, ATCATC; + + @BeforeSuite + public void setup() { + // alleles + Aref = Allele.create("A", true); + Cref = Allele.create("C", true); + T = Allele.create("T"); + C = Allele.create("C"); + G = Allele.create("G"); + ATC = Allele.create("ATC"); + ATCATC = Allele.create("ATCATC"); + } + + private String arraysEq(int[] a, int[] b) { + if ( a.length != b.length ) { + return String.format("NEQ: %s | %s",Arrays.toString(a),Arrays.toString(b)); + } + for ( int idx = 0; idx < a.length; idx++) { + if ( a[idx] - b[idx] > 1 || b[idx] - a[idx] > 1) { + return String.format("NEQ: %s | %s",Arrays.toString(a),Arrays.toString(b)); + } + } + + return ""; + } + + private int[] _mleparse(List s) { + int[] mle = new int[s.size()]; + for ( int idx = 0; idx < mle.length; idx ++) { + mle[idx] = s.get(idx); + } + + return mle; + } + + private Genotype makeGwithPLs(String sample, Allele a1, Allele a2, double[] pls) { + Genotype gt = new GenotypeBuilder(sample, Arrays.asList(a1, a2)).PL(pls).make(); + if ( pls != null && pls.length > 0 ) { + Assert.assertNotNull(gt.getPL()); + Assert.assertTrue(gt.getPL().length > 0); + for ( int i : gt.getPL() ) { + Assert.assertTrue(i >= 0); + } + Assert.assertNotEquals(Arrays.toString(gt.getPL()),"[0]"); + } + return gt; + } + + private Genotype makeG(String sample, Allele a1, Allele a2) { + return GenotypeBuilder.create(sample, Arrays.asList(a1, a2)); + } + + 
private Genotype makeG(String sample, Allele a1, Allele a2, int... pls) { + return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).PL(pls).make(); + } + + private VariantContext makeVC(String source, List alleles, Genotype... genotypes) { + int start = 10; + int stop = start; // alleles.contains(ATC) ? start + 3 : start; + return new VariantContextBuilder(source, "1", start, stop, alleles).genotypes(Arrays.asList(genotypes)).filters(null).make(); + } + + @Test + private void testCalculatePosteriorNoExternalData() { + VariantContext test1 = makeVC("1",Arrays.asList(Aref,T), makeG("s1",Aref,T,20,0,10), + makeG("s2",T,T,60,40,0), + makeG("s3",Aref,Aref,0,30,90)); + test1 = new VariantContextBuilder(test1).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,3).make(); + VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(test1, new ArrayList(), 0, 0.001, true, false, false); + Genotype test1exp1 = makeGwithPLs("s1",Aref,T,new double[]{-2.20686, -0.03073215, -1.20686}); + Assert.assertTrue(test1exp1.hasPL()); + Genotype test1exp2 = makeGwithPLs("s2",T,T,new double[]{-6.000066, -3.823938, -6.557894e-05}); + Genotype test1exp3 = makeGwithPLs("s3",Aref,Aref,new double[]{-0.0006510083, -2.824524, -9.000651}); + Assert.assertEquals("java.util.ArrayList",test1result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY).getClass().getCanonicalName()); + Assert.assertEquals(arraysEq(test1exp1.getPL(), _mleparse((List)test1result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test1exp2.getPL(),_mleparse((List)test1result.getGenotype(1).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test1exp3.getPL(),_mleparse((List)test1result.getGenotype(2).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + + // AA AB BB AC BC CC + // AA AC CC AT CT TT + VariantContext test2 = makeVC("2",Arrays.asList(Aref,C,T), + 
makeG("s1",Aref,T,30,10,60,0,15,90), + makeG("s2",Aref,C,40,0,10,30,40,80), + makeG("s3",Aref,Aref,0,5,8,15,20,40), + makeG("s4",C,T,80,40,12,20,0,10)); + test2 = new VariantContextBuilder(test2).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,new ArrayList(Arrays.asList(2,2))).make(); + VariantContext test2result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(test2,new ArrayList(),5,0.001,true,false,false); + Genotype test2exp1 = makeGwithPLs("s1",Aref,T,new double[]{-2.647372, -1.045139, -6.823193, -0.04513873, -2.198182, -9.823193}); + Genotype test2exp2 = makeGwithPLs("s2",Aref,C,new double[]{-3.609957, -0.007723248, -1.785778, -3.007723, -4.660767, -8.785778}); + Genotype test2exp3 = makeGwithPLs("s3",Aref,Aref,new double[] {-0.06094877, -0.9587151, -2.03677,-1.958715, -3.111759, -5.23677}); + Genotype test2exp4 = makeGwithPLs("s4",C,T,new double[]{-7.016534, -3.4143, -1.392355, -1.4143, -0.06734388, -1.192355}); + Assert.assertEquals(arraysEq(test2exp1.getPL(),(int[]) _mleparse((List)test2result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test2exp2.getPL(),(int[]) _mleparse((List)test2result.getGenotype(1).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test2exp3.getPL(),(int[]) _mleparse((List)test2result.getGenotype(2).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test2exp4.getPL(),(int[]) _mleparse((List)test2result.getGenotype(3).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + } + + @Test + private void testCalculatePosteriorSamplePlusExternal() { + VariantContext testOverlappingBase = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,40,20,0), + makeG("s2",Aref,T,18,0,24), + makeG("s3",Aref,T,22,0,12)); + List supplTest1 = new ArrayList<>(3); + supplTest1.add(new 
VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,2).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); + supplTest1.add(new VariantContextBuilder(makeVC("3",Arrays.asList(Aref,T))).attribute(VCFConstants.ALLELE_COUNT_KEY,4).attribute(VCFConstants.ALLELE_NUMBER_KEY,22).make()); + supplTest1.add(makeVC("4",Arrays.asList(Aref,T), + makeG("s_1",T,T), + makeG("s_2",Aref,T))); + VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); + // the counts here are ref=30, alt=14 + Genotype test1exp1 = makeGwithPLs("t1",T,T,new double[]{-3.370985, -1.415172, -0.01721766}); + Genotype test1exp2 = makeGwithPLs("t2",Aref,T,new double[]{-1.763792, -0.007978791, -3.010024}); + Genotype test1exp3 = makeGwithPLs("t3",Aref,T,new double[]{-2.165587, -0.009773643, -1.811819}); + Assert.assertEquals(arraysEq(test1exp1.getPL(),_mleparse((List) test1result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test1exp2.getPL(),_mleparse((List) test1result.getGenotype(1).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test1exp3.getPL(),_mleparse((List) test1result.getGenotype(2).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + + VariantContext testNonOverlapping = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,3,1,0)); + List other = Arrays.asList(makeVC("2",Arrays.asList(Aref,C),makeG("s2",C,C,10,2,0))); + VariantContext test2result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testNonOverlapping,other,0,0.001,true,false,false); + Genotype test2exp1 = makeGwithPLs("SGV",T,T,new double[]{-4.078345, -3.276502, -0.0002661066}); + Assert.assertEquals(arraysEq(test2exp1.getPL(),_mleparse((List) test2result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + } + + @Test + private void 
testCalculatePosteriorHOM_VARtoHET() { + VariantContext testOverlappingBase = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,40,1,0)); + List supplTest1 = new ArrayList<>(1); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,500).attribute(VCFConstants.ALLELE_NUMBER_KEY,1000).make()); + VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); + + int[] GP = _mleparse( (List)test1result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY)); + Assert.assertTrue(GP[2] > GP[1]); + } + + @Test + private void testCalculatePosteriorHETtoHOM_VAR() { + VariantContext testOverlappingBase = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,40,0,1)); + List supplTest1 = new ArrayList<>(1); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,900).attribute(VCFConstants.ALLELE_NUMBER_KEY,1000).make()); + VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); + + int[] GP = _mleparse( (List)test1result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY)); + Assert.assertTrue(GP[2] < GP[1]); + } + + @Test + private void testCalculatePosteriorHOM_REFtoHET() { + VariantContext testOverlappingBase = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,0,1,40)); + List supplTest1 = new ArrayList<>(1); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,500).attribute(VCFConstants.ALLELE_NUMBER_KEY,1000).make()); + VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); + + int[] GP = _mleparse( (List)test1result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY)); + Assert.assertTrue(GP[0] > 
GP[1]); + } + + @Test + private void testCalculatePosteriorHETtoHOM_REF() { + VariantContext testOverlappingBase = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,1,0,40)); + List supplTest1 = new ArrayList<>(1); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,100).attribute(VCFConstants.ALLELE_NUMBER_KEY,1000).make()); + VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); + + int[] GP = _mleparse( (List)test1result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY)); + Assert.assertTrue(GP[0] < GP[1]); + } + + @Test + private void testMLEACgreaterThanAN() { + VariantContext testOverlappingBase = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,40,20,0), + makeG("s2",Aref,T,18,0,24), + makeG("s3",Aref,T,22,0,12)); + List supplTest1 = new ArrayList<>(1); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,11).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); + VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); + } + + @Test (expectedExceptions = {UserException.class}) + private void testWrongNumberACvalues() { + VariantContext testOverlappingBase = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,40,20,0), + makeG("s2",Aref,T,18,0,24), + makeG("s3",Aref,T,22,0,12)); + List supplTest1 = new ArrayList<>(1); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T,C))).attribute(VCFConstants.ALLELE_COUNT_KEY,5).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); + + VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); + } + + @Test (expectedExceptions = {UserException.class}) + private void testWrongNumberMLEACvalues() { + VariantContext 
testOverlappingBase = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,40,20,0), + makeG("s2",Aref,T,18,0,24), + makeG("s3",Aref,T,22,0,12)); + List supplTest1 = new ArrayList<>(1); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T,C))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,5).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); + VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); + } + + @Test + private void testMultipleACvalues() { + VariantContext testOverlappingBase = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,40,20,0), + makeG("s2",Aref,T,18,0,24), + makeG("s3",Aref,T,22,0,12)); + List supplTest1 = new ArrayList<>(1); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T,C))).attribute(VCFConstants.ALLELE_COUNT_KEY,Arrays.asList(5,4)).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); + VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); + } + + @Test + private void testMultipleMLEACvalues() { + VariantContext testOverlappingBase = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,40,20,0), + makeG("s2",Aref,T,18,0,24), + makeG("s3",Aref,T,22,0,12)); + List supplTest1 = new ArrayList<>(1); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T,C))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,Arrays.asList(5,4)).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); + VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); + } + + private double[] pl2gl(int[] pl) { + double[] gl = new double[pl.length]; + for ( int idx = 0; idx < gl.length; idx++ ) { + gl[idx] = pl[idx]/(-10.0); + } + + return MathUtils.normalizeFromLog10(gl,true); + } + + @Test + private void testCalculatePosterior() { + int[][] likelihood_PLs = new int[][]{ + new 
int[]{3,0,3}, + new int[]{99,0,99}, + new int[]{50,20,0}, + new int[]{10,0,50}, + new int[]{80,60,0}, + new int[]{0,42,44}}; + + int[] altCounts = new int[]{10,40,90}; + int[] altAlleleNum = new int[]{100,500,1000}; + + double[] expected_post_10_100 = new double[] { + 9.250326e-03, 3.020208e-01, 6.887289e-01, + 7.693433e-12, 1.000000e+00, 5.728111e-10, + 1.340156e-07, 2.192982e-03, 9.978069e-01, + 6.073718e-03, 9.938811e-01, 4.522159e-05, + 1.343101e-10, 2.197802e-07, 9.999998e-01, + 9.960193e-01, 1.028366e-03, 2.952290e-03 + }; + + double[] expected_post_10_500 = new double[] { + 4.226647e-04, 7.513277e-02, 9.244446e-01, + 1.413080e-12, 1.000000e+00, 3.090662e-09, + 4.570232e-09, 4.071661e-04, 9.995928e-01, + 1.120916e-03, 9.986339e-01, 2.451646e-04, + 4.572093e-12, 4.073320e-08, 1.000000e+00, + 9.151689e-01, 5.144399e-03, 7.968675e-02 + }; + + double[] expected_post_10_1000 = new double[] { + 1.077685e-04, 3.870477e-02, 9.611875e-01, + 6.994030e-13, 1.000000e+00, 6.237975e-09, + 1.120976e-09, 2.017756e-04, 9.997982e-01, + 5.549722e-04, 9.989500e-01, 4.949797e-04, + 1.121202e-12, 2.018163e-08, 1.000000e+00, + 7.318346e-01, 8.311615e-03, 2.598538e-01 + }; + + double[] expected_post_40_100 = new double[] { + 1.102354e-01, 6.437516e-01, 2.460131e-01, + 4.301328e-11, 1.000000e+00, 9.599306e-11, + 4.422850e-06, 1.294493e-02, 9.870507e-01, + 3.303763e-02, 9.669550e-01, 7.373032e-06, + 4.480868e-09, 1.311474e-06, 9.999987e-01, + 9.997266e-01, 1.846199e-04, 8.882157e-05 + }; + + double[] expected_post_40_500 = new double[] { + 5.711785e-03, 2.557266e-01, 7.385617e-01, + 5.610428e-12, 1.000000e+00, 7.254558e-10, + 7.720262e-08, 1.732352e-03, 9.982676e-01, + 4.436495e-03, 9.955061e-01, 5.736604e-05, + 7.733659e-11, 1.735358e-07, 9.999998e-01, + 9.934793e-01, 1.406575e-03, 5.114153e-03 + }; + + double[] expected_post_40_1000 = new double[] { + 1.522132e-03, 1.422229e-01, 8.562549e-01, + 2.688330e-12, 1.000000e+00, 1.512284e-09, + 1.776184e-08, 8.317737e-04, 9.991682e-01, + 
2.130611e-03, 9.977495e-01, 1.198547e-04, + 1.777662e-11, 8.324661e-08, 9.999999e-01, + 9.752770e-01, 2.881677e-03, 2.184131e-02 + }; + + double[] expected_post_90_100 = new double[] { + 6.887289e-01, 3.020208e-01, 9.250326e-03, + 5.728111e-10, 1.000000e+00, 7.693433e-12, + 6.394346e-04, 1.405351e-01, 8.588255e-01, + 3.127146e-01, 6.872849e-01, 4.200075e-07, + 7.445327e-07, 1.636336e-05, 9.999829e-01, + 9.999856e-01, 1.386699e-05, 5.346906e-07 + }; + + double[] expected_post_90_500 = new double[] { + 2.528165e-02, 4.545461e-01, 5.201723e-01, + 1.397100e-11, 1.000000e+00, 2.874546e-10, + 4.839050e-07, 4.360463e-03, 9.956391e-01, + 1.097551e-02, 9.890019e-01, 2.258221e-05, + 4.860244e-10, 4.379560e-07, 9.999996e-01, + 9.986143e-01, 5.677671e-04, 8.179741e-04 + }; + + double[] expected_post_90_1000 = new double[] { + 7.035938e-03, 2.807708e-01, 7.121932e-01, + 6.294627e-12, 1.000000e+00, 6.371561e-10, + 9.859771e-08, 1.971954e-03, 9.980279e-01, + 4.974874e-03, 9.949748e-01, 5.035678e-05, + 9.879252e-11, 1.975850e-07, 9.999998e-01, + 9.947362e-01, 1.255272e-03, 4.008518e-03 + }; + + double[][] expectations = new double[][] { + expected_post_10_100, + expected_post_10_500, + expected_post_10_1000, + expected_post_40_100, + expected_post_40_500, + expected_post_40_1000, + expected_post_90_100, + expected_post_90_500, + expected_post_90_1000 + }; + + int testIndex = 0; + for ( int altCount : altCounts ) { + for ( int numAlt : altAlleleNum ) { + double[] knownCounts = new double[2]; + knownCounts[0] = altCount; + knownCounts[1] = numAlt-altCount; + int expected_index = 0; + for ( int gl_index = 0; gl_index < likelihood_PLs.length; gl_index++ ) { + double[] post = PosteriorLikelihoodsUtils.calculatePosteriorGLs(pl2gl(likelihood_PLs[gl_index]), knownCounts, 2); + for ( int i = 0; i < post.length; i++ ) { + double expected = expectations[testIndex][expected_index++]; + double observed = Math.pow(10.0,post[i]); + double err = Math.abs( (expected-observed)/expected ); + 
Assert.assertTrue(err < 1e-4, String.format("Counts: %s | Expected: %e | Observed: %e | pre %s | prior %s | post %s", + Arrays.toString(knownCounts), expected,observed, Arrays.toString(pl2gl(likelihood_PLs[gl_index])), + Arrays.toString(PosteriorLikelihoodsUtils.getDirichletPrior(knownCounts,2)),Arrays.toString(post))); + } + } + testIndex++; + } + } + } + + private boolean arraysApproxEqual(double[] a, double[] b, double tol) { + if ( a.length != b.length ) { + return false; + } + + for ( int idx = 0; idx < a.length; idx++ ) { + if ( Math.abs(a[idx]-b[idx]) > tol ) { + return false; + } + } + + return true; + } + + private String errMsgArray(double[] a, double[] b) { + return String.format("Expected %s, Observed %s", Arrays.toString(a), Arrays.toString(b)); + } + + @Test + private void testPosteriorMultiAllelic() { + // AA AB BB AC BC CC AD BD CD DD + int[] PL_one = new int[] {40,20,30,0,15,25}; + int[] PL_two = new int[] {0,20,10,99,99,99}; + int[] PL_three = new int[] {50,40,0,30,30,10,20,40,80,50}; + int[] PL_four = new int[] {99,90,85,10,5,30,40,20,40,30,0,12,20,14,5}; + int[] PL_five = new int[] {60,20,30,0,40,10,8,12,18,22,40,12,80,60,20}; + double[] counts_one = new double[]{100.001,40.001,2.001}; + double[] counts_two = new double[]{2504.001,16.001,218.001}; + double[] counts_three = new double[]{10000.001,500.001,25.001,0.001}; + double[] counts_four = new double[]{4140.001,812.001,32.001,104.001,12.001}; + double[] counts_five = new double[]{80.001,40.001,8970.001,200.001,1922.001}; + + double expected_one[] = new double[] { -2.684035, -0.7852596, -2.4735, -0.08608339, -1.984017, -4.409852 }; + double expected_two[] = new double[] { -5.736189e-05, -3.893688, -5.362878, -10.65938, -12.85386, -12.0186}; + double expected_three[] = new double[] {-2.403234, -2.403276, -0.004467802, -2.70429, -4.005319, -3.59033, -6.102247, -9.403276, -14.70429, -13.40284}; + double expected_four[] = new double[] {-7.828677, -7.335196, -7.843136, -0.7395892, -0.947033, 
-5.139092, -3.227715, + -1.935159, -5.339552, -4.124552, -0.1655353, -2.072979, -4.277372, -3.165498, -3.469589 }; + double expected_five[] = new double[] { -9.170334, -5.175724, -6.767055, -0.8250021, -5.126027, -0.07628661, -3.276762, + -3.977787, -2.227065, -4.57769, -5.494041, -2.995066, -7.444344, -7.096104, -2.414187}; + + double[] post1 = PosteriorLikelihoodsUtils.calculatePosteriorGLs(pl2gl(PL_one),counts_one,2); + double[] post2 = PosteriorLikelihoodsUtils.calculatePosteriorGLs(pl2gl(PL_two),counts_two,2); + double[] post3 = PosteriorLikelihoodsUtils.calculatePosteriorGLs(pl2gl(PL_three),counts_three,2); + double[] post4 = PosteriorLikelihoodsUtils.calculatePosteriorGLs(pl2gl(PL_four),counts_four,2); + double[] post5 = PosteriorLikelihoodsUtils.calculatePosteriorGLs(pl2gl(PL_five),counts_five,2); + + double[] expecPrior5 = new double[] {-4.2878195, -4.2932090, -4.8845400, -1.9424874, -2.2435120, -0.1937719, -3.5942477, + -3.8952723, -1.5445506, -3.4951749, -2.6115263, -2.9125508, -0.5618292, -2.2135895, + -1.5316722}; + + Assert.assertTrue(arraysApproxEqual(expecPrior5, PosteriorLikelihoodsUtils.getDirichletPrior(counts_five,2),1e-5),errMsgArray(expecPrior5,PosteriorLikelihoodsUtils.getDirichletPrior(counts_five,2))); + + Assert.assertTrue(arraysApproxEqual(expected_one,post1,1e-6),errMsgArray(expected_one,post1)); + Assert.assertTrue(arraysApproxEqual(expected_two,post2,1e-5),errMsgArray(expected_two,post2)); + Assert.assertTrue(arraysApproxEqual(expected_three,post3,1e-5),errMsgArray(expected_three,post3)); + Assert.assertTrue(arraysApproxEqual(expected_four,post4,1e-5),errMsgArray(expected_four,post4)); + Assert.assertTrue(arraysApproxEqual(expected_five,post5,1e-5),errMsgArray(expected_five,post5)); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariantsIntegrationTest.java 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariantsIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariantsIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariantsIntegrationTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java new file mode 100644 index 000000000..703c044d4 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -0,0 +1,359 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class SelectVariantsIntegrationTest extends WalkerTest { + public static String baseTestString(String args) { + return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s --no_cmdline_in_header" + args; + } + + @Test + public void testDiscordanceNoSampleSpecified() { + String testFile = privateTestDir + "NA12878.hg19.example1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + hg19Reference + " -L 20:1012700-1020000 --variant " + + b37hapmapGenotypes + " -disc " + testFile + + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", + 1, + Arrays.asList("954415f84996d27b07d00855e96d33a2") + ); + spec.disableShadowBCF(); + + executeTest("testDiscordanceNoSampleSpecified--" + testFile, spec); + } + + @Test + public void testRepeatedLineSelection() { + String testfile = privateTestDir + "test.dup.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -sn A -sn B -sn C --variant " + testfile), + 1, + Arrays.asList("125d1c9fa111cd38dfa2ff3900f16b57") + ); + + executeTest("testRepeatedLineSelection--" + testfile, spec); + } + + @Test + public void testDiscordance() { + String testFile = privateTestDir + "NA12878.hg19.example1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " + + b37hapmapGenotypes + " -disc " + testFile + + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", + 1, + Arrays.asList("ca1b5226eaeaffb78d4abd9d2ee10c43") + ); + spec.disableShadowBCF(); + + executeTest("testDiscordance--" + testFile, spec); + } + + @Test + public void testComplexSelection() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = 
validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), + 1, + Arrays.asList("4386fbb258dcef4437495a37f5a83c53") + ); + spec.disableShadowBCF(); + executeTest("testComplexSelection--" + testfile, spec); + } + + @Test + public void testComplexSelectionWithNonExistingSamples() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES -sn A -se '[CDH]' -sn Z -sn T -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), + 1, + Arrays.asList("4386fbb258dcef4437495a37f5a83c53") + ); + spec.disableShadowBCF(); + executeTest("testComplexSelectionWithNonExistingSamples--" + testfile, spec); + } + + @Test + public void testNonExistingFieldSelection() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -env -ef -select 'foo!=0||DP>0' --variant " + testfile), + 1, + Arrays.asList("44e77cea624cfff2b8acc3a4b30485cb") // should yield empty vcf because the foo!=0 will yield complete expression false + ); + spec.disableShadowBCF(); + executeTest("testNonExistingSelection--" + testfile, spec); + } + + @Test + public void testSampleExclusionFromFileAndSeparateSample() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sn A -xl_sf " + samplesFile + " --variant " + testfile, + 1, + Arrays.asList("1f5c72951a35667c4bdf1be153787e27") + ); + spec.disableShadowBCF(); + 
+ executeTest("testSampleExclusion--" + testfile, spec); + } + + @Test + public void testSampleExclusionJustFromFile() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sf " + samplesFile + " --variant " + testfile, + 1, + Arrays.asList("875d7e00ac8081e87ab9fb1b20c83677") + ); + spec.disableShadowBCF(); + + executeTest("testSampleExclusion--" + testfile, spec); + } + + @Test + public void testSampleInclusionWithNonexistingSamples() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -sn A -sn Z -sn Q -sf " + samplesFile + " --variant " + testfile, + 1, + UserException.BadInput.class + ); + spec.disableShadowBCF(); + + executeTest("testSampleInclusionWithNonexistingSamples--" + testfile, spec); + } + + + @Test + public void testConcordance() { + String testFile = privateTestDir + "NA12878.hg19.example1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc " + + b37hapmapGenotypes + " --variant " + testFile + + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", + 1, + Arrays.asList("946e7f2e0ae08dc0e931c1634360fc46") + ); + spec.disableShadowBCF(); + + executeTest("testConcordance--" + testFile, spec); + } + + @Test + public void testVariantTypeSelection() { + String testFile = privateTestDir + "complexExample1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s 
--no_cmdline_in_header", + 1, + Arrays.asList("ca2b70e3171420b08b0a2659bfe2a794") + ); + + executeTest("testVariantTypeSelection--" + testFile, spec); + } + + @Test + public void testIndelLengthSelection() { + String testFile = privateTestDir + "complexExample1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --maxIndelSize 3", + 1, + Arrays.asList("004589868ca5dc887e2dff876b4cc797") + ); + + executeTest("testIndelLengthSelection--" + testFile, spec); + } + + @Test + public void testUsingDbsnpName() { + String testFile = privateTestDir + "combine.3.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("a554459c9ccafb9812ff6d8c06c11726") + ); + + executeTest("testUsingDbsnpName--" + testFile, spec); + } + + @Test + public void testRemoveMLE() { + String testFile = privateTestDir + "vcfexample.withMLE.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("a554459c9ccafb9812ff6d8c06c11726") + ); + + executeTest("testRemoveMLE--" + testFile, spec); + } + + @Test + public void testKeepOriginalAC() { + String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --keepOriginalAC -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("ad7e8b25e431a3229a78cec063876559") + ); + + executeTest("testKeepOriginalAC--" + testFile, spec); + } + + @Test + public void testKeepOriginalACAndENV() { + String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants 
--keepOriginalAC -env -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("e9b8292212545684cdb163423329ee7e") + ); + + executeTest("testKeepOriginalACAndENV--" + testFile, spec); + } + + @Test + public void testMultipleRecordsAtOnePosition() { + String testFile = privateTestDir + "selectVariants.onePosition.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -select 'KG_FREQ < 0.5' --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("44f7c47395ca5b2afef5313f592c8cea") + ); + + executeTest("testMultipleRecordsAtOnePosition--" + testFile, spec); + } + + @Test + public void testNoGTs() { + String testFile = privateTestDir + "vcf4.1.example.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b37KGReference + " --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("ef3c5f75074a5dd2b2cd2715856a2542") + ); + + executeTest("testNoGTs--" + testFile, spec); + } + + @Test + public void testSelectFromMultiAllelic() { + String testfile = privateTestDir + "multi-allelic.bi-allelicInGIH.vcf"; + String samplesFile = privateTestDir + "GIH.samples.list"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile, + 1, + Arrays.asList("69862fb97e8e895fe65c7abb14b03cee") + ); + executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec); + } + + @Test + public void testMultiAllelicAnnotationOrdering() { + String testfile = privateTestDir + "multi-allelic-ordering.vcf"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header " + + "-sn SAMPLE-CC -sn SAMPLE-CT -sn SAMPLE-CA --excludeNonVariants --variant " + testfile, + 1, + Arrays.asList("8fe7cdca8638461909262cb0769b2527") + ); 
+ executeTest("test multi allelic annotation ordering --" + testfile, spec); + } + + @Test() + public void testFileWithoutInfoLineInHeader() { + testFileWithoutInfoLineInHeader("testFileWithoutInfoLineInHeader", IllegalStateException.class); + } + + @Test() + public void testFileWithoutInfoLineInHeaderWithOverride() { + testFileWithoutInfoLineInHeader("testFileWithoutInfoLineInHeaderWithOverride", null); + } + + private void testFileWithoutInfoLineInHeader(final String name, final Class expectedException) { + final String testFile = privateTestDir + "missingHeaderLine.vcf"; + final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + + testFile + " -o %s --no_cmdline_in_header" + + (expectedException == null ? " -U LENIENT_VCF_PROCESSING" : ""); + WalkerTestSpec spec = + expectedException != null + ? new WalkerTestSpec(cmd, 1, expectedException) + : new WalkerTestSpec(cmd, 1, Arrays.asList("")); + spec.disableShadowBCF(); + + executeTest(name, spec); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java new file mode 100644 index 000000000..68eb1cc41 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java @@ -0,0 +1,110 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class SelectVariantsParallelIntegrationTest extends WalkerTest { + + private class ParallelSelectTestProvider extends TestDataProvider { + final String reference; + final String args; + final String md5; + final int nt; + + private ParallelSelectTestProvider(final String reference, final String args, final String md5, final int nt) { + super(ParallelSelectTestProvider.class); + this.reference = reference; + this.args = args; + this.md5 = md5; + this.nt = nt; + } + + public final String getCmdLine() { + return "-T SelectVariants -R " + reference + " -o %s -L 1 --no_cmdline_in_header -nt " + nt + " " + args; + } + + public String toString() { + return String.format("ParallelSelectVariants nt=%d args=%s", nt, args); + } + } + + @DataProvider(name = "ParallelSelectTest") + public Object[][] makeParallelSelectTestProvider() { + for ( int nt : Arrays.asList(1, 2, 4) ) { + { // original MAF test + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = 
validationDataLocation + "SelectVariants.samples.txt"; + String args = " -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile; + new ParallelSelectTestProvider(b36KGReference, args, "4386fbb258dcef4437495a37f5a83c53", nt); + } + { // new tests on b37 using testdir VCF + final String testfile = privateTestDir + "NA12878.hg19.example1.vcf"; + final String args = "-select 'DP > 30' -V " + testfile; + new ParallelSelectTestProvider(b37KGReference, args, "c64b45a14d41b1e5cddbe036b47e7519", nt); + } + { // AD and PL decoding race condition + final String testfile = privateTestDir + "race_condition.vcf"; + final String args = "-env -sn SAMPLE -L 1:1-10,000,000 -V " + testfile; + new ParallelSelectTestProvider(b37KGReference, args, "62e6156387d6e91bd2b08ef649cb1129", nt); + } + } + + return ParallelSelectTestProvider.getTests(ParallelSelectTestProvider.class); + } + + @Test(dataProvider = "ParallelSelectTest") + public void testParallelSelectTestProvider(final ParallelSelectTestProvider cfg) { + final WalkerTestSpec spec = new WalkerTestSpec( cfg.getCmdLine(), 1, Arrays.asList(cfg.md5) ); + executeTest(cfg.toString(), spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java similarity index 100% rename from 
protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitivesIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitivesIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitivesIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitivesIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/ContigComparatorUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/ContigComparatorUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/ContigComparatorUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/ContigComparatorUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/RandomDNA.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/RandomDNA.java new file mode 100644 index 000000000..426462ed2 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/RandomDNA.java @@ -0,0 +1,125 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.utils; + +import java.util.Random; + +/** + * Random DNA sequence generator. + * + *

+ * Returned bases are always in upper case and one of the valid four nocleotides 'A', 'C', 'G' and 'T'. + *

+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class RandomDNA { + + private Random random; + + /** + * Constructs a new random DNA generator. + * + *

+ * The seed would be the default which would depend on system properties and the current time as + * described in {@link Random} documentation. + *

+ */ + @SuppressWarnings("unused") + public RandomDNA() { + random = new Random(); + } + + /** + * Constructs a new random DNA generator providing a seed. + * + * @param seed the random number generator seed. + */ + public RandomDNA(final long seed) { + random = new Random(seed); + } + + /** + * Updates the content of a byte array with a random base sequence. + * + *

+ * The whole array will be filled with new base values. + *

+ * + * @param destination the array to update. + * + * @throws NullPointerException if {@code destination} is {@code null}. + */ + public void nextBases(final byte[] destination) { + random.nextBytes(destination); + for (int i = 0; i < destination.length; i++) { + final int ord = destination[i] & 0x03; + switch (ord) { + case 0: destination[i] = 'A'; break; + case 1: destination[i] = 'C'; break; + case 2: destination[i] = 'G'; break; + case 3: destination[i] = 'T'; break; + default: throw new IllegalStateException("this cannot be happening!!!"); + } + } + } + + /** + * Returns a random RNA sequence of bases. + * @param size the length of the sequence. + * + * @throws IllegalArgumentException if {@code size} is negative. + * @return never {@code null}. + */ + public byte[] nextBases(final int size) { + if (size < 0) throw new IllegalArgumentException("the size cannot be negative"); + final byte[] result = new byte[size]; + nextBases(result); + return result; + } + + +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/collections/CountSetUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/collections/CountSetUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/collections/CountSetUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/collections/CountSetUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/DiploidGenotypeUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/genotyper/DiploidGenotypeUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/genotyper/DiploidGenotypeUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/genotyper/DiploidGenotypeUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java new file mode 100644 index 000000000..cda022ab8 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java @@ -0,0 +1,392 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.gvcf; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.ReferenceConfidenceModel; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.broadinstitute.variant.vcf.VCFHeader; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class GVCFWriterUnitTest extends BaseTest { + private static class MockWriter implements VariantContextWriter { + final List emitted = new ArrayList<>(); + boolean headerWritten = false; + boolean closed = false; + + @Override + public void writeHeader(VCFHeader header) { + headerWritten = true; + } + + @Override + public void close() { + closed = true; + } + + @Override + public void add(VariantContext vc) { + emitted.add(vc); + } + } + + private MockWriter mockWriter; + private List 
standardPartition = Arrays.asList(1, 10, 20); + private Allele REF = Allele.create("N", true); + private Allele ALT = Allele.create("A"); + private List ALLELES = Arrays.asList(REF, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + private final String SAMPLE_NAME = "XXYYZZ"; + + @BeforeMethod + public void setUp() throws Exception { + mockWriter = new MockWriter(); + } + + @Test + public void testHeaderWriting() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + writer.writeHeader(new VCFHeader()); + Assert.assertTrue(mockWriter.headerWritten); + } + + @Test + public void testClose() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + writer.close(); + Assert.assertTrue(mockWriter.closed); + } + + @Test + public void testCloseWithoutClosingUnderlyingWriter() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + writer.close(false); + Assert.assertFalse(mockWriter.closed); + } + + private VariantContext makeHomRef(final String contig, final int start, final int GQ) { + final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, ALLELES); + final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, REF)); + gb.GQ(GQ); + gb.DP(10); + gb.AD(new int[]{1, 2}); + gb.PL(new int[]{0, 10, 100}); + return vcb.genotypes(gb.make()).make(); + } + + private VariantContext makeHomRefAlt(final String contig, final int start, final int GQ) { + final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, Arrays.asList(REF, ALT)); + final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, REF)); + gb.GQ(GQ); + gb.DP(10); + gb.AD(new int[]{1, 2}); + gb.PL(new int[]{0, 10, 100}); + return vcb.genotypes(gb.make()).make(); + } + + private VariantContext makeNonRef(final String contig, final int start, final int GQ) { + final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, 
Arrays.asList(REF, ALT)); + final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, ALT)); + gb.GQ(GQ); + gb.DP(10); + gb.AD(new int[]{1, 2}); + gb.PL(new int[]{0, 10, 100}); + return vcb.genotypes(gb.make()).make(); + } + + private VariantContext makeDeletion(final String contig, final int start, final int size) { + final String del = Utils.dupString("A", size); + final String alt = del.substring(0, 1); + final VariantContext vc = GATKVariantContextUtils.makeFromAlleles("test", contig, start, Arrays.asList(del, alt)); + final VariantContextBuilder vcb = new VariantContextBuilder(vc); + final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(vc.getReference(), vc.getAlternateAllele(0))); + gb.GQ(50); + gb.DP(10); + gb.AD(new int[]{1, 2}); + gb.PL(new int[]{0, 10, 100}); + return vcb.genotypes(gb.make()).make(); + } + + @Test + public void testCloseEmitsLastVariant() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + + writer.close(); + Assert.assertTrue(mockWriter.closed); + Assert.assertEquals(mockWriter.emitted.size(), 1); + } + + @Test + public void testCloseDoesntEmitsLastVariantWhenNonRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeNonRef("20", 1, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 1); + + writer.close(); + Assert.assertTrue(mockWriter.closed); + Assert.assertEquals(mockWriter.emitted.size(), 1); + } + + @Test + public void testCrossingContigBoundaryRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 30)); + writer.add(makeHomRef("20", 2, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + writer.add(makeHomRef("21", 3, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 1); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + + 
writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(1), "21", 3, 3, false); + } + + @Test + public void testCrossingContigBoundaryToLowerPositionsRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 30, 30)); + writer.add(makeHomRef("20", 31, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + writer.add(makeHomRef("21", 10, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 1); + assertGoodVC(mockWriter.emitted.get(0), "20", 30, 31, false); + writer.add(makeNonRef("21", 11, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 3); + assertGoodVC(mockWriter.emitted.get(1), "21", 10, 10, false); + assertGoodVC(mockWriter.emitted.get(2), "21", 11, 11, true); + } + + @Test + public void testCrossingContigBoundaryFromNonRefToLowerPositionsRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeNonRef("20", 20, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 1); + writer.add(makeHomRef("21", 10, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 1); + assertGoodVC(mockWriter.emitted.get(0), "20", 20, 20, true); + writer.add(makeNonRef("21", 11, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 3); + assertGoodVC(mockWriter.emitted.get(1), "21", 10, 10, false); + assertGoodVC(mockWriter.emitted.get(2), "21", 11, 11, true); + } + + @Test + public void testCrossingContigBoundaryNonRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 30)); + writer.add(makeHomRef("20", 2, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + writer.add(makeNonRef("21", 3, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + assertGoodVC(mockWriter.emitted.get(1), "21", 3, 3, true); + } + + @Test + public void testCrossingContigBoundaryNonRefThenNonRef() { + 
final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeNonRef("20", 1, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 1); + writer.add(makeNonRef("21", 1, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 1, true); + assertGoodVC(mockWriter.emitted.get(1), "21", 1, 1, true); + } + + private void assertGoodVC(final VariantContext vc, final String contig, final int start, final int stop, final boolean nonRef) { + Assert.assertEquals(vc.getChr(), contig); + Assert.assertEquals(vc.getStart(), start); + Assert.assertEquals(vc.getEnd(), stop); + if ( nonRef ) { + Assert.assertNotEquals(vc.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + } else { + Assert.assertEquals(vc.getNAlleles(), 2); + Assert.assertEquals(vc.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + Assert.assertEquals(vc.getAttributeAsInt(VCFConstants.END_KEY, -1), stop); + Assert.assertTrue(vc.hasGenotypes()); + Assert.assertTrue(vc.hasGenotype(SAMPLE_NAME)); + Assert.assertEquals(vc.getGenotypes().size(), 1); + final Genotype g = vc.getGenotype(SAMPLE_NAME); + Assert.assertEquals(g.hasAD(), false); + Assert.assertEquals(g.hasLikelihoods(), true); + Assert.assertEquals(g.hasPL(), true); + Assert.assertEquals(g.getPL().length == 3, true); + Assert.assertEquals(g.hasDP(), true); + Assert.assertEquals(g.hasGQ(), true); + } + } + + @Test + public void testVariantForcesNonRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 30)); + writer.add(makeHomRef("20", 2, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + writer.add(makeNonRef("20", 3, 30)); + writer.add(makeHomRef("20", 4, 30)); + writer.add(makeHomRef("20", 5, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + assertGoodVC(mockWriter.emitted.get(1), "20", 
3, 3, true); + writer.close(); + assertGoodVC(mockWriter.emitted.get(2), "20", 4, 5, false); + } + + @Test + public void testEmittingTwoBands() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 0)); + writer.add(makeHomRef("20", 2, 0)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + writer.add(makeHomRef("20", 3, 50)); + writer.add(makeHomRef("20", 4, 50)); + writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + assertGoodVC(mockWriter.emitted.get(1), "20", 3, 4, false); + } + + @Test + public void testNonContiguousBlocks() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 0)); + writer.add(makeHomRef("20", 2, 0)); + writer.add(makeHomRef("20", 10, 0)); + writer.add(makeHomRef("20", 11, 0)); + writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + assertGoodVC(mockWriter.emitted.get(1), "20", 10, 11, false); + } + + @Test + public void testDeletion() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 0)); + writer.add(makeHomRef("20", 2, 0)); + writer.add(makeDeletion("20", 3, 3)); + writer.add(makeHomRef("20", 4, 0)); + writer.add(makeHomRef("20", 5, 0)); + writer.add(makeHomRef("20", 6, 0)); + writer.add(makeHomRef("20", 7, 0)); + writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 3); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + assertGoodVC(mockWriter.emitted.get(1), "20", 3, 5, true); + assertGoodVC(mockWriter.emitted.get(2), "20", 6, 7, false); + } + + @Test + public void testHomRefAlt() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 0)); + writer.add(makeHomRef("20", 2, 0)); + writer.add(makeHomRefAlt("20", 3, 0)); + 
writer.add(makeHomRef("20", 4, 0)); + writer.add(makeHomRef("20", 5, 0)); + writer.add(makeHomRef("20", 6, 0)); + writer.add(makeHomRef("20", 7, 0)); + writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 3); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + Assert.assertFalse(mockWriter.emitted.get(1).hasAttribute("END")); + Assert.assertFalse(mockWriter.emitted.get(1).hasAttribute("BLOCK_SIZE")); + assertGoodVC(mockWriter.emitted.get(2), "20", 4, 7, false); + } + + @DataProvider(name = "BandPartitionData") + public Object[][] makeBandPartitionData() { + List tests = new ArrayList<>(); + + tests.add(new Object[]{null, false}); + tests.add(new Object[]{Collections.emptyList(), false}); + tests.add(new Object[]{Arrays.asList(1), true}); + tests.add(new Object[]{Arrays.asList(1, 10), true}); + tests.add(new Object[]{Arrays.asList(1, 10, 30), true}); + tests.add(new Object[]{Arrays.asList(10, 1, 30), false}); + tests.add(new Object[]{Arrays.asList(-1, 1), false}); + tests.add(new Object[]{Arrays.asList(1, null, 10), false}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "BandPartitionData") + public void testMyData(final List partitions, final boolean expectedGood) { + try { + GVCFWriter.parsePartitions(partitions); + Assert.assertTrue(expectedGood, "Expected to fail but didn't"); + } catch ( Exception e ) { + Assert.assertTrue(! 
expectedGood, "Expected to succeed but failed with message " + e.getMessage()); + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparatorUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparatorUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparatorUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparatorUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculatorUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculatorUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculatorUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculatorUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparatorUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparatorUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparatorUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparatorUnitTest.java diff --git 
a/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/LDMergerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/LDMergerUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/haplotype/LDMergerUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/LDMergerUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java new file mode 100644 index 000000000..489eff0bc --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java @@ -0,0 +1,99 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - 
SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +// ********************************************************************************** // +// Note that this class also serves as an integration test for the VariantAnnotator! 
// +// ********************************************************************************** // + +public class NanoSchedulerIntegrationTest extends WalkerTest { + @DataProvider(name = "NanoSchedulerUGTest") + public Object[][] createNanoSchedulerUGTest() { + List tests = new ArrayList(); + + for ( final int nt : Arrays.asList(1, 2) ) + for ( final int nct : Arrays.asList(1, 2) ) { +// tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); +//// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); + tests.add(new Object[]{ "BOTH", "392dc99dc279082fc6e729b249adfa2b", nt, nct }); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "NanoSchedulerUGTest") + private void testNanoSchedulerUGTest(final String glm, final String md5, final int nt, final int nct ) { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T UnifiedGenotyper -R " + b37KGReference, + "--no_cmdline_in_header -G", + //"--dbsnp " + b37dbSNP132, + "-I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", + "-L 20:10,000,000-10,100,000", + "-glm " + glm, + "--contamination_fraction_to_filter 0.0", + "-nt " + nt, + "-nct " + nct, + "-o %s" + ), + 1, + Arrays.asList(md5) + ); + executeTest(String.format("testUG-glm:%s-nt%d-nct%d", glm, nt, nct), spec); + } + + + +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java new file mode 100644 index 000000000..1e5417227 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java @@ -0,0 +1,588 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the 
Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import org.apache.commons.math.distribution.ExponentialDistribution; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.AssemblyResult; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.AssemblyResultSet; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Civar; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** +* Mock-up active region data used in testing. 
+* +* @author Valentin Ruano-Rubio <valentin@broadinstitute.org> +*/ +public class ActiveRegionTestDataSet { + + private final byte[] referenceBytes; + protected String reference; + protected String[] haplotypeCigars; + protected List haplotypeStrings; + protected String[] readCigars; + protected byte[] bq; + protected byte[] dq; + protected byte[] iq; + protected int kmerSize; + private List haplotypeList; + private List readList; + private AssemblyResultSet assemblyResultSet; + private Map readBySequence; + private String stringRepresentation; + private List> readEventOffsetList; + private GenomeLocParser genomeLocParser; + + /** Create a new active region data test set */ + public ActiveRegionTestDataSet(final int kmerSize, final String reference, final String[] haplotypes, + final String[] readCigars, final byte[] bq, final byte[] dq, final byte[] iq) { + this.reference = reference; + this.referenceBytes = reference.getBytes(); + this.haplotypeCigars = haplotypes; + this.readCigars = readCigars; + this.bq = bq; + this.dq = dq; + this.iq = iq; + this.kmerSize = kmerSize; + this.genomeLocParser = new GenomeLocParser(ArtificialSAMUtils.createArtificialSamHeader(1,1,reference.length()).getSequenceDictionary()); + } + + public String getReference() { + return reference; + } + + public String toString() { + if (stringRepresentation == null) + return super.toString(); + else return stringRepresentation; + } + + public AssemblyResultSet assemblyResultSet() { + if (assemblyResultSet == null) { + final ReadThreadingGraph rtg = new ReadThreadingGraph(kmerSize); + rtg.addSequence("anonymous", this.getReference().getBytes(), true); + for (final String haplotype : this.haplotypesStrings()) { + rtg.addSequence("anonymous", haplotype.getBytes(), false); + } + rtg.buildGraphIfNecessary(); + if (rtg.hasCycles()) + throw new RuntimeException("there is cycles in the reference with kmer size " + kmerSize + ". 
Don't use this size for the benchmark or change the reference"); + + List haplotypeList = this.haplotypeList(); + + assemblyResultSet = new AssemblyResultSet(); + final AssemblyResult ar = new AssemblyResult((haplotypeList.size() > 1 ? + AssemblyResult.Status.ASSEMBLED_SOME_VARIATION : AssemblyResult.Status.JUST_ASSEMBLED_REFERENCE),rtg.convertToSequenceGraph()); + ar.setThreadingGraph(rtg); + + for (final Haplotype h : haplotypeList) + assemblyResultSet.add(h, ar); + } + return assemblyResultSet; + } + + public List haplotypesStrings() { + if (haplotypeStrings != null) { + return haplotypeStrings; + } + final List result = new ArrayList<>(haplotypeCigars.length); + String reference = this.reference; + for (final String cigar : haplotypeCigars) { + if (cigar.matches("^Civar:.*$")) { + stringRepresentation = cigar.substring(6); + result.addAll(expandAllCombinations(cigar.substring(6),reference)); + } else if (cigar.matches("^.*\\d+.*$")) { + result.add(applyCigar(reference, cigar,0,true)); + } else { + result.add(cigar); + } + } + haplotypeStrings = result; + return result; + } + + private List expandAllCombinations(final String cigarString, final String reference) { + final Civar civar = Civar.fromCharSequence(cigarString); + final List unrolledCivars = civar.optionalizeAll().unroll(); + List result = new ArrayList<>(unrolledCivars.size()); + for (final Civar c : unrolledCivars) { + result.add(c.applyTo(reference)); + } + return result; + } + + private List expandAllHaplotypeCombinations(final String civarString, final String reference) { + final Civar civar = Civar.fromCharSequence(civarString); + final List unrolledCivars = civar.optionalizeAll().unroll(); + List result = new ArrayList<>(unrolledCivars.size()); + for (final Civar c : unrolledCivars) { + final String baseString = c.applyTo(reference); + final Haplotype haplotype = new Haplotype(baseString.getBytes(),baseString.equals(reference)); + 
haplotype.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); + try { + haplotype.setCigar(c.toCigar(reference.length())); + } catch (final RuntimeException ex) { + c.applyTo(reference); + c.toCigar(reference.length()); + throw new RuntimeException("" + c + " " + ex.getMessage(),ex); + } + result.add(haplotype); + } + return result; + } + + + public List haplotypeList() { + if (haplotypeList == null) { + + final List result = new ArrayList<>(haplotypeCigars.length); + final String reference = this.reference; + for (final String cigar : haplotypeCigars) { + if (cigar.matches("^Civar:.*$")) { + stringRepresentation = cigar.substring(6); + result.addAll(expandAllHaplotypeCombinations(cigar.substring(6), reference)); + } else if (cigar.matches("^.*\\d+.*$")) { + result.add(cigarToHaplotype(reference, cigar, 0, true)); + } else { + final Haplotype h = new Haplotype(cigar.getBytes()); + h.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); + result.add(h); + } + } + haplotypeList = result; + } + return haplotypeList; + } + + + protected SAMSequenceDictionary artificialSAMSequenceDictionary() { + return new SAMSequenceDictionary(Collections.singletonList(new SAMSequenceRecord("00",reference.length()))); + } + + protected SAMFileHeader artificialSAMFileHeader() { + return ArtificialSAMUtils.createArtificialSamHeader(artificialSAMSequenceDictionary()); + } + + public List readList() { + if (readList == null) { + final SAMFileHeader header = artificialSAMFileHeader(); + readList = new ArrayList<>(readCigars.length); + final List haplotypes = haplotypesStrings(); + int count = 0; + for (final String descr : readCigars) { + String sequence; + if (descr.matches("^\\d+:\\d+:.+$")) { + final String[] parts = descr.split(":"); + int allele = Integer.valueOf(parts[0]); + int offset = Integer.valueOf(parts[1]); + final String cigar = parts[2]; + final String base = allele == 0 ? 
reference : haplotypes.get(allele - 1); + sequence = applyCigar(base, cigar, offset, false); + final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header, "read_" + count, 0, 1, sequence.getBytes(), Arrays.copyOf(bq, sequence.length())); + readList.add(new MyGATKSAMRecord(samRecord)); + } else if (descr.matches("^\\*:\\d+:\\d+$")) { + int readCount = Integer.valueOf(descr.split(":")[1]); + int readLength = Integer.valueOf(descr.split(":")[2]); + readList.addAll(generateSamRecords(haplotypes, readCount, readLength, header, count)); + } else { + sequence = descr; + final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header, "read_" + count, 0, 1, sequence.getBytes(), Arrays.copyOf(bq, sequence.length())); + readList.add(new MyGATKSAMRecord(samRecord)); + } + count = readList.size(); + } + } + return readList; + } + + public List> readEventOffsetList() { + if (haplotypeCigars.length != 1 || !haplotypeCigars[0].startsWith("Civar:")) + throw new UnsupportedOperationException(); + if (readEventOffsetList == null) { + final Civar civar = Civar.fromCharSequence(haplotypeCigars[0].substring(6)); + final List unrolledCivars = civar.optionalizeAll().unroll(); + + readEventOffsetList = new ArrayList<>(readCigars.length); + int count = 0; + for (final String descr : readCigars) { + if (descr.matches("^\\d+:\\d+:.+$")) { + throw new UnsupportedOperationException(); + } else if (descr.matches("^\\*:\\d+:\\d+$")) { + int readCount = Integer.valueOf(descr.split(":")[1]); + int readLength = Integer.valueOf(descr.split(":")[2]); + readEventOffsetList.addAll(generateElementOffsetRecords(haplotypesStrings(), unrolledCivars, readCount, readLength, count)); + } else { + throw new UnsupportedOperationException(); + } + count = readEventOffsetList.size(); + } + readEventOffsetList = Collections.unmodifiableList(readEventOffsetList); + } + return readEventOffsetList; + } + + + + + @SuppressWarnings("unused") + public String cigarToSequence(final 
String cigar) { + String reference = this.reference; + return applyCigar(reference, cigar,0,true); + } + + @SuppressWarnings("unused") + public GATKSAMRecord readFromString(final String readSequence) { + if (readBySequence == null) { + final List readList = readList(); + readBySequence = new HashMap<>(readList.size()); + for (final GATKSAMRecord r : readList) + readBySequence.put(r.getReadString(),r); + } + return readBySequence.get(readSequence); + } + + public List unrolledCivars() { + if (haplotypeCigars.length != 1 || !haplotypeCigars[0].startsWith("Civar:")) + throw new UnsupportedOperationException(); + final Civar civar = Civar.fromCharSequence(haplotypeCigars[0].substring(6)); + return civar.optionalizeAll().unroll(); + } + + public void introduceErrors(final Random rnd) { + final List reads = readList(); + final ArrayList result = new ArrayList<>(reads.size()); + for (final GATKSAMRecord read : reads) { + result.add(new MyGATKSAMRecord(read,rnd)); + } + readList = result; + } + + private class MyGATKSAMRecord extends GATKSAMRecord { + protected MyGATKSAMRecord(final GATKSAMRecord r) { + super(r.getHeader(), r.getReferenceIndex(), r.getAlignmentStart(), (short) r.getReadNameLength(), + (short) 100, -1, r.getCigarLength(), r.getFlags(), r.getReadLength(), + r.getMateReferenceIndex(), r.getMateAlignmentStart(), r.getInferredInsertSize(), + new byte[0]); + this.setReadBases(r.getReadBases()); + this.setBaseQualities(r.getBaseQualities()); + this.setReadName(r.getReadName()); + } + + ExponentialDistribution indelLengthDist = MathUtils.exponentialDistribution(1.0 / 0.9); + + public MyGATKSAMRecord(final GATKSAMRecord r, final Random rnd) { + super(r.getHeader(), r.getReferenceIndex(), r.getAlignmentStart(), (short) r.getReadNameLength(), + (short) 100, -1, r.getCigarLength(), r.getFlags(), r.getReadLength(), + r.getMateReferenceIndex(), r.getMateAlignmentStart(), r.getInferredInsertSize(), + new byte[0]); + final byte[] bases = new byte[r.getReadBases().length]; 
+ + final byte[] readBases = r.getReadBases(); + final byte[] bq = r.getBaseQualities(); + final byte[] iq = r.getBaseInsertionQualities(); + final byte[] dq = r.getBaseDeletionQualities(); + int refOffset = r.getAlignmentStart() - 1; + int readOffset = 0; + for (int i = 0; i < r.getReadBases().length;) { + double p = rnd.nextDouble(); + double iqp = QualityUtils.qualToErrorProb(iq[i]); + if (p < iqp) { // insertion + final int length = Math.min(generateIndelLength(rnd),r.getReadBases().length - i); + final int refStart = rnd.nextInt(reference.length() - length); + System.arraycopy(referenceBytes,refStart,bases,i,length); + i += length; + continue; + } + p -= iqp; + double dqp = QualityUtils.qualToErrorProb(dq[i]); + if (p < dqp) { + final int length = generateIndelLength(rnd); + refOffset += length; + refOffset = refOffset % referenceBytes.length; + readOffset += length; + continue; + } + p -= dqp; + double bqp = QualityUtils.qualToErrorProb(bq[i]); + byte b = readOffset < readBases.length ? 
readBases[readOffset] : referenceBytes[refOffset]; + byte nb; + if (p < bqp) { + switch (b) { + case 'A': nb = 'C'; break; + case 'T': nb = 'A'; break; + case 'C': nb = 'G'; break; + case 'G': nb = 'B'; break; + default: nb = 'A'; + } + } else + nb = b; + + bases[i++] = nb; + refOffset++; + refOffset = refOffset % referenceBytes.length; + readOffset++; + } + this.setReadBases(bases); + this.setBaseQualities(r.getBaseQualities()); + this.setReadName(r.getReadName()); + + + } + + private int generateIndelLength(final Random rnd) { + final int length; + try { + length = (int) Math.round(indelLengthDist.inverseCumulativeProbability(rnd.nextDouble()) + 1); + } catch (Exception e) { + throw new RuntimeException(e); + } + return length; + } + + @Override + public byte[] getBaseDeletionQualities() { + return Arrays.copyOf(dq,getReadLength()); + } + + @Override + public byte[] getBaseInsertionQualities() { + return Arrays.copyOf(iq,getReadLength()); + } + + @Override + public int getMappingQuality() { + return 100; + } + + @Override + public int hashCode() { + return getReadName().hashCode(); + } + + @Override + public boolean equals(Object o) { + if (o instanceof GATKSAMRecord) { + return getReadName().equals(((GATKSAMRecord)o).getReadName()); + } else { + return false; + } + } + + public String toString() { + return super.toString() + " " + this.getReadString(); + } + } + + + public List readStrings() { + final List result = new ArrayList<>(readCigars.length); + final List haplotypes = haplotypesStrings(); + for (final String descr : readCigars) { + String sequence; + if (descr.matches("^\\d+:\\d+:.+$")) { + final String[] parts = descr.split(":"); + int allele = Integer.valueOf(parts[0]); + int offset = Integer.valueOf(parts[1]); + final String cigar = parts[2]; + final String base = allele == 0 ? 
reference : haplotypes.get(allele - 1); + sequence = applyCigar(base, cigar, offset, false); + result.add(sequence); + } else if (descr.matches("\\*:^\\d+:\\d+")) { + int readCount = Integer.valueOf(descr.split(":")[1]); + int readLength = Integer.valueOf(descr.split(":")[2]); + result.addAll(generateReads(haplotypes, readCount, readLength)); + } else { + sequence = descr; + result.add(sequence); + } + } + return result; + } + + private List generateReads(final List haplotypes, final int readCount, final int readLength) { + final List result = new ArrayList<>(readCount); + for (int i = 0; i < readCount; i++) { + int hi = i % haplotypes.size(); + final String h = haplotypes.get(hi); + int offset = i % h.length() - readLength; + result.add(h.substring(offset,offset + readLength)); + } + return result; + } + + private List generateSamRecords(final List haplotypes, final int readCount, final int readLength, final SAMFileHeader header, final int idStart) { + int id = idStart; + final List result = new ArrayList<>(readCount); + for (int i = 0; i < readCount; i++) { + int hi = i % haplotypes.size(); + final String h = haplotypes.get(hi); + int offset = h.length() <= readLength ? 0 : i % (h.length() - readLength); + int to = Math.min(h.length(),offset + readLength); + byte[] bases = h.substring(offset,to).getBytes(); + byte[] quals = Arrays.copyOf(bq,to - offset); + final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header,"read_" + id++,0,offset + 1,bases, quals); + result.add(new MyGATKSAMRecord(samRecord)); + } + return result; + } + + + private List> generateElementOffsetRecords(final List haplotypes, final List unrolledCivars, final int readCount, final int readLength, final int count) { + + final List> result = new ArrayList<>(readCount); + for (int i = 0; i < readCount; i++) { + int hi = i % unrolledCivars.size(); + final Civar c = unrolledCivars.get(hi); + final String h = haplotypes.get(hi); + int offset = h.length() <= readLength ? 
0 : i % (h.length() - readLength); + int to = Math.min(h.length(),offset + readLength); + result.add(c.eventOffsets(reference,offset,to)); + } + return result; + } + + private static final Pattern cigarPattern = Pattern.compile("(\\d+)([=A-Z])"); + + + private Haplotype cigarToHaplotype(final String reference, final String cigar, final int offset, final boolean global) { + final String sequence = applyCigar(reference,cigar,offset,global); + final Haplotype haplotype = new Haplotype(sequence.getBytes(),reference.equals(sequence)); + haplotype.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); + haplotype.setCigar(Civar.fromCharSequence(cigar).toCigar(reference.length())); + return haplotype; + } + + private String applyCigar(final String reference, final String cigar, final int offset, final boolean global) { + final Matcher pm = cigarPattern.matcher(cigar); + StringBuffer sb = new StringBuffer(); + int index = offset; + while (pm.find()) { + int length = Integer.valueOf(pm.group(1)); + char operator = pm.group(2).charAt(0); + switch (operator) { + case '=' : + try { + sb.append(reference.substring(index, index + length)); + } catch (Exception e) { + throw new RuntimeException(" " + index + " " + (index + length) + " " + reference.length() + " " + cigar,e); + } + index += length; break; + case 'D' : + index += length; break; + case 'I' : + String insert = cigar.substring(pm.end(),pm.end() + length).toUpperCase(); + sb.append(insert); break; + case 'V' : + sb.append(transversionV(reference.charAt(index))); index++; break; + case 'W' : + sb.append(transversionW(reference.charAt(index))); index++; break; + case 'T' : + sb.append(transition(reference.charAt(index))); index++; break; + default: + throw new UnsupportedOperationException("cigar operator " + operator + " not supported."); + } + } + if (global && index != reference.length()) { + throw new RuntimeException(" haplotype cigar does not explain reference length (" + index + " != " + 
reference.length() + ") on cigar " + cigar); + } else if (index > reference.length()) { + throw new RuntimeException(" index beyond end "); + } + return sb.toString(); + } + + protected int kmerSize() { + return kmerSize; + } + + private char transversionV(final char c) { + switch (Character.toUpperCase(c)) { + case 'A': return 'C'; + case 'G': return 'T'; + case 'C': return 'A'; + case 'T': return 'G'; + default: + return c; + } + + } + + private char transversionW(final char c) { + switch (Character.toUpperCase(c)) { + case 'A': return 'T'; + case 'G': return 'C'; + case 'T': return 'A'; + case 'C': return 'G'; + default: + return c; + } + + } + + private char transition(final char c) { + switch (Character.toUpperCase(c)) { + case 'A': return 'G'; + case 'G': return 'A'; + case 'T': return 'C'; + case 'C': return 'T'; + default: + return c; + } + + } +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/CnyPairHMMUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/CnyPairHMMUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/pairhmm/CnyPairHMMUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/CnyPairHMMUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMMUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMMUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMMUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMMUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMEmpiricalBenchmark.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMEmpiricalBenchmark.java similarity index 100% 
rename from protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMEmpiricalBenchmark.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMEmpiricalBenchmark.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMModelUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMModelUnitTest.java new file mode 100644 index 000000000..8c54326db --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMModelUnitTest.java @@ -0,0 +1,337 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.QualityUtils; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Iterator; + + +/** + * Unit tests for {@link PairHMMModel} + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class PairHMMModelUnitTest extends BaseTest { + + final double TOLERANCE = 1E-9; + + @Test(dataProvider="qualToProbsDataProvider") + public void testQualToProbs(final int insQual, final int delQual, final int gcp, final double[] expected) { + final double[] actual = PairHMMModel.qualToTransProbs((byte)insQual,(byte)delQual,(byte)gcp); + Assert.assertNotNull(actual); + Assert.assertEquals(actual.length, PairHMMModel.TRANS_PROB_ARRAY_LENGTH); + assertEqualsDoubleArray(actual,expected,TOLERANCE); + Assert.assertEquals(actual.length, PairHMMModel.TRANS_PROB_ARRAY_LENGTH); + } + + @Test(dataProvider="qualToProbsDataProvider") + public void testQualToProbsLog10(final int insQuals, final int delQual, final int gcp, final double[] expected) { + final double[] logExpected = new double[expected.length]; + for (int i = 0; i < logExpected.length; i++) + logExpected[i] = Math.log10(expected[i]); + final double[] actual = PairHMMModel.qualToTransProbsLog10((byte)insQuals,(byte)delQual,(byte)gcp); + Assert.assertNotNull(actual); + Assert.assertEquals(actual.length, PairHMMModel.TRANS_PROB_ARRAY_LENGTH); + assertEqualsDoubleArray(actual,logExpected,TOLERANCE); + } + + @Test(dataProvider="qualToProbsDataProvider") + public void testQualToProbsFill(final int insQual, final int delQual, final int gcp, final double[] expected) { + final double[] actual = new double[PairHMMModel.TRANS_PROB_ARRAY_LENGTH]; + PairHMMModel.qualToTransProbs(actual, (byte) insQual, (byte) delQual, (byte) gcp); + assertEqualsDoubleArray(actual,expected,TOLERANCE); + } + + 
@Test(dataProvider="qualToTransDataProvider") + public void testQualsToTransProbs(final byte[] insQuals, final byte[] delQuals, final byte[] gapQuals, final double[][] expected) { + final double[][] actual = PairHMMModel.qualToTransProbs(insQuals,delQuals,gapQuals); + Assert.assertNotNull(actual); + Assert.assertEquals(actual.length,expected.length); + Assert.assertNotNull(actual[0]); + Assert.assertEquals(actual[0].length,expected[0].length); + for (int i = 0; i < actual.length ; i++) + assertEqualsDoubleArray(actual[i],expected[i],TOLERANCE); + } + + @Test(dataProvider="qualToTransDataProvider") + public void testQualsToTransProbsLog10(final byte[] insQuals, final byte[] delQuals, final byte[] gapQuals, final double[][] expected) { + final double[][] actual = PairHMMModel.qualToTransProbsLog10(insQuals,delQuals,gapQuals); + final double[][] logExpected = new double[expected.length][expected[0].length]; + for (int i = 1; i < expected.length; i++) + for (int j = 0; j < expected[0].length; j++) + logExpected[i][j] = Math.log10(expected[i][j]); + Assert.assertNotNull(actual); + Assert.assertEquals(actual.length,logExpected.length); + Assert.assertNotNull(actual[0]); + Assert.assertEquals(actual[0].length,logExpected[0].length); + for (int i = 0; i < actual.length ; i++) + assertEqualsDoubleArray(actual[i],logExpected[i],TOLERANCE); + } + + @Test(dataProvider="qualToTransDataProvider") + public void testQualsToTransProbsLog10Fill(final byte[] insQuals, final byte[] delQuals, final byte[] gapQuals, final double[][] expected) { + final double[][] actual = PairHMMModel.createTransitionMatrix(insQuals.length); + PairHMMModel.qualToTransProbsLog10(actual,insQuals,delQuals,gapQuals); + final double[][] logExpected = new double[expected.length][expected[0].length]; + for (int i = 1; i < expected.length; i++) + for (int j = 0; j < expected[0].length; j++) + logExpected[i][j] = Math.log10(expected[i][j]); + Assert.assertNotNull(actual); + 
Assert.assertEquals(actual.length,logExpected.length); + Assert.assertNotNull(actual[0]); + Assert.assertEquals(actual[0].length,logExpected[0].length); + for (int i = 0; i < actual.length ; i++) + assertEqualsDoubleArray(actual[i],logExpected[i],TOLERANCE); + } + + @Test(dataProvider="qualToTransDataProvider") + public void testQualsToTransProbsFill(final byte[] insQuals, final byte[] delQuals, final byte[] gapQuals, final double[][] expected) { + final double[][] actual = PairHMMModel.createTransitionMatrix(insQuals.length); + PairHMMModel.qualToTransProbs(actual,insQuals,delQuals,gapQuals); + Assert.assertNotNull(actual); + Assert.assertEquals(actual.length,expected.length); + Assert.assertNotNull(actual[0]); + Assert.assertEquals(actual[0].length,expected[0].length); + for (int i = 0; i < actual.length ; i++) + assertEqualsDoubleArray(actual[i],expected[i],TOLERANCE); + } + @Test(dataProvider="qualToProbsDataProvider") + public void testQualToProbsLog10Fill(final int insQuals, final int delQual, final int gcp, final double[] expected) { + final double[] logExpected = new double[expected.length]; + for (int i = 0; i < logExpected.length; i++) + logExpected[i] = Math.log10(expected[i]); + final double[] actual = new double[PairHMMModel.TRANS_PROB_ARRAY_LENGTH]; + PairHMMModel.qualToTransProbsLog10(actual, (byte) insQuals, (byte) delQual, (byte) gcp); + assertEqualsDoubleArray(actual,logExpected,TOLERANCE); + } + + + @DataProvider(name="qualToTransDataProvider") + public Iterator qualToTransDataProvider() { + return new Iterator() { + + private final Iterator readLengthIterator = readLengthIterator(); + private Iterator qualsIterator = qualIterator(); + + @Override + public boolean hasNext() { + return readLengthIterator.hasNext(); + } + + @Override + public Object[] next() { + final int readLength = readLengthIterator.next(); + double[][] matrix = new double[readLength+1][PairHMMModel.TRANS_PROB_ARRAY_LENGTH]; + final byte[] insQuals = new byte[readLength]; + 
final byte[] delQuals = new byte[readLength]; + final byte[] gapQuals = new byte[readLength]; + for (int i = 0; i < readLength; i++) { + if (!qualsIterator.hasNext()) + qualsIterator = qualIterator(); + final int[] quals = qualsIterator.next(); + final int insQual = quals[0]; + final int delQual = quals[1]; + final int gapQual = quals[2]; + final double[] trans = qualsToProbs(insQual, delQual, gapQual); + matrix[i+1] = trans; + insQuals[i] = (byte)insQual; + delQuals[i] = (byte)delQual; + gapQuals[i] = (byte)gapQual; + } + + return new Object[] { insQuals, delQuals, gapQuals, matrix }; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + + @DataProvider(name="qualToProbsDataProvider") + public Iterator qualToProbsDataProvider() { + return new Iterator() { + private final Iterator qualsIterator = qualIterator(); + + @Override + public boolean hasNext() { + return qualsIterator.hasNext(); + } + + @Override + public Object[] next() { + final int[] quals = qualsIterator.next(); + final int insQual = quals[0]; + final int delQual = quals[1]; + final int gapQual = quals[2]; + + final double[] trans = qualsToProbs(insQual, delQual, gapQual); + + + return new Object[] { insQual, delQual, gapQual, trans }; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + private double[] qualsToProbs(final int insQual, final int delQual, final int gapQual) { + final double[] trans = new double[PairHMMModel.TRANS_PROB_ARRAY_LENGTH]; + final double matchToMatch = PairHMMModel.matchToMatchProb(insQual, delQual); + final double matchToInsert = QualityUtils.qualToErrorProb(insQual); + final double matchToDeletion = QualityUtils.qualToErrorProb(delQual); + final double indelToMatch = QualityUtils.qualToProb(gapQual); + final double indelToIndel = QualityUtils.qualToErrorProb(gapQual); + + trans[PairHMMModel.matchToMatch] = matchToMatch; + trans[PairHMMModel.matchToInsertion] = 
matchToInsert; + trans[PairHMMModel.matchToDeletion] = matchToDeletion; + trans[PairHMMModel.indelToMatch] = indelToMatch; + trans[PairHMMModel.deletionToDeletion] = trans[PairHMMModel.insertionToInsertion] = indelToIndel; + return trans; + } + + private Iterator readLengthIterator() { + return Arrays.asList(READ_LENGTHS).iterator(); + } + + private Iterator qualIterator() { + final int totalCount = INS_QUALS.length * DEL_QUALS.length * GAP_QUALS.length; + + return new Iterator() { + + private int i = 0; + + @Override + public boolean hasNext() { + return i < totalCount; + } + + @Override + public int[] next() { + final int gap = i % GAP_QUALS.length; + final int indelGroup = i / GAP_QUALS.length; + final int del = indelGroup % DEL_QUALS.length; + final int ins = indelGroup % DEL_QUALS.length; + i++; + return new int[] { INS_QUALS[ins], DEL_QUALS[del], GAP_QUALS[gap]}; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + + + @Test(dataProvider = "dualTestDataProvider") + public void testDoubleQualToProb(final int insQual, final int delQual, final double log10Expected, final double expected) { + Assert.assertEquals(PairHMMModel.matchToMatchProb(insQual, delQual),expected,TOLERANCE); + Assert.assertEquals(PairHMMModel.matchToMatchProbLog10(insQual, delQual),log10Expected,TOLERANCE); + Assert.assertEquals(PairHMMModel.matchToMatchProb((byte) insQual, (byte) delQual),expected,TOLERANCE); + Assert.assertEquals(PairHMMModel.matchToMatchProbLog10((byte) insQual, (byte) delQual),log10Expected,TOLERANCE); + } + + @DataProvider(name = "dualTestDataProvider") + private Iterator dualTestDataProvider() { + final int[] testQuals = new int[] { 0, 1, 2, 5, 10, 13, 17, 20, 23, 27, 30, 43, 57, 70, 100, 200, 254}; + + return new Iterator() { + private int i = 0; + private int j = 0; + + @Override + public Object[] next() { + + final int qual1 = testQuals[i]; + final int qual2 = testQuals[j]; + + final double errorProb1 = 
Math.pow(10,- 0.1 * qual1); + final double errorProb2 = Math.pow(10,- 0.1 * qual2); + final double expected = Math.max(0, (1 - (errorProb1 + errorProb2))); + final Object[] result = new Object[] { qual1, qual2,Math.log10(Math.min(1,expected)),Math.min(1, expected)}; + + if (++j >= testQuals.length) { + i++; + j = i; + } + return result; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean hasNext() { + return i < testQuals.length; + } + }; + } + + + private static int[] INS_QUALS = {30, 45, 20, 10, 5, 60, 123 }; + + private static int[] DEL_QUALS = {30, 45, 20, 10, 5, 60, 123 }; + + private static int[] GAP_QUALS = {10, 20, 5}; + + private static Integer[] READ_LENGTHS = { 0, 1, 5, 20, 100, 250}; +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMProbabilityBugIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMProbabilityBugIntegrationTest.java new file mode 100644 index 000000000..69100bcdd --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMProbabilityBugIntegrationTest.java @@ -0,0 +1,86 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; + +/** + * Test for the Prob > 1 bug in PairHMM using callers. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class PairHMMProbabilityBugIntegrationTest extends WalkerTest { + + private static final File REFERENCE = new File("/humgen/gsa-hpprojects/GATK/bundle/current/hg19/ucsc.hg19.fasta").getAbsoluteFile(); + private static final File BAM = new File ("private/testdata", "pairhmm_prob_bug.bam").getAbsoluteFile(); + private static final File INTERVAL = new File ("private/testdata", "pairhmm_prob_bug.interval.bed").getAbsoluteFile(); + + private static final File UG_BAM = new File("private/testdata", "pairhmm_prob_bug.ug.bam").getAbsoluteFile(); + private static final File UG_INTERVAL = new File("private/testdata", "pairhmm_prob_bug.ug.intervals.bed").getAbsoluteFile(); + + + @Test + public void testHaplotypeCaller() { + final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s", + REFERENCE,BAM,INTERVAL); + final String name = getClass().getSimpleName() + ".testHaplotypeCaller"; + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o 
%s", Arrays.asList("")); + executeTest(name, spec); + } + + @Test + public void testUnifiedGenotyper() { + final String commandLine = String.format("-T UnifiedGenotyper -R %s -I %s -L %s -dcov 200 -glm INDEL", + REFERENCE,UG_BAM,UG_INTERVAL); + final String name = getClass().getSimpleName() + ".testUnifiedGenotyper"; + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); + executeTest(name, spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMSyntheticBenchmark.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMSyntheticBenchmark.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMSyntheticBenchmark.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMSyntheticBenchmark.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/ContextCovariateUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/ContextCovariateUnitTest.java new file mode 
100644 index 000000000..7e2581c51 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/ContextCovariateUnitTest.java @@ -0,0 +1,117 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.ContextCovariate; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; +import org.broadinstitute.sting.utils.clipping.ClippingRepresentation; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +/** + * @author Mauricio Carneiro + * @since 3/1/12 + */ +public class ContextCovariateUnitTest { + ContextCovariate covariate; + RecalibrationArgumentCollection RAC; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + covariate = new ContextCovariate(); + covariate.initialize(RAC); + } + + @BeforeMethod + public void initCache() { + 
ReadCovariates.clearKeysCache(); + } + + @Test(enabled = true) + public void testSimpleContexts() { + GATKSAMRecord read = ReadUtils.createRandomRead(1000); + GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + + verifyCovariateArray(readCovariates.getMismatchesKeySet(), RAC.MISMATCHES_CONTEXT_SIZE, clippedRead, covariate); + verifyCovariateArray(readCovariates.getInsertionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); + verifyCovariateArray(readCovariates.getDeletionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); + } + + public static void verifyCovariateArray(int[][] values, int contextSize, GATKSAMRecord read, Covariate contextCovariate) { + for (int i = 0; i < values.length; i++) + Assert.assertEquals(contextCovariate.formatKey(values[i][0]), expectedContext(read, i, contextSize)); + + } + + public static String expectedContext (GATKSAMRecord read, int offset, int contextSize) { + final String bases = stringFrom(read.getReadBases()); + String expectedContext = null; + if (offset - contextSize + 1 >= 0) { + String context = bases.substring(offset - contextSize + 1, offset + 1); + if (!context.contains("N")) + expectedContext = context; + } + return expectedContext; + } + + private static String stringFrom(byte[] array) { + String s = ""; + for (byte value : array) + s += (char) value; + return s; + } + +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java new file mode 100644 index 000000000..4f8a70cc9 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java @@ -0,0 +1,136 @@ +/* +* By 
downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.recalibration.covariates.CycleCovariate; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +/** + * @author Mauricio Carneiro + * @since 3/1/12 + */ +public class CycleCovariateUnitTest { + CycleCovariate covariate; + RecalibrationArgumentCollection RAC; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + covariate = new CycleCovariate(); + covariate.initialize(RAC); + } + + @BeforeMethod + public void initCache() { + ReadCovariates.clearKeysCache(); + } + + @Test(enabled = true) + public void testSimpleCycles() { + short readLength = 10; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + 
read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), 1, (short) 1); + + read.setReadNegativeStrandFlag(true); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), readLength, -1); + + read.setSecondOfPairFlag(true); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), -readLength, 1); + + read.setReadNegativeStrandFlag(false); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), -1, -1); + } + + private void verifyCovariateArray(int[][] values, int init, int increment) { + for (short i = 0; i < values.length; i++) { + short actual = Short.decode(covariate.formatKey(values[i][0])); + int expected = init + (increment * i); + Assert.assertEquals(actual, expected); + } + } + + @Test(enabled = true, expectedExceptions={UserException.class}) + public void testMoreThanMaxCycleFails() { + int readLength = RAC.MAXIMUM_CYCLE_VALUE + 1; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + } + + @Test(enabled = true) + public void testMaxCyclePasses() { + int readLength = RAC.MAXIMUM_CYCLE_VALUE; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + } +} diff --git 
a/protected/java/test/org/broadinstitute/sting/utils/recalibration/QualQuantizerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/QualQuantizerUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/recalibration/QualQuantizerUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/QualQuantizerUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/ReadCovariatesUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/ReadCovariatesUnitTest.java new file mode 100644 index 000000000..eea8aa8f3 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/ReadCovariatesUnitTest.java @@ -0,0 +1,143 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.*; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.Random; + +/** + * @author carneiro + * @since 4/21/12 + */ +public class ReadCovariatesUnitTest { + + @BeforeMethod + public void init() { + ReadCovariates.clearKeysCache(); + } + + @Test(enabled = false) + public void testCovariateGeneration() { + final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + final String RGID = "id"; + + ReadGroupCovariate rgCov = new ReadGroupCovariate(); + QualityScoreCovariate qsCov = new QualityScoreCovariate(); + ContextCovariate coCov = new ContextCovariate(); + CycleCovariate cyCov = new CycleCovariate(); + + rgCov.initialize(RAC); + qsCov.initialize(RAC); + coCov.initialize(RAC); + cyCov.initialize(RAC); + + Covariate[] requestedCovariates = new Covariate[4]; + requestedCovariates[0] = rgCov; + requestedCovariates[1] = qsCov; + requestedCovariates[2] = coCov; + requestedCovariates[3] = cyCov; + + final int NUM_READS = 100; + final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + + final String[] readGroups = {"RG1", "RG2", "RGbla"}; + for (int idx = 0; idx < NUM_READS; idx++) { + for (final String rgs : readGroups) { + final int length = 10 + rnd.nextInt(100); // random read length, at least 10 bp long + final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(rgs); + rg.setPlatform("illumina"); + read.setReadGroup(rg); + 
read.setReadNegativeStrandFlag(rnd.nextBoolean()); + final byte[] mQuals = read.getBaseQualities(EventType.BASE_SUBSTITUTION); + final byte[] iQuals = read.getBaseQualities(EventType.BASE_INSERTION); + final byte[] dQuals = read.getBaseQualities(EventType.BASE_DELETION); + ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); + + // check that the length is correct + Assert.assertEquals(rc.getMismatchesKeySet().length, length); + Assert.assertEquals(rc.getInsertionsKeySet().length, length); + Assert.assertEquals(rc.getDeletionsKeySet().length, length); + + for (int i = 0; i < length; i++) { + // check that read group is always the same + Assert.assertEquals(rgCov.formatKey(rc.getMismatchesKeySet(i)[0]), rgs); + Assert.assertEquals(rgCov.formatKey(rc.getInsertionsKeySet(i)[0]), rgs); + Assert.assertEquals(rgCov.formatKey(rc.getDeletionsKeySet(i)[0]), rgs); + + // check quality score + Assert.assertEquals(qsCov.formatKey(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]); + Assert.assertEquals(qsCov.formatKey(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]); + Assert.assertEquals(qsCov.formatKey(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]); + + // check context + Assert.assertEquals(coCov.formatKey(rc.getMismatchesKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.MISMATCHES_CONTEXT_SIZE)); + Assert.assertEquals(coCov.formatKey(rc.getInsertionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); + Assert.assertEquals(coCov.formatKey(rc.getDeletionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); + + // check cycle + Assert.assertEquals(cyCov.formatKey(rc.getMismatchesKeySet(i)[3]), "" + (i+1)); + Assert.assertEquals(cyCov.formatKey(rc.getInsertionsKeySet(i)[3]), "" + (i+1)); + Assert.assertEquals(cyCov.formatKey(rc.getDeletionsKeySet(i)[3]), "" + (i+1)); + } + + } + + } + + } + +} diff --git 
a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/ReadGroupCovariateUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/ReadGroupCovariateUnitTest.java new file mode 100644 index 000000000..a8366ce5c --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/ReadGroupCovariateUnitTest.java @@ -0,0 +1,121 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.ReadGroupCovariate; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +/** + * @author Mauricio Carneiro + * @since 3/1/12 + */ +public class ReadGroupCovariateUnitTest { + ReadGroupCovariate covariate; + RecalibrationArgumentCollection RAC; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + covariate = new ReadGroupCovariate(); + covariate.initialize(RAC); + } + + @BeforeMethod + public void initCache() { + ReadCovariates.clearKeysCache(); + } + + @Test(enabled = true) + public void testSingleRecord() { + final String expected = "SAMPLE.1"; + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID"); + rg.setPlatformUnit(expected); + runTest(rg, expected, covariate); + } + + @Test(enabled = true) + public void testMissingPlatformUnit() { + final String expected = "MY.7"; + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(expected); + runTest(rg, expected, covariate); + } + + @Test(enabled = true) + public void testForceReadgroup() { + final RecalibrationArgumentCollection forcedRAC = new RecalibrationArgumentCollection(); + forcedRAC.FORCE_READGROUP = "FOO"; + final ReadGroupCovariate forcedCovariate = new ReadGroupCovariate(); + forcedCovariate.initialize(forcedRAC); + + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("NOT_FOO"); + runTest(rg, "FOO", forcedCovariate); + } + + private static void runTest(final GATKSAMReadGroupRecord rg, final String expected, final ReadGroupCovariate covariate) { + GATKSAMRecord read 
= ReadUtils.createRandomRead(10); + read.setReadGroup(rg); + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), expected, covariate); + + } + + private static void verifyCovariateArray(final int[][] values, final String expected, final ReadGroupCovariate covariate) { + for (int[] value : values) { + String actual = covariate.formatKey(value[0]); + Assert.assertEquals(actual, expected); + } + } + +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java new file mode 100644 index 000000000..d3c3ffe97 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java @@ -0,0 +1,171 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - 
SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.*; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.collections.NestedIntegerArray; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * @author carneiro + * @since 4/21/12 + */ +public class RecalibrationReportUnitTest { + @BeforeMethod + public void init() { + ReadCovariates.clearKeysCache(); + } + + private static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { + final Random random = new Random(); + final int nObservations = random.nextInt(maxObservations); + final int nErrors = Math.min(random.nextInt(maxErrors), nObservations); + final int qual = random.nextInt(QualityUtils.MAX_SAM_QUAL_SCORE); + return new RecalDatum((long)nObservations, (double)nErrors, (byte)qual); + } + + @Test + public void testOutput() { + final int length = 100; + + 
List quals = new ArrayList(QualityUtils.MAX_SAM_QUAL_SCORE + 1); + List counts = new ArrayList(QualityUtils.MAX_SAM_QUAL_SCORE + 1); + + for (int i = 0; i<= QualityUtils.MAX_SAM_QUAL_SCORE; i++) { + quals.add((byte) i); + counts.add(1L); + } + + final QuantizationInfo quantizationInfo = new QuantizationInfo(quals, counts); + final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + + quantizationInfo.noQuantization(); + final List requiredCovariates = new LinkedList(); + final List optionalCovariates = new LinkedList(); + + final ReadGroupCovariate rgCovariate = new ReadGroupCovariate(); + rgCovariate.initialize(RAC); + requiredCovariates.add(rgCovariate); + + final QualityScoreCovariate qsCovariate = new QualityScoreCovariate(); + qsCovariate.initialize(RAC); + requiredCovariates.add(qsCovariate); + + final ContextCovariate cxCovariate = new ContextCovariate(); + cxCovariate.initialize(RAC); + optionalCovariates.add(cxCovariate); + final CycleCovariate cyCovariate = new CycleCovariate(); + cyCovariate.initialize(RAC); + optionalCovariates.add(cyCovariate); + + final Covariate[] requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; + int covariateIndex = 0; + for (final Covariate cov : requiredCovariates) + requestedCovariates[covariateIndex++] = cov; + for (final Covariate cov : optionalCovariates) + requestedCovariates[covariateIndex++] = cov; + + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("id"); + rg.setPlatform("illumina"); + final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); + read.setReadGroup(rg); + final byte [] readQuals = new byte[length]; + for (int i = 0; i < length; i++) + readQuals[i] = 20; + read.setBaseQualities(readQuals); + + final int expectedKeys = expectedNumberOfKeys(length, RAC.INDELS_CONTEXT_SIZE, RAC.MISMATCHES_CONTEXT_SIZE); + int nKeys = 0; // keep track of how many keys were produced + final ReadCovariates rc = 
RecalUtils.computeCovariates(read, requestedCovariates); + + final RecalibrationTables recalibrationTables = new RecalibrationTables(requestedCovariates); + final NestedIntegerArray rgTable = recalibrationTables.getReadGroupTable(); + final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); + + for (int offset = 0; offset < length; offset++) { + + for (EventType errorMode : EventType.values()) { + + final int[] covariates = rc.getKeySet(offset, errorMode); + final int randomMax = errorMode == EventType.BASE_SUBSTITUTION ? 10000 : 100000; + + rgTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.ordinal()); + qualTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.ordinal()); + nKeys += 2; + for (int j = 0; j < optionalCovariates.size(); j++) { + final NestedIntegerArray covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j); + final int covValue = covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j]; + if ( covValue >= 0 ) { + covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], covValue, errorMode.ordinal()); + nKeys++; + } + } + } + } + Assert.assertEquals(nKeys, expectedKeys); + } + + private static int expectedNumberOfKeys (int readLength, int indelContextSize, int mismatchesContextSize) { + final int numCovariates = 4; + final int numTables = 3; + final int mismatchContextPadding = mismatchesContextSize - 1; + final int indelContextPadding = 2 * (indelContextSize - 1); + final int indelCyclePadding = 2 * (2 * CycleCovariate.CUSHION_FOR_INDELS); + + return (numCovariates * numTables * readLength) - mismatchContextPadding - indelContextPadding - indelCyclePadding; + } + +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTablesUnitTest.java 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalibrationTablesUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTablesUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalibrationTablesUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTestUtils.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalibrationTestUtils.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTestUtils.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalibrationTestUtils.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java new file mode 100644 index 000000000..74cb2a1eb --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java @@ -0,0 +1,245 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.recalibration; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.*; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Random; + +public class RepeatCovariatesUnitTest { + + RepeatLengthCovariate rlCovariate; + RepeatUnitCovariate ruCovariate; + RepeatUnitAndLengthCovariate rurlCovariate; + RecalibrationArgumentCollection RAC; + + + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + rlCovariate = new RepeatLengthCovariate(); + ruCovariate = new RepeatUnitCovariate(); + rurlCovariate = new RepeatUnitAndLengthCovariate(); + rlCovariate.initialize(RAC); + ruCovariate.initialize(RAC); + 
rurlCovariate.initialize(RAC); + } + + @BeforeMethod + public void initCache() { + ReadCovariates.clearKeysCache(); + } + + + @Test + public void testFindNumberOfRepetitions() { + // First, test logic to compute number of repetitions of a substring on a given string. + int result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACAC".getBytes(), true); + Assert.assertEquals(2,result); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACAC".getBytes(), true); + Assert.assertEquals(4,result); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), true); + Assert.assertEquals(4,result); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), true); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), true); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), true); + Assert.assertEquals(1,result); + result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), true); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), true); + Assert.assertEquals(0,result); + // Same tests but looking backward on string + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACAC".getBytes(), false); + Assert.assertEquals(2,result); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACAC".getBytes(), false); + Assert.assertEquals(4,result); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), false); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), 
"GTACACACAC".getBytes(), false); + Assert.assertEquals(4,result); + result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), false); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), false); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), false); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), false); + Assert.assertEquals(3,result); + + // test logic to get repeat unit and number of repeats from covariate value + final String[] repUnits = new String[]{"AG","CCG","TCCA","T"}; + for (String ru : repUnits) { + for (int k=1; k < 10; k++) { + Pair pair = RepeatLengthCovariate.getRUandNRfromCovariate(String.format("%s%d",ru,k)); + Assert.assertEquals(pair.second.intValue(),k); + Assert.assertEquals(pair.first,ru); + } + } + + } + + /** + * Build synthetic reads with random content made up of tandem repeats, record computed Repeat Unit and # repeats and see if + * they match with read context + */ + @Test + public void testManyObservations() { + final int NUM_UNITS = 10; + final int MAX_REPEAT_UNIT_LENGTH = RAC.MAX_STR_UNIT_LENGTH; + final int MAX_NUM_REPETITIONS = RAC.MAX_REPEAT_LENGTH; + final int NUM_TEST_CASES = 100; + + Random random = new Random(); + + for (int r = 0; r < NUM_TEST_CASES; r++) { + final StringBuilder sb = new StringBuilder(); + // for each unit, generate a repeat unit at random with given random length + final ArrayList repeatUnits = new ArrayList(); + final ArrayList numsRepetitions = new ArrayList(); + for (int n=0; n < NUM_UNITS; n++) { + final int repLength = 1+random.nextInt(MAX_REPEAT_UNIT_LENGTH); + final String repeatUnit = getRandomBases(repLength); + final int numRepetitions = 
1+random.nextInt(MAX_NUM_REPETITIONS); + + // log for comparison with covariate + numsRepetitions.add(numRepetitions); + repeatUnits.add(repeatUnit); + + for (int k=0; k < numRepetitions; k++) + sb.append(repeatUnit); + + } + + final String readBases = sb.toString(); + System.out.println(readBases); + final int readLength = readBases.length(); + + final byte[] readQuals = new byte[readLength]; + Arrays.fill(readQuals,(byte)30); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(),readQuals,readLength+"M"); + + Covariate[] requestedCovariates = new Covariate[3]; + requestedCovariates[0] = rlCovariate; + requestedCovariates[1] = ruCovariate; + requestedCovariates[2] = rurlCovariate; + ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); + + // check that the length is correct + Assert.assertEquals(rc.getMismatchesKeySet().length, readLength); + Assert.assertEquals(rc.getInsertionsKeySet().length, readLength); + Assert.assertEquals(rc.getDeletionsKeySet().length, readLength); + + for (int offset = 0; offset < readBases.length(); offset++) { // recalibrate all bases in the read + // check RepeatLength + final String rlValM = rlCovariate.formatKey(rc.getMismatchesKeySet(offset)[0]); + final String rlValI = rlCovariate.formatKey(rc.getInsertionsKeySet(offset)[0]); + final String rlValD = rlCovariate.formatKey(rc.getDeletionsKeySet(offset)[0]); + // check RepeatUnit + final String ruValM = ruCovariate.formatKey(rc.getMismatchesKeySet(offset)[1]); + final String ruValI = ruCovariate.formatKey(rc.getInsertionsKeySet(offset)[1]); + final String ruValD = ruCovariate.formatKey(rc.getDeletionsKeySet(offset)[1]); + // check RepeatUnitAndLength + final String rurlValM = rurlCovariate.formatKey(rc.getMismatchesKeySet(offset)[2]); + final String rurlValI = rurlCovariate.formatKey(rc.getInsertionsKeySet(offset)[2]); + final String rurlValD = rurlCovariate.formatKey(rc.getDeletionsKeySet(offset)[2]); + // check all 3 values 
are identical + Assert.assertEquals(rlValD,rlValI); + Assert.assertEquals(rlValM,rlValI); + Assert.assertEquals(ruValD,ruValI); + Assert.assertEquals(ruValM,ruValI); + Assert.assertEquals(rurlValD,rurlValI); + Assert.assertEquals(rurlValM,rurlValI); + + + int fw = GATKVariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(offset+1,readLength).getBytes(),true); + int bw = GATKVariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(0,offset+1).getBytes(),false); + Assert.assertEquals(Math.min(fw+bw,RAC.MAX_REPEAT_LENGTH),(int)Integer.valueOf(rlValM)); + } + + } + + + + + + + } + + /** + * Returns random bases of given length + * @param length required length + * @return given random string + */ + @Requires("length > 0") + String getRandomBases(final int length) { + byte[] bases = new byte[length]; + Random ran = new Random(); + for (int i=0; i < length; i++ ) { + int idx = ran.nextInt(4); + bases[i] = BaseUtils.baseIndexToSimpleBase(idx); + } + return new String(bases); + } + + +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignmentUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignmentUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignmentUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignmentUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentUnitTest.java rename to 
protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java deleted file mode 100644 index 5c48417ac..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java +++ /dev/null @@ -1,120 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.variant.vcf.VCFStandardHeaderLines; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Total (unfiltered) depth over all samples. - * - *

While the sample-level (FORMAT) DP field describes the total depth of reads that passed the caller's - * internal quality control metrics (like MAPQ > 17, for example), the INFO field DP represents the unfiltered depth - * over all samples. Note though that the DP is affected by downsampling (-dcov), so the max value one can obtain for - * N samples with -dcov D is N * D - *

- */ -public class Coverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { - - public Map annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc, - final Map perReadAlleleLikelihoodMap ) { - - int depth = 0; - if (stratifiedContexts != null) { - if ( stratifiedContexts.size() == 0 ) - return null; - - for ( Map.Entry sample : stratifiedContexts.entrySet() ) - depth += sample.getValue().getBasePileup().depthOfCoverage(); - } - else if (perReadAlleleLikelihoodMap != null) { - if ( perReadAlleleLikelihoodMap.size() == 0 ) - return null; - - for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) { - for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { - final GATKSAMRecord read = el.getKey(); - depth += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); - } - } - } - else - return null; - - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%d", depth)); - return map; - } - - public List getKeyNames() { return Arrays.asList(VCFConstants.DEPTH_KEY); } - - public List getDescriptions() { - return Arrays.asList(VCFStandardHeaderLines.getInfoLine(getKeyNames().get(0))); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java deleted file mode 100644 index 0da865a85..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ /dev/null @@ -1,166 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This 
Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; -import org.broadinstitute.variant.vcf.VCFStandardHeaderLines; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypeBuilder; -import 
org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.*; - - -/** - * The depth of coverage of each allele per sample - * - *

The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this - * sample at this site. While the sample-level (FORMAT) DP field describes the total depth of reads that passed the - * caller's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of - * REF and ALT fields) is the unfiltered count of all reads that carried with them the - * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the - * power I have to determine the genotype of the sample at this site, while the AD tells me how many times - * I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering - * the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like - * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would - * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that - * the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted. - * Because of this fact, the sum of AD may be different than the individual sample depth, especially when there are - * many non-informative reads.

- * - *

Because the AD includes reads and bases that were filtered by the caller and in case of indels is based on a statistical computation, - * one should not base assumptions about the underlying genotype based on it; - * instead, the genotype likelihoods (PLs) are what determine the genotype calls.

- * - */ -public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation { - - public void annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final AlignmentContext stratifiedContext, - final VariantContext vc, - final Genotype g, - final GenotypeBuilder gb, - final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { - if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) - return; - - if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty()) - annotateWithLikelihoods(alleleLikelihoodMap, vc, gb); - else if ( stratifiedContext != null && (vc.isSNP())) - annotateWithPileup(stratifiedContext, vc, gb); - } - - private void annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) { - - final HashMap alleleCounts = new HashMap<>(); - for ( final Allele allele : vc.getAlleles() ) - alleleCounts.put(allele.getBases()[0], 0); - - final ReadBackedPileup pileup = stratifiedContext.getBasePileup(); - for ( final PileupElement p : pileup ) { - if ( alleleCounts.containsKey(p.getBase()) ) - alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+p.getRepresentativeCount()); - } - - // we need to add counts in the correct order - final int[] counts = new int[alleleCounts.size()]; - counts[0] = alleleCounts.get(vc.getReference().getBases()[0]); - for (int i = 0; i < vc.getAlternateAlleles().size(); i++) - counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]); - - gb.AD(counts); - } - - private void annotateWithLikelihoods(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc, final GenotypeBuilder gb) { - final Set alleles = new HashSet<>(vc.getAlleles()); - - // make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext - if ( ! 
perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) ) - throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet()); - - final HashMap alleleCounts = new HashMap<>(); - for ( final Allele allele : vc.getAlleles() ) { alleleCounts.put(allele, 0); } - - for ( final Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { - final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); - if (! a.isInformative() ) continue; // read is non-informative - final GATKSAMRecord read = el.getKey(); - final int prevCount = alleleCounts.get(a.getMostLikelyAllele()); - final int incCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; - alleleCounts.put(a.getMostLikelyAllele(), prevCount + incCount); - } - - final int[] counts = new int[alleleCounts.size()]; - counts[0] = alleleCounts.get(vc.getReference()); - for (int i = 0; i < vc.getAlternateAlleles().size(); i++) - counts[i+1] = alleleCounts.get( vc.getAlternateAllele(i) ); - - gb.AD(counts); - } - - public List getKeyNames() { return Arrays.asList(VCFConstants.GENOTYPE_ALLELE_DEPTHS); } - - public List getDescriptions() { - return Arrays.asList(VCFStandardHeaderLines.getFormatLine(getKeyNames().get(0))); - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java deleted file mode 100644 index 21325e6f1..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java +++ /dev/null @@ -1,129 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR 
ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; -import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypeBuilder; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; -import org.broadinstitute.variant.vcf.VCFStandardHeaderLines; - -import java.util.*; - - -/** - * The depth of coverage for informative reads for each sample. 
- * - * An informative read is defined as one from which the allele it carries can be easily distinguished. An example of a - * case where a read might be uninformative is where it only partially overlaps a short tandem repeat and it is not clear - * whether the read contains the reference allele or e.g. an extra repeat. - * The depth here is the sum of the informative reads at this site as determined by the Haplotype Caller; as such it can - * only be calculated and generated through the Haplotype Caller (it will not work when run through the Variant Annotator). - * This calculation is not perfect but it is a pretty good proxy for depth and it does match the values in the AD field - * (i.e., sum(AD) = DP). - */ -public class DepthPerSampleHC extends GenotypeAnnotation { - public void annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final AlignmentContext stratifiedContext, - final VariantContext vc, - final Genotype g, - final GenotypeBuilder gb, - final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { - if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) - return; - - if (alleleLikelihoodMap == null ) - throw new IllegalStateException("DepthPerSampleHC can only be used with likelihood based annotations in the HaplotypeCaller"); - - // the depth for the HC is the sum of the informative alleles at this site. It's not perfect (as we cannot - // differentiate between reads that align over the event but aren't informative vs. those that aren't even - // close) but it's a pretty good proxy and it matches with the AD field (i.e., sum(AD) = DP). - int dp = 0; - - if ( alleleLikelihoodMap.isEmpty() ) { - // there are no reads - } else { - final Set alleles = new HashSet<>(vc.getAlleles()); - - // make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext - if ( ! 
alleleLikelihoodMap.getAllelesSet().containsAll(alleles) ) - throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + alleleLikelihoodMap.getAllelesSet()); - - for (Map.Entry> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { - final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); - if ( a.isInformative() ) { - final GATKSAMRecord read = el.getKey(); - final int incCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; - dp += incCount; - } - } - - gb.DP(dp); - } - } - - public List getKeyNames() { - return Collections.singletonList(VCFConstants.DEPTH_KEY); - } - - public List getDescriptions() { - return Collections.singletonList(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY)); - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java deleted file mode 100644 index 95be967a2..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ /dev/null @@ -1,457 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import cern.jet.math.Arithmetic; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypesContext; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; 
-import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.*; - - -/** - * Phred-scaled p-value using Fisher's Exact Test to detect strand bias - * - *

Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation - * being seen on only the forward or only the reverse strand) in the reads. More bias is - * indicative of false positive calls. - *

- * - *

Caveat

- *

The Fisher Strand test may not be calculated for certain complex indel cases or for multi-allelic sites.

- */ -public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { - private final static boolean ENABLE_DEBUGGING = false; - private final static Logger logger = Logger.getLogger(FisherStrand.class); - - private static final String FS = "FS"; - private static final double MIN_PVALUE = 1E-320; - private static final int MIN_QUAL_FOR_FILTERED_TEST = 17; - - public Map annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc, - final Map stratifiedPerReadAlleleLikelihoodMap) { - if ( !vc.isVariant() ) - return null; - - if ( vc.hasGenotypes() ) { - final int[][] tableFromPerSampleAnnotations = getTableFromSamples( vc.getGenotypes() ); - if ( tableFromPerSampleAnnotations != null ) { - return pValueForBestTable(tableFromPerSampleAnnotations, null); - } - } - - if (vc.isSNP() && stratifiedContexts != null) { - final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1); - final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST); - printTable("unfiltered", tableNoFiltering); - printTable("filtered", tableFiltering); - return pValueForBestTable(tableFiltering, tableNoFiltering); - } - else if (stratifiedPerReadAlleleLikelihoodMap != null) { - // either SNP with no alignment context, or indels: per-read likelihood map needed - final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc); -// logger.info("VC " + vc); -// printTable(table, 0.0); - return pValueForBestTable(table, null); - } - else - // for non-snp variants, we need per-read likelihoods. 
- // for snps, we can get same result from simple pileup - return null; - } - - /** - * Create the FisherStrand table by retrieving the per-sample strand bias annotation and adding them together - * @param genotypes the genotypes from which to pull out the per-sample strand bias annotation - * @return the table used for the FisherStrand p-value calculation, will be null if none of the genotypes contain the per-sample SB annotation - */ - private int[][] getTableFromSamples( final GenotypesContext genotypes ) { - if( genotypes == null ) { throw new IllegalArgumentException("Genotypes cannot be null."); } - - final int[] sbArray = {0,0,0,0}; // forward-reverse -by- alternate-reference - boolean foundData = false; - - for( final Genotype g : genotypes ) { - if( g.isNoCall() || ! g.hasAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME) ) - continue; - - foundData = true; - final String sbbsString = (String) g.getAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME); - final int[] data = encodeSBBS(sbbsString); - for( int index = 0; index < sbArray.length; index++ ) { - sbArray[index] += data[index]; - } - } - - return ( foundData ? decodeSBBS(sbArray) : null ); - } - - /** - * Create an annotation for the highest (i.e., least significant) p-value of table1 and table2 - * - * @param table1 a contingency table, may be null - * @param table2 a contingency table, may be null - * @return annotation result for FS given tables - */ - private Map pValueForBestTable(final int[][] table1, final int[][] table2) { - if ( table2 == null ) - return table1 == null ? 
null : annotationForOneTable(pValueForContingencyTable(table1)); - else if (table1 == null) - return annotationForOneTable(pValueForContingencyTable(table2)); - else { // take the one with the best (i.e., least significant pvalue) - double pvalue1 = pValueForContingencyTable(table1); - double pvalue2 = pValueForContingencyTable(table2); - return annotationForOneTable(Math.max(pvalue1, pvalue2)); - } - } - - /** - * Returns an annotation result given a pValue - * - * @param pValue - * @return a hash map from FS -> phred-scaled pValue - */ - private Map annotationForOneTable(final double pValue) { - final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE))); // prevent INFINITYs - return Collections.singletonMap(FS, value); - } - - public List getKeyNames() { - return Collections.singletonList(FS); - } - - public List getDescriptions() { - return Collections.singletonList(new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias")); - } - - /** - * Helper function to turn the FisherStrand table into the SB annotation array - * @param table the table used by the FisherStrand annotation - * @return the array used by the per-sample Strand Bias annotation - */ - public static int[] getContingencyArray( final int[][] table ) { - if(table.length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); } - if(table[0].length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); } - final int[] array = new int[4]; // TODO - if we ever want to do something clever with multi-allelic sites this will need to change - array[0] = table[0][0]; - array[1] = table[0][1]; - array[2] = table[1][0]; - array[3] = table[1][1]; - return array; - } - - /** - * Helper function to parse the genotype annotation into the SB annotation array - * @param string the string that is returned by genotype.getAnnotation("SB") - * @return 
the array used by the per-sample Strand Bias annotation - */ - private static int[] encodeSBBS( final String string ) { - final int[] array = new int[4]; - final StringTokenizer tokenizer = new StringTokenizer(string, ",", false); - for( int index = 0; index < 4; index++ ) { - array[index] = Integer.parseInt(tokenizer.nextToken()); - } - return array; - } - - /** - * Helper function to turn the SB annotation array into the FisherStrand table - * @param array the array used by the per-sample Strand Bias annotation - * @return the table used by the FisherStrand annotation - */ - private static int[][] decodeSBBS( final int[] array ) { - if(array.length != 4) { throw new IllegalArgumentException("Expecting a length = 4 strand bias array."); } - final int[][] table = new int[2][2]; - table[0][0] = array[0]; - table[0][1] = array[1]; - table[1][0] = array[2]; - table[1][1] = array[3]; - return table; - } - - private Double pValueForContingencyTable(int[][] originalTable) { - int [][] table = copyContingencyTable(originalTable); - - double pCutoff = computePValue(table); - //printTable(table, pCutoff); - - double pValue = pCutoff; - while (rotateTable(table)) { - double pValuePiece = computePValue(table); - - //printTable(table, pValuePiece); - - if (pValuePiece <= pCutoff) { - pValue += pValuePiece; - } - } - - table = copyContingencyTable(originalTable); - while (unrotateTable(table)) { - double pValuePiece = computePValue(table); - - //printTable(table, pValuePiece); - - if (pValuePiece <= pCutoff) { - pValue += pValuePiece; - } - } - - //System.out.printf("P-cutoff: %f\n", pCutoff); - //System.out.printf("P-value: %f\n\n", pValue); - - // min is necessary as numerical precision can result in pValue being slightly greater than 1.0 - return Math.min(pValue, 1.0); - } - - private static int [][] copyContingencyTable(int [][] t) { - int[][] c = new int[2][2]; - - for ( int i = 0; i < 2; i++ ) - for ( int j = 0; j < 2; j++ ) - c[i][j] = t[i][j]; - - return c; - } - - - 
private static void printTable(int[][] table, double pValue) { - logger.info(String.format("%d %d; %d %d : %f", table[0][0], table[0][1], table[1][0], table[1][1], pValue)); - } - - /** - * Printing information to logger.info for debugging purposes - * - * @param name the name of the table - * @param table the table itself - */ - private void printTable(final String name, final int[][] table) { - if ( ENABLE_DEBUGGING ) { - final String pValue = (String)annotationForOneTable(pValueForContingencyTable(table)).get(FS); - logger.info(String.format("FS %s (REF+, REF-, ALT+, ALT-) = (%d, %d, %d, %d) = %s", - name, table[0][0], table[0][1], table[1][0], table[1][1], pValue)); - } - } - - private static boolean rotateTable(int[][] table) { - table[0][0] -= 1; - table[1][0] += 1; - - table[0][1] += 1; - table[1][1] -= 1; - - return (table[0][0] >= 0 && table[1][1] >= 0); - } - - private static boolean unrotateTable(int[][] table) { - table[0][0] += 1; - table[1][0] -= 1; - - table[0][1] -= 1; - table[1][1] += 1; - - return (table[0][1] >= 0 && table[1][0] >= 0); - } - - private static double computePValue(int[][] table) { - - int[] rowSums = { sumRow(table, 0), sumRow(table, 1) }; - int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) }; - int N = rowSums[0] + rowSums[1]; - - // calculate in log space so we don't die with high numbers - double pCutoff = Arithmetic.logFactorial(rowSums[0]) - + Arithmetic.logFactorial(rowSums[1]) - + Arithmetic.logFactorial(colSums[0]) - + Arithmetic.logFactorial(colSums[1]) - - Arithmetic.logFactorial(table[0][0]) - - Arithmetic.logFactorial(table[0][1]) - - Arithmetic.logFactorial(table[1][0]) - - Arithmetic.logFactorial(table[1][1]) - - Arithmetic.logFactorial(N); - return Math.exp(pCutoff); - } - - private static int sumRow(int[][] table, int column) { - int sum = 0; - for (int r = 0; r < table.length; r++) { - sum += table[r][column]; - } - - return sum; - } - - private static int sumColumn(int[][] table, int row) { - int sum = 0; 
- for (int c = 0; c < table[row].length; c++) { - sum += table[row][c]; - } - - return sum; - } - - /** - Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: - * fw rc - * allele1 # # - * allele2 # # - * @return a 2x2 contingency table - */ - public static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { - if( stratifiedPerReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("stratifiedPerReadAlleleLikelihoodMap cannot be null"); } - if( vc == null ) { throw new IllegalArgumentException("input vc cannot be null"); } - - final Allele ref = vc.getReference(); - final Allele alt = vc.getAltAlleleWithHighestAlleleCount(); - final int[][] table = new int[2][2]; - - for (final PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { - for (final Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { - final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); - final GATKSAMRecord read = el.getKey(); - final int representativeCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; - updateTable(table, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt, representativeCount); - } - } - - return table; - } - - /** - Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: - * fw rc - * allele1 # # - * allele2 # # - * @return a 2x2 contingency table - */ - private static int[][] getSNPContingencyTable(final Map stratifiedContexts, - final Allele ref, - final Allele alt, - final int minQScoreToConsider ) { - int[][] table = new int[2][2]; - - for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - for (PileupElement p : sample.getValue().getBasePileup()) { - - if ( ! 
isUsableBase(p) ) // ignore deletions and bad MQ - continue; - - if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider ) - continue; - - updateTable(table, Allele.create(p.getBase(), false), p.getRead(), ref, alt, p.getRepresentativeCount()); - } - } - - return table; - } - - /** - * Can the base in this pileup element be used in comparative tests? - * - * @param p the pileup element to consider - * - * @return true if this base is part of a meaningful read for comparison, false otherwise - */ - private static boolean isUsableBase(final PileupElement p) { - return !( p.isDeletion() || - p.getMappingQual() == 0 || - p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || - ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); - } - - private static void updateTable(final int[][] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt, final int representativeCount) { - - final boolean matchesRef = allele.equals(ref, true); - final boolean matchesAlt = allele.equals(alt, true); - - if ( matchesRef || matchesAlt ) { - final int row = matchesRef ? 0 : 1; - - if ( read.isStrandless() ) { - - // ignore strandless reduced reads because they are always on the forward strand! - if ( !read.isReducedRead() ) { - - // a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1 - // (the 1 is to ensure that a strandless read always counts as an observation on both strands, even - // if the read is only seen once, because it's a merged read or other) - final int toAdd = Math.max(representativeCount / 2, 1); - table[row][0] += toAdd; - table[row][1] += toAdd; - } - } else { - // a normal read with an actual strand - final boolean isFW = !read.getReadNegativeStrandFlag(); - final int column = isFW ? 
0 : 1; - table[row][column] += representativeCount; - } - } - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java deleted file mode 100644 index 906cfa021..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ /dev/null @@ -1,157 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.gatk.walkers.coverage.DepthOfCoverage; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypesContext; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.*; - -/** - * Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples. Note that the QD is also normalized by event length. - * - * Low scores are indicative of false positive calls and artifacts. Note that QualByDepth requires sequencing - * reads associated with the samples with polymorphic genotypes. 
- */ -public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { -// private final static Logger logger = Logger.getLogger(QualByDepth.class); - - public Map annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc, - final Map perReadAlleleLikelihoodMap ) { - if ( !vc.hasLog10PError() ) - return null; - - final GenotypesContext genotypes = vc.getGenotypes(); - if ( genotypes == null || genotypes.size() == 0 ) - return null; - - int depth = 0; - - for ( final Genotype genotype : genotypes ) { - - // we care only about variant calls with likelihoods - if ( !genotype.isHet() && !genotype.isHomVar() ) - continue; - - if (stratifiedContexts!= null && !stratifiedContexts.isEmpty()) { - final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); - if ( context == null ) - continue; - depth += context.getBasePileup().depthOfCoverage(); - - } else if (perReadAlleleLikelihoodMap != null) { - final PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = perReadAlleleLikelihoodMap.get(genotype.getSampleName()); - if (perReadAlleleLikelihoods == null || perReadAlleleLikelihoods.isEmpty()) - continue; - - depth += perReadAlleleLikelihoods.getNumberOfStoredElements(); - } else if (genotype.hasDP() && vc.isBiallelic()) { // TODO -- this currently only works with biallelic variants for now because multiallelics have had their PLs stripped out and therefore their qual score can't be recomputed - depth += genotype.getDP(); - } - } - - if ( depth == 0 ) - return null; - - final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc); - double QD = -10.0 * vc.getLog10PError() / ((double)depth * altAlleleLength); - QD = fixTooHighQD(QD); - Map map = new HashMap<>(); - map.put(getKeyNames().get(0), String.format("%.2f", QD)); - return map; - } - - /** - * The haplotype caller 
generates very high quality scores when multiple events are on the - * same haplotype. This causes some very good variants to have unusually high QD values, - * and VQSR will filter these out. This code looks at the QD value, and if it is above - * threshold we map it down to the mean high QD value, with some jittering - * - * // TODO -- remove me when HaplotypeCaller bubble caller is live - * - * @param QD the raw QD score - * @return a QD value - */ - private double fixTooHighQD(final double QD) { - if ( QD < MAX_QD_BEFORE_FIXING ) { - return QD; - } else { - return IDEAL_HIGH_QD + GenomeAnalysisEngine.getRandomGenerator().nextGaussian() * JITTER_SIGMA; - } - } - - private final static double MAX_QD_BEFORE_FIXING = 35; - private final static double IDEAL_HIGH_QD = 30; - private final static double JITTER_SIGMA = 3; - - public List getKeyNames() { return Arrays.asList("QD"); } - - public List getDescriptions() { - return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); - } - - -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java deleted file mode 100644 index d9bc5966c..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java +++ /dev/null @@ -1,123 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.variant.vcf.VCFStandardHeaderLines; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.*; - - -/** - * Root Mean Square of the mapping quality of the reads 
across all samples. - */ -public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { - - public Map annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc, - final Map perReadAlleleLikelihoodMap ) { - - final List qualities = new ArrayList<>(); - if ( stratifiedContexts != null ) { - if ( stratifiedContexts.size() == 0 ) - return null; - - for ( final Map.Entry sample : stratifiedContexts.entrySet() ) { - final AlignmentContext context = sample.getValue(); - for ( final PileupElement p : context.getBasePileup() ) - fillMappingQualitiesFromPileup(p.getRead().getMappingQuality(), p.getRepresentativeCount(), qualities); - } - } - else if (perReadAlleleLikelihoodMap != null) { - if ( perReadAlleleLikelihoodMap.size() == 0 ) - return null; - - for ( final PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) { - for ( final GATKSAMRecord read : perReadLikelihoods.getStoredElements() ) - fillMappingQualitiesFromPileup(read.getMappingQuality(), (read.isReducedRead() ? 
read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1), qualities); - } - } - else - return null; - - final double rms = MathUtils.rms(qualities); - return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.2f", rms)); - } - - private static void fillMappingQualitiesFromPileup(final int mq, final int representativeCount, final List qualities) { - if ( mq != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) { - if ( representativeCount == 1 ) - qualities.add(mq); - else - qualities.addAll(Collections.nCopies(representativeCount, mq)); - } - } - - public List getKeyNames() { return Arrays.asList(VCFConstants.RMS_MAPPING_QUALITY_KEY); } - - public List getDescriptions() { - return Arrays.asList(VCFStandardHeaderLines.getInfoLine(getKeyNames().get(0))); - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java deleted file mode 100644 index ab5a40145..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ /dev/null @@ -1,266 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.MannWhitneyU; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.Genotype; -import 
org.broadinstitute.variant.variantcontext.GenotypesContext; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.*; - - -/** - * Abstract root for all RankSum based annotations - */ -public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { - static final boolean DEBUG = false; - private boolean useDithering = true; - - public Map annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc, - final Map stratifiedPerReadAlleleLikelihoodMap) { - // either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null - - final GenotypesContext genotypes = vc.getGenotypes(); - if (genotypes == null || genotypes.size() == 0) - return null; - - final ArrayList refQuals = new ArrayList<>(); - final ArrayList altQuals = new ArrayList<>(); - - for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { - - boolean usePileup = true; - - if ( stratifiedPerReadAlleleLikelihoodMap != null ) { - final PerReadAlleleLikelihoodMap likelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); - if ( likelihoodMap != null && !likelihoodMap.isEmpty() ) { - fillQualsFromLikelihoodMap(vc.getAlleles(), vc.getStart(), likelihoodMap, refQuals, altQuals); - usePileup = false; - } - } - - // the old UG SNP-only path through the annotations - if ( usePileup && stratifiedContexts != null ) { - final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); - if ( context != null ) { - final ReadBackedPileup pileup = context.getBasePileup(); - if ( pileup != null ) - fillQualsFromPileup(vc.getAlleles(), pileup, refQuals, altQuals); - } - } - } - - if ( refQuals.isEmpty() && altQuals.isEmpty() ) - return null; - - final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering); - for (final Double qual : altQuals) { - mannWhitneyU.add(qual, 
MannWhitneyU.USet.SET1); - } - for (final Double qual : refQuals) { - mannWhitneyU.add(qual, MannWhitneyU.USet.SET2); - } - - if (DEBUG) { - System.out.format("%s, REF QUALS:", this.getClass().getName()); - for (final Double qual : refQuals) - System.out.format("%4.1f ", qual); - System.out.println(); - System.out.format("%s, ALT QUALS:", this.getClass().getName()); - for (final Double qual : altQuals) - System.out.format("%4.1f ", qual); - System.out.println(); - - } - // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases) - final Pair testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1); - - final Map map = new HashMap<>(); - if (!Double.isNaN(testResults.first)) - map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); - return map; - } - - private void fillQualsFromPileup(final List alleles, - final ReadBackedPileup pileup, - final List refQuals, - final List altQuals) { - for ( final PileupElement p : pileup ) { - if ( isUsableBase(p) ) { - final Double value = getElementForPileupElement(p); - if ( value == null ) - continue; - - if ( alleles.get(0).equals(Allele.create(p.getBase(), true)) ) - refQuals.add(value); - else if ( alleles.contains(Allele.create(p.getBase())) ) - altQuals.add(value); - } - } - } - - private void fillQualsFromLikelihoodMap(final List alleles, - final int refLoc, - final PerReadAlleleLikelihoodMap likelihoodMap, - final List refQuals, - final List altQuals) { - for ( final Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet() ) { - final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); - if ( ! 
a.isInformative() ) - continue; // read is non-informative - - final GATKSAMRecord read = el.getKey(); - if ( isUsableRead(read, refLoc) ) { - final Double value = getElementForRead(read, refLoc, a); - if ( value == null ) - continue; - - if ( a.getMostLikelyAllele().isReference() ) - refQuals.add(value); - else if ( alleles.contains(a.getMostLikelyAllele()) ) - altQuals.add(value); - } - } - } - - /** - * Get the element for the given read at the given reference position - * - * @param read the read - * @param refLoc the reference position - * @param mostLikelyAllele the most likely allele for this read - * @return a Double representing the element to be used in the rank sum test, or null if it should not be used - */ - protected Double getElementForRead(final GATKSAMRecord read, final int refLoc, final MostLikelyAllele mostLikelyAllele) { - return getElementForRead(read, refLoc); - } - - /** - * Get the element for the given read at the given reference position - * - * @param read the read - * @param refLoc the reference position - * @return a Double representing the element to be used in the rank sum test, or null if it should not be used - */ - protected abstract Double getElementForRead(final GATKSAMRecord read, final int refLoc); - - // TODO -- until the ReadPosRankSumTest stops treating these differently, we need to have separate methods for GATKSAMRecords and PileupElements. Yuck. - - /** - * Get the element for the given read at the given reference position - * - * By default this function returns null, indicating that the test doesn't support the old style of pileup calculations - * - * @param p the pileup element - * @return a Double representing the element to be used in the rank sum test, or null if it should not be used - */ - protected Double getElementForPileupElement(final PileupElement p) { - // does not work in pileup mode - return null; - } - - /** - * Can the base in this pileup element be used in comparative tests between ref / alt bases? 
- * - * Note that this function by default does not allow deletion pileup elements - * - * @param p the pileup element to consider - * @return true if this base is part of a meaningful read for comparison, false otherwise - */ - protected boolean isUsableBase(final PileupElement p) { - return !(p.isDeletion() || - p.getMappingQual() == 0 || - p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || - ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE || // need the unBAQed quality score here - p.getRead().isReducedRead() ); - } - - /** - * Can the read be used in comparative tests between ref / alt bases? - * - * @param read the read to consider - * @param refLoc the reference location - * @return true if this read is meaningful for comparison, false otherwise - */ - protected boolean isUsableRead(final GATKSAMRecord read, final int refLoc) { - return !( read.getMappingQuality() == 0 || - read.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || - read.isReducedRead() ); - } - - /** - * Initialize the rank sum test annotation using walker and engine information. Right now this checks to see if - * engine randomization is turned off, and if so does not dither. - * @param walker the walker - * @param toolkit the GATK engine - * @param headerLines the header lines - */ - public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set headerLines ) { - useDithering = ! 
toolkit.getArguments().disableDithering; - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java deleted file mode 100644 index dd57c8ac6..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java +++ /dev/null @@ -1,107 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - - -/** - * Fraction of reads containing spanning deletions at this site - * - *

Note that this annotation is currently not compatible with HaplotypeCaller.

- */ -public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation { - - public Map annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc, - final Map stratifiedPerReadAlleleLikelihoodMap) { - if ( stratifiedContexts.size() == 0 ) - return null; - - // not meaningful when we're at an indel location: deletions that start at location N are by definition called at the position N-1, and at position N-1 - // there are no informative deletions in the pileup - if (!vc.isSNP()) - return null; - - int deletions = 0; - int depth = 0; - for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - for ( final PileupElement p : sample.getValue().getBasePileup() ) { - final int actualSampleDepth = p.getRepresentativeCount(); - depth += actualSampleDepth; - if ( p.isDeletion() ) - deletions += actualSampleDepth; - } - } - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.2f", depth == 0 ? 0.0 : (double)deletions/(double)depth)); - return map; - } - - public List getKeyNames() { return Arrays.asList("Dels"); } - - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("Dels", 1, VCFHeaderLineType.Float, "Fraction of Reads Containing Spanning Deletions")); } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java deleted file mode 100644 index fde344e9f..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java +++ /dev/null @@ -1,100 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypeBuilder; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; - -import java.util.*; - -/** - * Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias - * User: rpoplin - * Date: 8/28/13 - */ - -public class StrandBiasBySample extends GenotypeAnnotation implements ExperimentalAnnotation { - - public final static String STRAND_BIAS_BY_SAMPLE_KEY_NAME = "SB"; - - @Override - public void annotate(final 
RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final AlignmentContext stratifiedContext, - final VariantContext vc, - final Genotype g, - final GenotypeBuilder gb, - final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { - if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) - return; - - if (alleleLikelihoodMap == null ) - throw new IllegalStateException("StrandBiasBySample can only be used with likelihood based annotations in the HaplotypeCaller"); - - final int[][] table = FisherStrand.getContingencyTable(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc); - - gb.attribute(STRAND_BIAS_BY_SAMPLE_KEY_NAME, FisherStrand.getContingencyArray(table)); - } - - @Override - public List getKeyNames() { return Collections.singletonList(STRAND_BIAS_BY_SAMPLE_KEY_NAME); } - - @Override - public List getDescriptions() { return Collections.singletonList(new VCFFormatHeaderLine(getKeyNames().get(0), 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")); } - -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java deleted file mode 100644 index 3882b70fa..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ /dev/null @@ -1,539 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.bqsr; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.CigarElement; -import net.sf.samtools.SAMFileHeader; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.commandline.Advanced; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.*; -import org.broadinstitute.sting.gatk.iterators.ReadTransformer; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import 
org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.recalibration.*; -import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -import java.io.File; -import java.io.IOException; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** - * First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context). - * - *

- * This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating - * only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative - * of poor base quality. This walker generates tables based on various user-specified covariates (such as read group, - * reported quality score, cycle, and context). Since there is a large amount of data one can then calculate an empirical - * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations. - * The output file is a table (of the several covariate values, num observations, num mismatches, empirical quality score). - *

- * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified. - * - *

- * - *

Input

- *

- * The input read data whose base quality scores need to be assessed. - *

- * A database of known polymorphic sites to skip over. - *

- * - *

Output

- *

- * A GATK Report file with many tables: - *

    - *
  1. The list of arguments
  2. - *
  3. The quantized qualities table
  4. - *
  5. The recalibration table by read group
  6. - *
  7. The recalibration table by quality score
  8. - *
  9. The recalibration table for all the optional covariates
  10. - *
- * - * The GATK Report is intended to be easy to read by humans or computers. Check out the documentation of the GATKReport to learn how to manipulate this table. - *

- * - *

Examples

- *
- * java -Xmx4g -jar GenomeAnalysisTK.jar \
- *   -T BaseRecalibrator \
- *   -I my_reads.bam \
- *   -R resources/Homo_sapiens_assembly18.fasta \
- *   -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
- *   -knownSites another/optional/setOfSitesToMask.vcf \
- *   -o recal_data.table
- * 
- */ - -@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class}) -@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) -@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) -@PartitionBy(PartitionType.READ) -public class BaseRecalibrator extends ReadWalker implements NanoSchedulable { - /** - * all the command line arguments for BQSR and it's covariates - */ - @ArgumentCollection - private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - - /** - * When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency - * purposes. If you have many covariates, and therefore are using a lot of memory, you can use this flag - * to safely access only one table. There may be some CPU cost, but as long as the table is really big - * there should be relatively little CPU costs. - */ - @Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false) - public boolean lowMemoryMode = false; - - @Advanced - @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 
30 is perhaps better for whole genome call sets", required = false) - public double BAQGOP = BAQ.DEFAULT_GOP; - - /** - * an object that keeps track of the information necessary for quality score quantization - */ - private QuantizationInfo quantizationInfo; - - /** - * list to hold the all the covariate objects that were requested (required + standard + experimental) - */ - private Covariate[] requestedCovariates; - - private RecalibrationEngine recalibrationEngine; - - private int minimumQToUse; - - private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation."; - - private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector - private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation - private final static byte NO_BAQ_UNCERTAINTY = (byte)'@'; - - /** - * Parse the -cov arguments and create a list of covariates to be used here - * Based on the covariates' estimates for initial capacity allocate the data hashmap - */ - public void initialize() { - baq = new BAQ(BAQGOP); // setup the BAQ object with the provided gap open penalty - - if (RAC.FORCE_PLATFORM != null) - RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; - - if (RAC.knownSites.isEmpty() && !RAC.RUN_WITHOUT_DBSNP) // Warn the user if no dbSNP file or other variant mask was specified - throw new UserException.CommandLineException(NO_DBSNP_EXCEPTION); - - if (RAC.LIST_ONLY) { - RecalUtils.listAvailableCovariates(logger); - System.exit(0); - } - RAC.existingRecalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table - - Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates - ArrayList requiredCovariates = covariates.getFirst(); - ArrayList 
optionalCovariates = covariates.getSecond(); - - requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; - int covariateIndex = 0; - for (final Covariate covariate : requiredCovariates) - requestedCovariates[covariateIndex++] = covariate; - for (final Covariate covariate : optionalCovariates) - requestedCovariates[covariateIndex++] = covariate; - - logger.info("The covariates being used here: "); - for (Covariate cov : requestedCovariates) { // list all the covariates being used - logger.info("\t" + cov.getClass().getSimpleName()); - cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection - } - - try { - RAC.RECAL_TABLE = new PrintStream(RAC.RECAL_TABLE_FILE); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_TABLE_FILE, e); - } - - initializeRecalibrationEngine(); - RecalUtils.checkForInvalidRecalBams(getToolkit().getSAMFileHeaders(), getToolkit().getArguments().ALLOW_BQSR_ON_REDUCED_BAMS); - minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN; - referenceReader = getToolkit().getReferenceDataSource().getReference(); - } - - /** - * Initialize the recalibration engine - */ - private void initializeRecalibrationEngine() { - int numReadGroups = 0; - for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) - numReadGroups += header.getReadGroups().size(); - - recalibrationEngine = new RecalibrationEngine(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG, lowMemoryMode); - } - - private boolean isLowQualityBase( final GATKSAMRecord read, final int offset ) { - return read.getBaseQualities()[offset] < minimumQToUse; - } - - /** - * For each read at this locus get the various covariate values and increment that location in the map based on - * whether or not the base matches the reference at this particular location - */ - public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final 
RefMetaDataTracker metaDataTracker ) { - - final GATKSAMRecord read = ReadClipper.hardClipSoftClippedBases( ReadClipper.hardClipAdaptorSequence(originalRead) ); - if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it - - RecalUtils.parsePlatformForRead(read, RAC); - if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { // parse the solid color space and check for color no-calls - return 0L; // skip this read completely - } - - final int[] isSNP = calculateIsSNP(read, ref, originalRead); - final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION); - final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION); - final int nErrors = nEvents(isSNP, isInsertion, isDeletion); - - // note for efficiency regions we don't compute the BAQ array unless we actually have - // some error to marginalize over. For ILMN data ~85% of reads have no error - final byte[] baqArray = nErrors == 0 ? flatBAQArray(read) : calculateBAQArray(read); - - if( baqArray != null ) { // some reads just can't be BAQ'ed - final ReadCovariates covariates = RecalUtils.computeCovariates(read, requestedCovariates); - final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases - final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray); - final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray); - final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray); - - // aggregate all of the info into our info object, and update the data - final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skip, snpErrors, insertionErrors, deletionErrors); - recalibrationEngine.updateDataForRead(info); - return 1L; - } else { - return 0L; - } - } - - /** - * Compute the number of mutational events across all hasEvent vectors - * - * Simply the sum of entries in 
hasEvents - * - * @param hasEvents a vector a vectors of 0 (no event) and 1 (has event) - * @return the total number of events across all hasEvent arrays - */ - protected static int nEvents(final int[]... hasEvents) { - int n = 0; - for ( final int[] hasEvent : hasEvents ) { - n += MathUtils.sum(hasEvent); - } - return n; - } - - protected boolean[] calculateSkipArray( final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) { - final byte[] bases = read.getReadBases(); - final boolean[] skip = new boolean[bases.length]; - final boolean[] knownSites = calculateKnownSites(read, metaDataTracker.getValues(RAC.knownSites)); - for( int iii = 0; iii < bases.length; iii++ ) { - skip[iii] = !BaseUtils.isRegularBase(bases[iii]) || isLowQualityBase(read, iii) || knownSites[iii] || badSolidOffset(read, iii); - } - return skip; - } - - protected boolean badSolidOffset( final GATKSAMRecord read, final int offset ) { - return ReadUtils.isSOLiDRead(read) && RAC.SOLID_RECAL_MODE != RecalUtils.SOLID_RECAL_MODE.DO_NOTHING && !RecalUtils.isColorSpaceConsistent(read, offset); - } - - protected static boolean[] calculateKnownSites( final GATKSAMRecord read, final List features ) { - final int readLength = read.getReadBases().length; - final boolean[] knownSites = new boolean[readLength]; - Arrays.fill(knownSites, false); - for( final Feature f : features ) { - int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here? 
- if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { - featureStartOnRead = 0; - } - - int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true); - if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { - featureEndOnRead = readLength; - } - - if( featureStartOnRead > readLength ) { - featureStartOnRead = featureEndOnRead = readLength; - } - - Arrays.fill(knownSites, Math.max(0, featureStartOnRead), Math.min(readLength, featureEndOnRead + 1), true); - } - return knownSites; - } - - // BUGBUG: can be merged with calculateIsIndel - protected static int[] calculateIsSNP( final GATKSAMRecord read, final ReferenceContext ref, final GATKSAMRecord originalRead ) { - final byte[] readBases = read.getReadBases(); - final byte[] refBases = Arrays.copyOfRange(ref.getBases(), read.getAlignmentStart() - originalRead.getAlignmentStart(), ref.getBases().length + read.getAlignmentEnd() - originalRead.getAlignmentEnd()); - final int[] snp = new int[readBases.length]; - int readPos = 0; - int refPos = 0; - for ( final CigarElement ce : read.getCigar().getCigarElements() ) { - final int elementLength = ce.getLength(); - switch (ce.getOperator()) { - case M: - case EQ: - case X: - for( int iii = 0; iii < elementLength; iii++ ) { - snp[readPos] = ( BaseUtils.basesAreEqual(readBases[readPos], refBases[refPos]) ? 0 : 1 ); - readPos++; - refPos++; - } - break; - case D: - case N: - refPos += elementLength; - break; - case I: - case S: // ReferenceContext doesn't have the soft clipped bases! 
- readPos += elementLength; - break; - case H: - case P: - break; - default: - throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); - } - } - return snp; - } - - protected static int[] calculateIsIndel( final GATKSAMRecord read, final EventType mode ) { - final int[] indel = new int[read.getReadBases().length]; - int readPos = 0; - for ( final CigarElement ce : read.getCigar().getCigarElements() ) { - final int elementLength = ce.getLength(); - switch (ce.getOperator()) { - case M: - case EQ: - case X: - case S: - { - readPos += elementLength; - break; - } - case D: - { - final int index = ( read.getReadNegativeStrandFlag() ? readPos : readPos - 1 ); - updateIndel(indel, index, mode, EventType.BASE_DELETION); - break; - } - case I: - { - final boolean forwardStrandRead = !read.getReadNegativeStrandFlag(); - if( forwardStrandRead ) { - updateIndel(indel, readPos - 1, mode, EventType.BASE_INSERTION); - } - readPos += elementLength; - if( !forwardStrandRead ) { - updateIndel(indel, readPos, mode, EventType.BASE_INSERTION); - } - break; - } - case N: - case H: - case P: - break; - default: - throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); - } - } - return indel; - } - - private static void updateIndel(final int[] indel, final int index, final EventType mode, final EventType requiredMode) { - if ( mode == requiredMode && index >= 0 && index < indel.length ) - // protect ourselves from events at the start or end of the read (1D3M or 3M1D) - indel[index] = 1; - } - - protected static double[] calculateFractionalErrorArray( final int[] errorArray, final byte[] baqArray ) { - if(errorArray.length != baqArray.length ) { - throw new ReviewedStingException("Array length mismatch detected. 
Malformed read?"); - } - - final int BLOCK_START_UNSET = -1; - - final double[] fractionalErrors = new double[baqArray.length]; - Arrays.fill(fractionalErrors, 0.0); - boolean inBlock = false; - int blockStartIndex = BLOCK_START_UNSET; - int iii; - for( iii = 0; iii < fractionalErrors.length; iii++ ) { - if( baqArray[iii] == NO_BAQ_UNCERTAINTY ) { - if( !inBlock ) { - fractionalErrors[iii] = (double) errorArray[iii]; - } else { - calculateAndStoreErrorsInBlock(iii, blockStartIndex, errorArray, fractionalErrors); - inBlock = false; // reset state variables - blockStartIndex = BLOCK_START_UNSET; // reset state variables - } - } else { - inBlock = true; - if( blockStartIndex == BLOCK_START_UNSET ) { blockStartIndex = iii; } - } - } - if( inBlock ) { - calculateAndStoreErrorsInBlock(iii-1, blockStartIndex, errorArray, fractionalErrors); - } - if( fractionalErrors.length != errorArray.length ) { - throw new ReviewedStingException("Output array length mismatch detected. Malformed read?"); - } - return fractionalErrors; - } - - private static void calculateAndStoreErrorsInBlock( final int iii, - final int blockStartIndex, - final int[] errorArray, - final double[] fractionalErrors ) { - int totalErrors = 0; - for( int jjj = Math.max(0,blockStartIndex-1); jjj <= iii; jjj++ ) { - totalErrors += errorArray[jjj]; - } - for( int jjj = Math.max(0, blockStartIndex-1); jjj <= iii; jjj++ ) { - fractionalErrors[jjj] = ((double) totalErrors) / ((double)(iii - Math.max(0,blockStartIndex-1) + 1)); - } - } - - /** - * Create a BAQ style array that indicates no alignment uncertainty - * @param read the read for which we want a BAQ array - * @return a BAQ-style non-null byte[] counting NO_BAQ_UNCERTAINTY values - * // TODO -- could be optimized avoiding this function entirely by using this inline if the calculation code above - */ - protected static byte[] flatBAQArray(final GATKSAMRecord read) { - final byte[] baq = new byte[read.getReadLength()]; - Arrays.fill(baq, NO_BAQ_UNCERTAINTY); 
- return baq; - } - - /** - * Compute an actual BAQ array for read, based on its quals and the reference sequence - * @param read the read to BAQ - * @return a non-null BAQ tag array for read - */ - private byte[] calculateBAQArray( final GATKSAMRecord read ) { - baq.baqRead(read, referenceReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.ADD_TAG); - return BAQ.getBAQTag(read); - } - - /** - * Initialize the reduce step by returning 0L - * - * @return returns 0L - */ - public Long reduceInit() { - return 0L; - } - - /** - * The Reduce method doesn't do anything for this walker. - * - * @param mapped Result of the map. This value is immediately ignored. - * @param sum The summing CountedData used to output the CSV data - * @return returns The sum used to output the CSV data - */ - public Long reduce(Long mapped, Long sum) { - sum += mapped; - return sum; - } - - @Override - public void onTraversalDone(Long result) { - recalibrationEngine.finalizeData(); - - logger.info("Calculating quantized quality scores..."); - quantizeQualityScores(); - - logger.info("Writing recalibration report..."); - generateReport(); - logger.info("...done!"); - - logger.info("BaseRecalibrator was able to recalibrate " + result + " reads"); - } - - private RecalibrationTables getRecalibrationTable() { - return recalibrationEngine.getFinalRecalibrationTables(); - } - - /** - * go through the quality score table and use the # observations and the empirical quality score - * to build a quality score histogram for quantization. 
Then use the QuantizeQual algorithm to - * generate a quantization map (recalibrated_qual -> quantized_qual) - */ - private void quantizeQualityScores() { - quantizationInfo = new QuantizationInfo(getRecalibrationTable(), RAC.QUANTIZING_LEVELS); - } - - private void generateReport() { - RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, getRecalibrationTable(), requestedCovariates, RAC.SORT_BY_ALL_COLUMNS); - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java deleted file mode 100644 index 271617059..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java +++ /dev/null @@ -1,141 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.bqsr; - -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.report.GATKReport; -import org.broadinstitute.sting.gatk.report.GATKReportTable; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.recalibration.*; - -import java.io.*; - -/** - * Evaluate the performance of the base recalibration process - * - *

This tool aims to evaluate the results of the Base Quality Score Recalibration (BQSR) process.

- * - *

Caveat

- *

This tool is currently experimental. We do not provide documentation nor support for its operation.

- * - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) -@PartitionBy(PartitionType.READ) -public class RecalibrationPerformance extends RodWalker implements NanoSchedulable { - - @Output - public PrintStream out; - - @Input(fullName="recal", shortName="recal", required=false, doc="The input covariates table file") - public File RECAL_FILE = null; - - public void initialize() { - out.println("Cycle\tQrep\tQemp\tIsJoint\tObservations\tErrors"); - - final GATKReport report = new GATKReport(RECAL_FILE); - final GATKReportTable table = report.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE); - for ( int row = 0; row < table.getNumRows(); row++ ) { - - final int nObservations = (int)asDouble(table.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME)); - final int nErrors = (int)Math.round(asDouble(table.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME))); - final double empiricalQuality = asDouble(table.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME)); - - final byte QReported = Byte.parseByte((String) table.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME)); - - final double jointEstimateQemp = RecalDatum.bayesianEstimateOfEmpiricalQuality(nObservations, nErrors, QReported); - - //if ( Math.abs((int)(jointEstimateQemp - empiricalQuality)) > 1 ) - // System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f, point Qemp = %f, joint Qemp = %f", estimatedQReported, nObservations, nErrors, empiricalQuality, jointEstimateQemp)); - - if ( table.get(row, RecalUtils.COVARIATE_NAME_COLUMN_NAME).equals("Cycle") && - table.get(row, RecalUtils.EVENT_TYPE_COLUMN_NAME).equals("M") && - table.get(row, RecalUtils.READGROUP_COLUMN_NAME).equals("20FUKAAXX100202.6") && - (QReported == 6 || QReported 
== 10 || QReported == 20 || QReported == 30 || QReported == 45) ) { - out.println(String.format("%s\t%d\t%d\t%s\t%d\t%d", table.get(row, RecalUtils.COVARIATE_VALUE_COLUMN_NAME), QReported, Math.round(empiricalQuality), "False", (int)nObservations, (int)nErrors)); - out.println(String.format("%s\t%d\t%d\t%s\t%d\t%d", table.get(row, RecalUtils.COVARIATE_VALUE_COLUMN_NAME), QReported, (int)jointEstimateQemp, "True", (int)nObservations, (int)nErrors)); - } - } - - } - - @Override - public boolean isDone() { - return true; - } - - private double asDouble(final Object o) { - if ( o instanceof Double ) - return (Double)o; - else if ( o instanceof Integer ) - return (Integer)o; - else if ( o instanceof Long ) - return (Long)o; - else - throw new ReviewedStingException("Object " + o + " is expected to be either a double, long or integer but its not either: " + o.getClass()); - } - - @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return 0; } - - @Override - public Integer reduceInit() { return 0; } - - @Override - public Integer reduce(Integer counter, Integer sum) { return 0; } - - @Override - public void onTraversalDone(Integer sum) {} -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java deleted file mode 100644 index 28a48c212..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java +++ /dev/null @@ -1,207 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -/** - * An object that keeps track of the base counts as well as the sum of the base, insertion and deletion qualities of each base. - * - * @author Mauricio Carneiro - * @since 6/15/12 - */ -public class BaseAndQualsCounts extends BaseCounts { - - private long sumInsertionQual_A = 0; - private long sumDeletionQual_A = 0; - private long sumInsertionQual_C = 0; - private long sumDeletionQual_C = 0; - private long sumInsertionQual_G = 0; - private long sumDeletionQual_G = 0; - private long sumInsertionQual_T = 0; - private long sumDeletionQual_T = 0; - private long sumInsertionQual_D = 0; - private long sumDeletionQual_D = 0; - private long sumInsertionQual_I = 0; - private long sumDeletionQual_I = 0; - private long sumInsertionQual_N = 0; - private long sumDeletionQual_N = 0; - - /* - * Increments the count - * - * @param base the base - * @param baseQual the base quality - * @param insQual the insertion quality - * @param delQual the deletion quality - * @param baseMappingQual the mapping quality - * @param isLowQualBase true if the base is low quality - */ - public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean 
isLowQualBase) { - incr(base, baseQual, insQual, delQual, baseMappingQual, isLowQualBase, false); - } - - /* - * Increments the count - * - * @param base the base - * @param baseQual the base quality - * @param insQual the insertion quality - * @param delQual the deletion quality - * @param baseMappingQual the mapping quality - * @param isLowQualBase true if the base is low quality - * @param isSoftClip true if is soft-clipped - */ - public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase, final boolean isSoftClip) { - // if we already have high quality bases, ignore low quality ones - if ( isLowQualBase && !isLowQuality() ) - return; - - // if this is a high quality base then remove any low quality bases and start from scratch - if ( !isLowQualBase && isLowQuality() ) { - if ( totalCount() > 0 ) - clear(); - setLowQuality(false); - } - - final BaseIndex i = BaseIndex.byteToBase(base); - super.incr(i, baseQual, baseMappingQual, isSoftClip); - switch (i) { - case A: sumInsertionQual_A += insQual; sumDeletionQual_A += delQual; break; - case C: sumInsertionQual_C += insQual; sumDeletionQual_C += delQual; break; - case G: sumInsertionQual_G += insQual; sumDeletionQual_G += delQual; break; - case T: sumInsertionQual_T += insQual; sumDeletionQual_T += delQual; break; - case D: sumInsertionQual_D += insQual; sumDeletionQual_D += delQual; break; - case I: sumInsertionQual_I += insQual; sumDeletionQual_I += delQual; break; - case N: sumInsertionQual_N += insQual; sumDeletionQual_N += delQual; break; - } - } - - /* - * Decrements the count - * - * @param base the base - * @param baseQual the base quality - * @param insQual the insertion quality - * @param delQual the deletion quality - * @param baseMappingQual the mapping quality - * @param isLowQualBase true if the base is low quality - */ - public void decr(final byte base, final byte baseQual, final byte insQual, final byte 
delQual, final int baseMappingQual, final boolean isLowQualBase) { - decr(base, baseQual, insQual, delQual, baseMappingQual, isLowQualBase, false); - } - - /* - * Decrements the count - * - * @param base the base - * @param baseQual the base quality - * @param insQual the insertion quality - * @param delQual the deletion quality - * @param baseMappingQual the mapping quality - * @param isLowQualBase true if the base is low quality - * @param isSoftClip true if is soft-clipped - */ - public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase, final boolean isSoftClip) { - // if this is not the right type of base, ignore it - if ( isLowQualBase != isLowQuality() ) - return; - - final BaseIndex i = BaseIndex.byteToBase(base); - super.decr(i, baseQual, baseMappingQual, isSoftClip); - switch (i) { - case A: sumInsertionQual_A -= insQual; sumDeletionQual_A -= delQual; break; - case C: sumInsertionQual_C -= insQual; sumDeletionQual_C -= delQual; break; - case G: sumInsertionQual_G -= insQual; sumDeletionQual_G -= delQual; break; - case T: sumInsertionQual_T -= insQual; sumDeletionQual_T -= delQual; break; - case D: sumInsertionQual_D -= insQual; sumDeletionQual_D -= delQual; break; - case I: sumInsertionQual_I -= insQual; sumDeletionQual_I -= delQual; break; - case N: sumInsertionQual_N -= insQual; sumDeletionQual_N -= delQual; break; - } - } - - public byte averageInsertionQualsOfBase(final BaseIndex base) { - return (byte) (getInsertionQual(base) / countOfBase(base)); - } - - public byte averageDeletionQualsOfBase(final BaseIndex base) { - return (byte) (getDeletionQual(base) / countOfBase(base)); - } - - private long getInsertionQual(final BaseIndex base) { - switch (base) { - case A: return sumInsertionQual_A; - case C: return sumInsertionQual_C; - case G: return sumInsertionQual_G; - case T: return sumInsertionQual_T; - case D: return sumInsertionQual_D; - case I: return 
sumInsertionQual_I; - case N: return sumInsertionQual_N; - default: throw new IllegalArgumentException(base.name()); - } - } - - private long getDeletionQual(final BaseIndex base) { - switch (base) { - case A: return sumDeletionQual_A; - case C: return sumDeletionQual_C; - case G: return sumDeletionQual_G; - case T: return sumDeletionQual_T; - case D: return sumDeletionQual_D; - case I: return sumDeletionQual_I; - case N: return sumDeletionQual_N; - default: throw new IllegalArgumentException(base.name()); - } - } - - /** - * Clears out all stored data in this object - */ - public void clear() { - super.clear(); - sumInsertionQual_A = sumInsertionQual_C = sumInsertionQual_G = sumInsertionQual_T = sumInsertionQual_D = sumInsertionQual_I = sumInsertionQual_N = 0; - sumDeletionQual_A = sumDeletionQual_C = sumDeletionQual_G = sumDeletionQual_T = sumDeletionQual_D = sumDeletionQual_I = sumDeletionQual_N = 0; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java deleted file mode 100644 index e1329db3b..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ /dev/null @@ -1,411 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import it.unimi.dsi.fastutil.ints.IntArrayList; -import org.broadinstitute.sting.utils.MathUtils; - - -/** - * An object to keep track of the number of occurrences of each base and it's quality. 
- * - * User: depristo - * Date: 4/8/11 - * Time: 2:55 PM - */ - - public class BaseCounts { - public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.N; - public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte(); - - - private int count_A = 0; // keeps track of the base counts - private int sumQual_A = 0; // keeps track of the quals of each base - private int count_C = 0; - private int sumQual_C = 0; - private int count_G = 0; - private int sumQual_G = 0; - private int count_T = 0; - private int sumQual_T = 0; - private int count_D = 0; - private int sumQual_D = 0; - private int count_I = 0; - private int sumQual_I = 0; - private int count_N = 0; - private int sumQual_N = 0; - private int totalCount = 0; // keeps track of total count since this is requested so often - private int nSoftClippedBases = 0; - private final IntArrayList mappingQualities = new IntArrayList(); // keeps the mapping quality of each read that contributed to this - private boolean isLowQuality = true; // this object represents low quality bases unless we are told otherwise - - - public static BaseCounts createWithCounts(int[] countsACGT) { - BaseCounts baseCounts = new BaseCounts(); - baseCounts.count_A = countsACGT[0]; - baseCounts.count_C = countsACGT[1]; - baseCounts.count_G = countsACGT[2]; - baseCounts.count_T = countsACGT[3]; - baseCounts.totalCount = countsACGT[0] + countsACGT[1] + countsACGT[2] + countsACGT[3]; - return baseCounts; - } - - @Requires("other != null") - public void add(final BaseCounts other) { - this.count_A += other.count_A; - this.count_C += other.count_C; - this.count_G += other.count_G; - this.count_T += other.count_T; - this.count_D += other.count_D; - this.count_I += other.count_I; - this.count_N += other.count_N; - this.totalCount += other.totalCount; - this.nSoftClippedBases = other.nSoftClippedBases; - this.mappingQualities.addAll(other.mappingQualities); - } - - @Requires("other != null") - public void 
sub(final BaseCounts other) { - this.count_A -= other.count_A; - this.count_C -= other.count_C; - this.count_G -= other.count_G; - this.count_T -= other.count_T; - this.count_D -= other.count_D; - this.count_I -= other.count_I; - this.count_N -= other.count_N; - this.totalCount -= other.totalCount; - this.nSoftClippedBases -= other.nSoftClippedBases; - this.mappingQualities.removeAll(other.mappingQualities); - } - - @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") - public void incr(final byte base) { - add(BaseIndex.byteToBase(base), 1); - } - - @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") - public void incr(final BaseIndex base, final byte qual, final int mappingQuality, final boolean isSoftclip) { - switch (base) { - case A: ++count_A; sumQual_A += qual; break; - case C: ++count_C; sumQual_C += qual; break; - case G: ++count_G; sumQual_G += qual; break; - case T: ++count_T; sumQual_T += qual; break; - case D: ++count_D; sumQual_D += qual; break; - case I: ++count_I; sumQual_I += qual; break; - case N: ++count_N; sumQual_N += qual; break; - } - ++totalCount; - nSoftClippedBases += isSoftclip ? 
1 : 0; - mappingQualities.add(mappingQuality); - } - - @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") - public void decr(final byte base) { - add(BaseIndex.byteToBase(base), -1); - } - - private void add(final BaseIndex base, int amount) { - switch(base) { - case A: count_A += amount; break; - case C: count_C += amount; break; - case G: count_G += amount; break; - case T: count_T += amount; break; - case D: count_D += amount; break; - case I: count_I += amount; break; - case N: count_N += amount; break; - } - totalCount += amount; - } - - @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") - public void decr(final BaseIndex base, final byte qual, final int mappingQuality, final boolean isSoftclip) { - switch (base) { - case A: --count_A; sumQual_A -= qual; break; - case C: --count_C; sumQual_C -= qual; break; - case G: --count_G; sumQual_G -= qual; break; - case T: --count_T; sumQual_T -= qual; break; - case D: --count_D; sumQual_D -= qual; break; - case I: --count_I; sumQual_I -= qual; break; - case N: --count_N; sumQual_N -= qual; break; - } - --totalCount; - nSoftClippedBases -= isSoftclip ? 
1 : 0; - mappingQualities.remove((Integer) mappingQuality); - } - - @Ensures("result >= 0") - public long getSumQuals(final byte base) { - return getSumQuals(BaseIndex.byteToBase(base)); - } - - @Ensures("result >= 0") - public long getSumQuals(final BaseIndex base) { - switch (base) { - case A: return sumQual_A; - case C: return sumQual_C; - case G: return sumQual_G; - case T: return sumQual_T; - case D: return sumQual_D; - case I: return sumQual_I; - case N: return sumQual_N; - default: throw new IllegalArgumentException(base.name()); - } - } - - @Ensures("result >= 0") - public byte averageQuals(final byte base) { - return averageQuals(BaseIndex.byteToBase(base)); - } - - @Ensures("result >= 0") - public byte averageQuals(final BaseIndex base) { - return (byte) (getSumQuals(base) / countOfBase(base)); - } - - @Ensures("result >= 0") - public int countOfBase(final byte base) { - return countOfBase(BaseIndex.byteToBase(base)); - } - - @Ensures("result >= 0") - public int countOfBase(final BaseIndex base) { - switch (base) { - case A: return count_A; - case C: return count_C; - case G: return count_G; - case T: return count_T; - case D: return count_D; - case I: return count_I; - case N: return count_N; - default: throw new IllegalArgumentException(base.name()); - } - } - - @Ensures("result >= 0") - public long sumQualsOfBase(final BaseIndex base) { - return getSumQuals(base); - } - - @Ensures("result >= 0") - public byte averageQualsOfBase(final BaseIndex base) { - return (byte) (sumQualsOfBase(base) / countOfBase(base)); - } - - @Ensures("result >= 0") - public int nSoftclips() { - return nSoftClippedBases; - } - - @Ensures("result >= 0") - public int totalCount() { - return totalCount; - } - - /** - * The RMS of the mapping qualities of all reads that contributed to this object - * - * @return the RMS of the mapping qualities of all reads that contributed to this object - */ - public double getRMS() { - return MathUtils.rms(mappingQualities); - } - - /** - * 
Given a base , it returns the proportional count of this base compared to all other bases - * - * @param base base - * @return the proportion of this base over all other bases - */ - @Ensures({"result >=0.0", "result<= 1.0"}) - public double baseCountProportion(final byte base) { - return baseCountProportion(BaseIndex.byteToBase(base)); - } - - /** - * Given a base , it returns the proportional count of this base compared to all other bases - * - * @param baseIndex base - * @return the proportion of this base over all other bases - */ - @Ensures({"result >=0.0", "result<= 1.0"}) - public double baseCountProportion(final BaseIndex baseIndex) { - return (totalCount == 0) ? 0.0 : (double)countOfBase(baseIndex) / (double)totalCount; - } - - @Ensures("result != null") - public String toString() { - StringBuilder b = new StringBuilder(); - for (final BaseIndex i : BaseIndex.values()) { - b.append(i.toString()).append("=").append(countOfBase(i)).append(","); - } - return b.toString(); - } - - public byte baseWithMostCounts() { - return baseIndexWithMostCounts().getByte(); - } - - /** - * @return the base index for which the count is highest, including indel indexes - */ - @Ensures("result != null") - public BaseIndex baseIndexWithMostCounts() { - return baseIndexWithMostCounts(true); - } - - /** - * @return the base index for which the count is highest, excluding indel indexes - */ - @Ensures("result != null") - public BaseIndex baseIndexWithMostCountsWithoutIndels() { - return baseIndexWithMostCounts(false); - } - - /** - * Finds the base index with the most counts - * - * @param allowIndels should we allow base indexes representing indels? 
- * @return non-null base index - */ - @Ensures("result != null") - protected BaseIndex baseIndexWithMostCounts(final boolean allowIndels) { - BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; - int maxCount = countOfBase(maxI); - - for (final BaseIndex i : BaseIndex.values()) { - if ( !allowIndels && !i.isNucleotide() ) - continue; - - final int myCount = countOfBase(i); - if (myCount > maxCount) { - maxI = i; - maxCount = myCount; - } - } - return maxI; - } - - public byte baseWithMostProbability() { - return baseIndexWithMostProbability().getByte(); - } - - @Ensures("result != null") - public BaseIndex baseIndexWithMostProbability() { - return baseIndexWithMostProbability(true); - } - - @Ensures("result != null") - public BaseIndex baseIndexWithMostProbabilityWithoutIndels() { - return baseIndexWithMostProbability(false); - } - - /** - * Finds the base index with the most probability - * - * @param allowIndels should we allow base indexes representing indels? - * @return non-null base index - */ - @Ensures("result != null") - public BaseIndex baseIndexWithMostProbability(final boolean allowIndels) { - BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; - long maxSum = getSumQuals(maxI); - - for (final BaseIndex i : BaseIndex.values()) { - if ( !allowIndels && !i.isNucleotide() ) - continue; - - final long mySum = getSumQuals(i); - if (mySum > maxSum) { - maxI = i; - maxSum = mySum; - } - } - return (maxSum > 0L ? 
maxI : baseIndexWithMostCounts(allowIndels)); - } - - @Ensures("result >=0") - public int totalCountWithoutIndels() { - return totalCount - countOfBase(BaseIndex.D) - countOfBase(BaseIndex.I); - } - - /** - * Calculates the proportional count of a base compared to all other bases except indels (I and D) - * - * @param base base - * @return the proportion of this base over all other bases except indels - */ - @Requires("base.isNucleotide()") - @Ensures({"result >=0.0", "result<= 1.0"}) - public double baseCountProportionWithoutIndels(final BaseIndex base) { - final int total = totalCountWithoutIndels(); - return (total == 0) ? 0.0 : (double)countOfBase(base) / (double)total; - } - - /** - * @return true if this instance represents low quality bases - */ - public boolean isLowQuality() { return isLowQuality; } - - /** - * Sets the low quality value - * - * @param value true if this instance represents low quality bases false otherwise - */ - public void setLowQuality(final boolean value) { isLowQuality = value; } - - /** - * Clears out all stored data in this object - */ - public void clear() { - count_A = count_C = count_G = count_T = count_D = count_I = count_N = 0; - sumQual_A = sumQual_C = sumQual_G = sumQual_T = sumQual_D = sumQual_I = sumQual_N = 0; - totalCount = 0; - nSoftClippedBases = 0; - mappingQualities.clear(); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java deleted file mode 100644 index 665e3e7ce..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java +++ /dev/null @@ -1,136 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -/** - * Simple byte / base index conversions - * - * - * @author carneiro - * @since 8/26/11 - */ -public enum BaseIndex { - A ( 'A', 0 ), - C ( 'C', 1 ), - G ( 'G', 2 ), - T ( 'T', 3 ), - D ( 'D', 4 ), - I ( 'I', 5 ), // insertion to the right of the base - N ( 'N', 6 ); - - final byte b; - final int index; - - public byte getByte() { return b; } - - /** - * Ordinal is stored in SyntheticRead rather than enum to save object reference, and store as byte for compactness. - * It is stored as byte, and this method merely eliminates a cast. 
- */ - public byte getOrdinalByte() { return (byte)ordinal(); } - - private BaseIndex(char base, int index) { - this.b = (byte)base; - this.index = index; - } - - /** - * Converts a byte representation of a base to BaseIndex - * - * @param base the byte representation of the base - * @return the BaseIndex representation of the base; - */ - public static BaseIndex byteToBase(final byte base) { - switch (base) { - case 'A': - case 'a': - return A; - case 'C': - case 'c': - return C; - case 'G': - case 'g': - return G; - case 'T': - case 't': - return T; - case 'D': - case 'd': - case '-': - return D; - case 'I': - case 'i': - return I; - case 'N': - case 'n': - return N; - default: throw new ReviewedStingException("Tried to create a byte index for an impossible base " + base); - } - } - - /** - * Definition of a nucleotide for the BaseIndex is anything that has been read as a base - * by the machine (A,C,G,T), even if it couldn't tell which base it was, but it knows - * there is a base there (N). - * - * @return whether or not it is a nucleotide, given the definition above - */ - public final boolean isNucleotide() { - return !isIndel(); - } - - /** - * Whether or not this base is an insertion or a deletion - * - * @return true for I or D, false otherwise - */ - public final boolean isIndel() { - return this == D || this == I; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java deleted file mode 100644 index 36da92b4f..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java +++ /dev/null @@ -1,232 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; -import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter; -import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter; -import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.ReadFilters; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; - -import java.util.HashMap; -import java.util.Map; - -/** - * Given two BAMs with different read groups, it compares them based on ReduceReads metrics. - *

- * This is a test walker used for asserting that the ReduceReads procedure is not making blatant mistakes when compressing bam files. - *

- *

Input

- *

- * Two BAM files (using -I) with different read group IDs - *

- *

Output

- *

- * [Output description] - *

- *

Examples

- *
- *    java
- *      -jar GenomeAnalysisTK.jar
- *      -T $WalkerName
- *  
- * - * @author carneiro - * @since 10/30/11 - */ - -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class}) -public class CompareBAM extends LocusWalker, CompareBAM.TestResults> { - @Argument(required = true, shortName = "rr", fullName = "reduced_readgroup", doc = "The read group ID corresponding to the compressed BAM being tested") public String reducedReadGroupID; - @Argument(required = false, shortName = "teq", fullName = "test_equal_bases", doc = "Test if the bases marked as '=' are indeed ref bases.") public boolean TEST_EQUAL_BASES = false; - @Argument(required = false, shortName = "tbc", fullName = "test_base_counts", doc = "Test if the base counts tag in consensus reads are accurate.") public boolean TEST_BASE_COUNTS = false; - @Argument(required = false, shortName = "mbq", fullName = "min_base_qual", doc = "Minimum base quality to be considered.") public int MIN_BASE_QUAL = 20; - @Argument(required = false, shortName = "mmq", fullName = "min_mapping_qual", doc = "Minimum mapping quality to be considered.") public int MIN_MAPPING_QUAL = 20; - - - @Override - public Map map (RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - Map result = new HashMap(); - - if (TEST_EQUAL_BASES) result.put(TestName.EQUAL_BASES, testEqualBases(ref, context)); - if (TEST_BASE_COUNTS) result.put(TestName.BASE_COUNTS, testBaseCounts(ref, context)); - - return result; - } - - @Override - public TestResults reduceInit () { - TestResults sum = new TestResults(); // a fresh new TestResults object to sum up the results of every object passed by MAP. 
- - if (TEST_EQUAL_BASES) sum.createTest(TestName.EQUAL_BASES); - if (TEST_BASE_COUNTS) sum.createTest(TestName.BASE_COUNTS); - - return sum; - } - - @Override - public TestResults reduce (Map mapResult, TestResults sum) { - for (TestName test : mapResult.keySet()) { - if (mapResult.get(test)) - sum.reportSuccess(test); - else - sum.reportFailed(test); - } - - return sum; - } - - public void onTraversalDone (TestResults finalResults) { - finalResults.report(); - } - - private boolean testEqualBases (ReferenceContext ref, AlignmentContext context) { - return true; - } - - private boolean testBaseCounts (ReferenceContext ref, AlignmentContext context) { - - return true; - } - - public enum TestName { - EQUAL_BASES ("testEqualBases"), - BASE_COUNTS ("testBaseCounts"); - - private String testName; - - TestName(String testName) { - this.testName = testName; - } - - public String getTestName() { - return testName; - } - } - - public class TestResults { - private Map testStats = new HashMap(); - - public void createTest (TestName test) { - testStats.put(test, new TestOutcome()); - } - - public void reportSuccess(TestName test) { - if (testStats.containsKey(test)) - testStats.get(test).incPassed(); - else - throw new ReviewedStingException("No such test: " + test); - } - - public void reportFailed(TestName test) { - if (testStats.containsKey(test)) - testStats.get(test).incFailed(); - else - throw new ReviewedStingException("No such test: " + test); - } - - public void report() { - System.out.println(); - System.out.println(String.format("%20s\tPASS\tFAIL", "")); - for (TestName test : testStats.keySet()) - System.out.println(String.format("%20s\t%d\t%d", test.getTestName(), testStats.get(test).getPassed(), testStats.get(test).getFailed())); - System.out.println(); - } - } - - private class TestOutcome { - private long passed; - private long failed; - - public long getPassed() { - return passed; - } - - public void incPassed() { - this.passed++; - } - - public long 
getFailed() { - return failed; - } - - public void incFailed() { - this.failed++; - } - } - - private BaseCounts getFilteredBaseCounts(AlignmentContext context) { - return getBaseCounts(context, MIN_BASE_QUAL, MIN_MAPPING_QUAL); - } - - private BaseCounts getFullBaseCounts(AlignmentContext context) { - return getBaseCounts(context, 3, 0); - } - - private BaseCounts getBaseCounts(AlignmentContext context, int mbq, int mmq) { - BaseCounts fullBaseCounts = new BaseCounts(); - for (String rg : context.getBasePileup().getReadGroups()) { - if (!rg.equals(reducedReadGroupID)) { - BaseCounts b = BaseCounts.createWithCounts(context.getBasePileup().getPileupForReadGroup(rg).getBaseAndMappingFilteredPileup(mbq, mmq).getBaseCounts()); - fullBaseCounts.add(b); - } - } - return fullBaseCounts; - } - - -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java deleted file mode 100644 index 22ea78521..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java +++ /dev/null @@ -1,107 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import it.unimi.dsi.fastutil.objects.ObjectAVLTreeSet; -import it.unimi.dsi.fastutil.objects.ObjectSortedSet; -import org.broadinstitute.sting.utils.*; - -import java.util.Collection; - - -/** - * A stash of regions that must be kept uncompressed in all samples - * - * In general, these are regions that were kept uncompressed by a tumor sample and we want to force - * all other samples (normals and/or tumors) to also keep these regions uncompressed - * - * User: carneiro - * Date: 10/15/12 - * Time: 4:08 PM - */ -public class CompressionStash extends ObjectAVLTreeSet { - public CompressionStash() { - super(); - } - - /** - * Adds a UnvalidatingGenomeLoc to the stash and merges it with any overlapping (and contiguous) existing loc - * in the stash. - * - * @param insertLoc the new loc to be inserted - * @return true if the loc, or it's merged version, wasn't present in the list before. - */ - @Override - public boolean add(final FinishedGenomeLoc insertLoc) { - ObjectSortedSet removedLocs = new ObjectAVLTreeSet(); - for (FinishedGenomeLoc existingLoc : this) { - if (existingLoc.isPast(insertLoc)) { - break; // if we're past the loc we're done looking for overlaps. 
- } - if (existingLoc.equals(insertLoc)) { - return false; // if this loc was already present in the stash, we don't need to insert it. - } - if (existingLoc.contiguousP(insertLoc)) { - removedLocs.add(existingLoc); // list the original loc for merging - } - } - - this.removeAll(removedLocs); // remove all locs that will be merged - removedLocs.add(insertLoc); // add the new loc to the list of locs that will be merged - - return super.add(new FinishedGenomeLoc(GenomeLoc.merge(removedLocs), insertLoc.isFinished())); - } - - @Override - public boolean addAll(Collection locs) { - boolean result = false; - for (final FinishedGenomeLoc loc : locs) { - result |= this.add(loc); - } - return result; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/Compressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/Compressor.java deleted file mode 100644 index 1c0336ebf..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/Compressor.java +++ /dev/null @@ -1,108 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. 
NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 4/10/11 - * Time: 8:49 AM - * - * A general interface for ReadCompressors. Read compressors have the following semantics: - * - * The accept a stream of reads, in order, and after each added read returns a compressed stream - * of reads for emission. This stream of reads is a "reduced" representation of the total stream - * of reads. The actual compression approach is left up to the implementing class. - */ -public interface Compressor { - /** - * Adds the read to the compressor. 
The returned iteratable collection of - * reads represents the incremental compressed output. - * @param read the next uncompressed read in the input stream to the compressor - * @return an iterator over the incrementally available compressed reads - */ - @Requires("read != null") - @Ensures("result != null") - Iterable addAlignment(GATKSAMRecord read); - - /** - * Must be called after the last read has been added to finalize the compressor state - * and return the last compressed reads from the compressor. - * @return an iterator over the final compressed reads of this compressor - */ - @Ensures("result != null") - Iterable close(); -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/FinishedGenomeLoc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/FinishedGenomeLoc.java deleted file mode 100644 index 13010f905..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/FinishedGenomeLoc.java +++ /dev/null @@ -1,82 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. 
NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; - -/** - * GenomeLocs are very useful objects to keep track of genomic locations and perform set operations - * with them. - * - * However, GenomeLocs are bound to strict validation through the GenomeLocParser and cannot - * be created easily for small tasks that do not require the rigors of the GenomeLocParser validation - * - * UnvalidatingGenomeLoc is a simple utility to create GenomeLocs without going through the parser. Should - * only be used outside of the engine. - * - * User: carneiro - * Date: 10/16/12 - * Time: 2:07 PM - */ -public class FinishedGenomeLoc extends UnvalidatingGenomeLoc { - private boolean finished; - - public FinishedGenomeLoc(final String contigName, final int contigIndex, final int start, final int stop, final boolean finished) { - super(contigName, contigIndex, start, stop); - this.finished = finished; - } - - public FinishedGenomeLoc(final GenomeLoc loc, final boolean finished) { - super(loc.getContig(), loc.getContigIndex(), loc.getStart(), loc.getStop()); - this.finished = finished; - } - - public boolean isFinished() { - return finished; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java deleted file mode 100644 index 5e84076fd..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ /dev/null @@ -1,393 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import it.unimi.dsi.fastutil.objects.ObjectArrayList; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - - -/** - * The element that describes the header of the sliding window. - * - * Each site has a header element containing the counts of each base, it's reference based location and whether or - * not the site has insertions (to it's right). It also contains information about the bases that have been filtered - * out due to mapping or base quality. - */ -public class HeaderElement { - private BaseAndQualsCounts positiveConsensusBaseCounts; // How many A,C,G,T (and D's) are in this site. - private BaseAndQualsCounts negativeConsensusBaseCounts; // How many A,C,G,T (and D's) are in this site. - private BaseAndQualsCounts filteredBaseCounts; // How many A,C,G,T (and D's) were filtered out in this site. 
- private int insertionsToTheRight; // How many reads in this site had insertions to the immediate right - private int location; // Genome location of this site (the sliding window knows which contig we're at - - protected static final int MIN_COUNT_FOR_USING_PVALUE = 2; - - public int getLocation() { - return location; - } - - /** - * Get the base counts object for the consensus type - * - * @param consensusType the type to use - * @return non-null base counts - */ - public BaseAndQualsCounts getBaseCounts(final SlidingWindow.ConsensusType consensusType) { - if ( consensusType == SlidingWindow.ConsensusType.POSITIVE_CONSENSUS ) - return positiveConsensusBaseCounts; - if ( consensusType == SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS ) - return negativeConsensusBaseCounts; - return filteredBaseCounts; - } - - /** - * Creates a new HeaderElement with the following default values: - empty consensusBaseCounts - empty - * filteredBaseCounts - 0 insertions to the right - empty mappingQuality list - * - * @param location the reference location for the new element - */ - public HeaderElement(final int location) { - this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, location); - } - - /** - * Creates a new HeaderElement with the following default values: - empty consensusBaseCounts - empty - * filteredBaseCounts - empty mappingQuality list - * - * @param location the reference location for the new element - */ - public HeaderElement(final int location, final int insertionsToTheRight) { - this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, location); - } - - /** - * Creates a new HeaderElement with all given parameters - * - * @param positiveConsensusBaseCounts the BaseCounts object for the running positive consensus synthetic read - * @param negativeConsensusBaseCounts the BaseCounts object for the running negative consensus synthetic read - * @param filteredBaseCounts the BaseCounts 
object for the filtered data synthetic read - * @param insertionsToTheRight number of insertions to the right of this HeaderElement - * @param location the reference location of this reference element - * HeaderElement - */ - public HeaderElement(final BaseAndQualsCounts positiveConsensusBaseCounts, final BaseAndQualsCounts negativeConsensusBaseCounts, final BaseAndQualsCounts filteredBaseCounts, final int insertionsToTheRight, final int location) { - this.positiveConsensusBaseCounts = positiveConsensusBaseCounts; - this.negativeConsensusBaseCounts = negativeConsensusBaseCounts; - this.filteredBaseCounts = filteredBaseCounts; - this.insertionsToTheRight = insertionsToTheRight; - this.location = location; - } - - /** - * Whether or not the site represented by this HeaderElement is variant according to the definitions of variant - * by insertion, deletion and mismatches. - * - * @param minVariantPvalue min p-value for deciding that a position is or is not variable due to mismatches - * @param minVariantProportion min proportion for deciding that a position is or is not variable due to mismatches - * @param minIndelProportion min proportion for deciding that a position is or is not variable due to indels - * @return true if site is variant by any definition. False otherwise. 
- */ - public boolean isVariant(final double minVariantPvalue, final double minVariantProportion, final double minIndelProportion) { - return ( hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) || hasConsensusData(SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS) ) - && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantPvalue, minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips()); - } - - /** - * Adds a new base to the HeaderElement updating all counts accordingly - * - * @param base the base to add - * @param baseQual the base quality - * @param insQual the base insertion quality - * @param delQual the base deletion quality - * @param baseMappingQuality the mapping quality of the read this base belongs to - * @param minBaseQual the minimum base qual allowed to be a good base - * @param minMappingQual the minimum mapping qual allowed to be a good read - * @param isSoftClipped true if the base is soft-clipped in the original read - * @param isNegativeStrand true if the base comes from a read on the negative strand - */ - public void addBase(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQuality, final int minBaseQual, final int minMappingQual, final boolean isSoftClipped, final boolean isNegativeStrand) { - // If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts - if ( baseMappingQuality >= minMappingQual ) { - if ( isNegativeStrand ) - negativeConsensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped); - else - positiveConsensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped); - } else { - filteredBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); - } - } - - /** - * Adds a new base to the 
HeaderElement updating all counts accordingly - * - * @param base the base to add - * @param baseQual the base quality - * @param insQual the base insertion quality - * @param delQual the base deletion quality - * @param baseMappingQuality the mapping quality of the read this base belongs to - * @param minBaseQual the minimum base qual allowed to be a good base - * @param minMappingQual the minimum mapping qual allowed to be a good read - * @param isSoftClipped true if the base is soft-clipped in the original read - * @param isNegativeStrand true if the base comes from a read on the negative strand - */ - public void removeBase(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQuality, final int minBaseQual, final int minMappingQual, final boolean isSoftClipped, final boolean isNegativeStrand) { - // If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts - if ( baseMappingQuality >= minMappingQual ) { - if ( isNegativeStrand ) - negativeConsensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped); - else - positiveConsensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped); - } else { - filteredBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); - } - } - - /** - * Adds an insertions to the right of the HeaderElement and updates all counts accordingly. All insertions - * should be added to the right of the element. - */ - public void addInsertionToTheRight() { - insertionsToTheRight++; - } - - /** - * Does this HeaderElement contain consensus data? 
- * - * @param consensusType the type to use - * @return whether or not this HeaderElement contains consensus data - */ - public boolean hasConsensusData(final SlidingWindow.ConsensusType consensusType) { - return getBaseCounts(consensusType).totalCount() > 0; - } - - /** - * A HeaderElement is empty if it has no consensus or filtered data - * - * @return whether or not this HeaderElement has no data - */ - public boolean isEmpty() { - return !hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) && !hasConsensusData(SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS) && !hasConsensusData(SlidingWindow.ConsensusType.FILTERED); - } - - /** - * removes an insertion from this element (if you removed a read that had an insertion) - */ - public void removeInsertionToTheRight() { - this.insertionsToTheRight--; - if (insertionsToTheRight < 0) - throw new ReviewedStingException("Removed too many insertions, header is now negative at position " + location); - } - - public boolean hasInsertionToTheRight() { - return insertionsToTheRight > 0; - } - - public int numInsertionsToTheRight() { - return insertionsToTheRight; - } - - /** - * Whether or not the HeaderElement is variant due to excess insertions - * - * @return whether or not the HeaderElement is variant due to excess insertions - */ - private boolean isVariantFromInsertions(double minIndelProportion) { - final int numberOfBases = totalCountForBothStrands(); - if (numberOfBases == 0) - return (insertionsToTheRight > 0); // do we only have insertions? 
- - // if we have bases and insertions, check the ratio - return ((double) insertionsToTheRight / numberOfBases) > minIndelProportion; - } - - private int totalCountForBothStrands() { - return positiveConsensusBaseCounts.totalCount() + negativeConsensusBaseCounts.totalCount(); - } - - /** - * Whether or not the HeaderElement is variant due to excess deletions - * - * @return whether or not the HeaderElement is variant due to excess deletions - */ - private boolean isVariantFromDeletions(double minIndelProportion) { - return positiveConsensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || positiveConsensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion - || negativeConsensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || negativeConsensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion; - } - - /** - * Whether or not the HeaderElement is variant due to excess mismatches - * - * @param minVariantPvalue the minimum pvalue to call a site variant (used with low coverage). - * @param minVariantProportion the minimum proportion to call a site variant (used with high coverage). - * @return whether or not the HeaderElement is variant due to excess mismatches - */ - protected boolean isVariantFromMismatches(final double minVariantPvalue, final double minVariantProportion) { - return isVariantFromMismatches(minVariantPvalue, minVariantProportion, SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) || - isVariantFromMismatches(minVariantPvalue, minVariantProportion, SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS); - } - - /** - * Whether or not the HeaderElement is variant due to excess mismatches - * - * @param minVariantPvalue the minimum pvalue to call a site variant (used with low coverage). - * @param minVariantProportion the minimum proportion to call a site variant (used with high coverage). 
- * @param consensusType the consensus type to use - * @return whether or not the HeaderElement is variant due to excess mismatches - */ - private boolean isVariantFromMismatches(final double minVariantPvalue, final double minVariantProportion, final SlidingWindow.ConsensusType consensusType) { - final BaseAndQualsCounts baseAndQualsCounts = getBaseCounts(consensusType); - final int totalCount = baseAndQualsCounts.totalCountWithoutIndels(); - final BaseIndex mostCommon = baseAndQualsCounts.baseIndexWithMostProbabilityWithoutIndels(); - final int countOfOtherBases = totalCount - baseAndQualsCounts.countOfBase(mostCommon); - return hasSignificantCount(countOfOtherBases, totalCount, minVariantPvalue, minVariantProportion); - } - - /** - * This handles the special case where we have more bases that came from soft clips than bases that came from - * normal bases by forcing it to become a variant region. We don't want a consensus based on too little information. - * - * @return true if we had more soft clipped bases contributing to this site than matches/mismatches. - */ - protected boolean isVariantFromSoftClips() { - return isVariantFromSoftClips(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) || isVariantFromSoftClips(SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS); - } - - /** - * This handles the special case where we have more bases that came from soft clips than bases that came from - * normal bases by forcing it to become a variant region. We don't want a consensus based on too little information. - * - * @param consensusType the consensus type to use - * @return true if we had more soft clipped bases contributing to this site than matches/mismatches. 
- */ - private boolean isVariantFromSoftClips(final SlidingWindow.ConsensusType consensusType) { - final BaseAndQualsCounts baseAndQualsCounts = getBaseCounts(consensusType); - final int nSoftClippedBases = baseAndQualsCounts.nSoftclips(); - return nSoftClippedBases > 0 && nSoftClippedBases >= (baseAndQualsCounts.totalCount() - nSoftClippedBases); - } - - /** - * Calculates the number of alleles necessary to represent this site. - * - * @param minVariantPvalue the minimum pvalue to call a site variant. - * @param minVariantProportion the minimum proportion to call a site variant. - * @return the number of alleles necessary to represent this site or -1 if there are too many indels - */ - public int getNumberOfBaseAlleles(final double minVariantPvalue, final double minVariantProportion) { - final ObjectArrayList alleles = getAlleles(minVariantPvalue, minVariantProportion); - return alleles == null ? -1 : alleles.size(); - } - - /** - * Calculates the alleles necessary to represent this site. - * - * @param minVariantPvalue the minimum pvalue to call a site variant. - * @param minVariantProportion the minimum proportion to call a site variant. 
- * @return the list of alleles necessary to represent this site or null if there are too many indels - */ - public ObjectArrayList getAlleles(final double minVariantPvalue, final double minVariantProportion) { - // make sure we have bases at all - final int totalBaseCount = totalCountForBothStrands(); - if ( totalBaseCount == 0 ) - return new ObjectArrayList<>(0); - - // next, check for insertions; technically, the insertion count can be greater than totalBaseCount - // (because of the way insertions are counted), so we need to account for that - if ( hasSignificantCount(Math.min(totalBaseCount, insertionsToTheRight), totalBaseCount, minVariantPvalue, minVariantProportion) ) - return null; - - // finally, check for the bases themselves (including deletions) - final ObjectArrayList alleles = new ObjectArrayList<>(4); - for ( final BaseIndex base : BaseIndex.values() ) { - final int baseCount = positiveConsensusBaseCounts.countOfBase(base) + negativeConsensusBaseCounts.countOfBase(base); - if ( baseCount == 0 ) - continue; - - if ( hasSignificantCount(baseCount, totalBaseCount, minVariantPvalue, minVariantProportion) ) { - if ( base == BaseIndex.D ) - return null; - alleles.add(base); - } - } - return alleles; - } - - /* - * Checks whether there are a significant number of softclips. - * - * @param minVariantPvalue the minimum pvalue to call a site variant. - * @param minVariantProportion the minimum proportion to call a site variant. - * @return true if there are significant softclips, false otherwise - */ - public boolean hasSignificantSoftclips(final double minVariantPvalue, final double minVariantProportion) { - return hasSignificantCount(positiveConsensusBaseCounts.nSoftclips() + negativeConsensusBaseCounts.nSoftclips(), totalCountForBothStrands(), minVariantPvalue, minVariantProportion); - } - - /* - * Checks whether there are a significant number of count. 
- * - * @param count the count (k) to test against - * @param total the total (n) to test against - * @param minVariantPvalue the minimum pvalue to call a site variant. - * @param minVariantProportion the minimum proportion to call a site variant. - * @return true if there is a significant count given the provided pvalue, false otherwise - */ - private boolean hasSignificantCount(final int count, final int total, final double minVariantPvalue, final double minVariantProportion) { - if ( count == 0 || total == 0 ) - return false; - - // use p-values for low counts of k - if ( count <= MIN_COUNT_FOR_USING_PVALUE ) { - final double pvalue = MathUtils.binomialCumulativeProbability(total, 0, count); - return pvalue > minVariantPvalue; - } - - // otherwise, use straight proportions - final int minBaseCountForSignificance = (int)(minVariantProportion * total); - return count >= minBaseCountForSignificance; - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java deleted file mode 100644 index bdd407fba..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ /dev/null @@ -1,163 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import com.google.java.contract.Ensures; -import it.unimi.dsi.fastutil.objects.*; -import net.sf.samtools.SAMFileHeader; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * - * @author depristo - */ -public class MultiSampleCompressor { - protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class); - - protected Object2ObjectMap compressorsPerSample = new Object2ObjectOpenHashMap(); - - public MultiSampleCompressor(SAMFileHeader header, - final int contextSize, - final int downsampleCoverage, - final int minMappingQuality, - final double minAltPValueToTriggerVariant, - final double minAltProportionToTriggerVariant, - final double minIndelProportionToTriggerVariant, - final int minBaseQual, - final ReduceReads.DownsampleStrategy downsampleStrategy) { - for ( String name : SampleUtils.getSAMFileSamples(header) ) { - compressorsPerSample.put(name, - new SingleSampleCompressor(contextSize, downsampleCoverage, - minMappingQuality, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); - } - } - - /** - * Add an alignment to the compressor - * - * @param read the read to be added - * @param knownSnpPositions the set of known SNP positions - * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window) - */ - public ObjectSet addAlignment(final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions) { - String sampleName = read.getReadGroup().getSample(); - SingleSampleCompressor compressor = compressorsPerSample.get(sampleName); - if ( compressor == null ) - throw new 
ReviewedStingException("No compressor for sample " + sampleName); - Pair, CompressionStash> readsAndStash = compressor.addAlignment(read, knownSnpPositions); - ObjectSet reads = readsAndStash.getFirst(); - CompressionStash regions = readsAndStash.getSecond(); - - reads.addAll(closeVariantRegionsInAllSamples(regions, knownSnpPositions)); - - return reads; - } - - /** - * Properly closes the compressor. - * - * @param knownSnpPositions the set of known SNP positions - * @return A non-null set/list of all reads generated - */ - @Ensures("result != null") - public ObjectSet close(final ObjectSortedSet knownSnpPositions) { - ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); - for ( SingleSampleCompressor sample : compressorsPerSample.values() ) { - Pair, CompressionStash> readsAndStash = sample.close(knownSnpPositions); - reads.addAll(readsAndStash.getFirst()); - } - return reads; - } - - /** - * Finalizes current variant regions. - * - * @param knownSnpPositions the set of known SNP positions - * @return A non-null set/list of all reads generated - */ - private ObjectSet closeVariantRegionsInAllSamples(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) { - ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); - if (!regions.isEmpty()) { - for (SingleSampleCompressor sample : compressorsPerSample.values()) { - reads.addAll(sample.closeVariantRegions(regions, knownSnpPositions)); - } - } - return reads; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java deleted file mode 100644 index 383ba5ee9..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ /dev/null @@ -1,782 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD 
INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; -import it.unimi.dsi.fastutil.objects.ObjectAVLTreeSet; -import it.unimi.dsi.fastutil.objects.ObjectArrayList; -import it.unimi.dsi.fastutil.objects.ObjectSortedSet; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMProgramRecord; -import net.sf.samtools.util.SequenceUtil; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.filters.*; -import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import 
org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.Collections; -import java.util.List; - - -/** - * Reduces the BAM file using read based compression that keeps only essential information for variant calling - * - *

- * This tool will generate reduced versions of the BAM files that still follow the BAM specification - * and contain all the information necessary to call variants according to the GATK Best Practices recommendations. - * Some options allow you to tune how much compression you want to achieve. The default values have been - * shown to reduce a typical whole exome BAM file by 100x. The higher the coverage, the bigger the - * savings in file size and performance of the downstream tools. - * - *

Input

- *

- * The BAM file to be compressed - *

- * - *

Output

- *

- * The compressed (reduced) BAM file. - * - *

- *

Examples

- *
- * java -Xmx4g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T ReduceReads \
- *   -I myData.bam \
- *   -o myData.reduced.bam
- * 
- */ - -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) -@PartitionBy(PartitionType.CONTIG) -@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) -@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=40) -public class ReduceReads extends ReadWalker, ReduceReadsStash> { - - @Output(required = false, defaultToStdout = false) - private StingSAMFileWriter out = null; - private SAMFileWriter writerToUse = null; - - /** - * - */ - @Argument(fullName = "context_size", shortName = "cs", doc = "The number of bases to keep around mismatches (potential variation)", required = false) - public int contextSize = 10; - - /** - * Reads that have - * mapping quality below this threshold will not be counted towards consensus, but are still counted - * towards variable regions. - */ - @Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "The minimum mapping quality to be considered for the consensus synthetic read", required = false) - public int minMappingQuality = 20; - - /** - * Reads that have - * base quality below this threshold will not be counted towards consensus, but are still counted - * towards variable regions. - */ - @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "The minimum base quality to be considered for the consensus synthetic read", required = false) - public byte minBaseQual = 15; - - /** - * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases at the tails with - * quality at or lower than this threshold will be hard clipped off before entering the reduce reads algorithm. 
- */ - @Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false) - public byte minTailQuality = 2; - - /** - * Any number of VCF files representing known SNPs to be used for the polyploid-based reduction. - * Could be e.g. dbSNP and/or official 1000 Genomes SNP calls. Non-SNP variants in these files will be ignored. - * If provided, the polyploid ("het") compression will work only when a single SNP from the known set is present - * in a consensus window (otherwise there will be no reduction); if not provided then polyploid compression will - * be triggered anywhere there is a single SNP present in a consensus window. - */ - @Input(fullName="known_sites_for_polyploid_reduction", shortName = "known", doc="Input VCF file(s) with known SNPs", required=false) - public List> known = Collections.emptyList(); - - /** - * This strips away all extra information of the read -- anything other than bases, quals - * and read group. - */ - @Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "Do not simplify read", required = false) - public boolean DONT_SIMPLIFY_READS = false; - - /** - * Note that it is not necessary to turn this on for reads that are not mate paired. - * The program will behave correctly by default in those cases. - */ - @Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "Do not hard clip adaptor sequences", required = false) - public boolean DONT_CLIP_ADAPTOR_SEQUENCES = false; - - /** - * This option overrides the argument of minimum tail - * quality. 
- */ - @Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "Do not hard clip the low quality tails of the reads", required = false) - public boolean DONT_CLIP_LOW_QUAL_TAILS = false; - - /** - * By default, ReduceReads will hard clip away any low quality soft clipped - * base left by the aligner and use the high quality soft clipped bases in it's traversal algorithm to identify variant - * regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual) - */ - @Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "Do not use high quality soft-clipped bases", required = false) - public boolean DONT_USE_SOFTCLIPPED_BASES = false; - - /** - * By default, ReduceReads will compress read names to numbers and guarantee - * uniqueness and reads with similar name will still have similar compressed names. Note: If you scatter/gather - * there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing. - */ - @Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "Do not compress read names", required = false) - public boolean DONT_COMPRESS_READ_NAMES = false; - - /** - * The hard clips will happen exactly at the interval border. - */ - @Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "Hard clip all incoming reads to the desired intervals", required = false) - public boolean HARD_CLIP_TO_INTERVAL = false; - - /** - * Anything below this will be - * considered consensus and reduced (otherwise we will try to trigger polyploid compression). Note that - * this value is used only regions with high coverage. 
- */ - @Advanced - @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "Minimum proportion of mismatches in a site to trigger a variant region", required = false) - public double minAltProportionToTriggerVariant = 0.05; - - /** - * Any site with a value falling below this will be considered consensus and reduced (otherwise we will try to - * trigger polyploid compression). Note that this value is used only regions with low coverage. - */ - @Advanced - @Argument(fullName = "minimum_alt_pvalue_to_trigger_variant", shortName = "min_pvalue", doc = "Minimum p-value from binomial distribution of mismatches in a site to trigger a variant region", required = false) - public double minAltPValueToTriggerVariant = 0.01; - - /** - * Anything below this will be considered consensus. - */ - @Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "Minimum proportion of indels in a site to trigger a variant region", required = false) - public double minIndelProportionToTriggerVariant = 0.05; - - /** - * This level of downsampling only happens after the region has been evaluated, therefore it can - * be combined with the engine level downsampling. - * A value of 0 turns downsampling off. - */ - @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "Downsample the number of reads emitted per sample in a variant region for better compression", required = false) - public int downsampleCoverage = 250; - - /** - * Generally, this tool is not meant to be run for more than 1 sample at a time. The one valid exception - * brought to our attention by colleagues is the specific case of tumor/normal pairs in cancer analysis. - * To prevent users from unintentionally running the tool in a less than ideal manner, we require them - * to explicitly enable multi-sample analysis with this argument. 
- */ - @Argument(fullName = "cancer_mode", shortName = "cancer_mode", doc = "Enable multi-sample reduction for cancer analysis", required = false) - public boolean ALLOW_MULTIPLE_SAMPLES = false; - - @Hidden - @Argument(fullName = "nwayout", shortName = "nw", doc = "Generate separate output files per input file", required = false) - public boolean nwayout = false; - - @Hidden - @Argument(fullName = "", shortName = "dl", doc = "Debug level", required = false) - public int debugLevel = 0; - - @Hidden - @Argument(fullName = "", shortName = "dr", doc = "Debug read", required = false) - public String debugRead = ""; - - @Hidden - @Argument(fullName = "downsample_strategy", shortName = "dm", doc = "Downsampling strategy", required = false) - public DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal; - - @Hidden - @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="Discard program tags", required = false) - public boolean NO_PG_TAG = false; - - public enum DownsampleStrategy { - Normal, - Adaptive - } - - int nCompressedReads = 0; - - private static int READ_NAME_HASH_DEFAULT_SIZE = 1000; - Long nextReadNumber = 1L; // The next number to use for the compressed read name. - Object2LongOpenHashMap readNameHash; // This hash will keep the name of the original read the new compressed name (a number). - - ObjectSortedSet intervalList; - - ObjectSortedSet knownSnpPositions; - - // IMPORTANT: DO NOT CHANGE THE VALUE OF THIS CONSTANT VARIABLE; IT IS NOW PERMANENTLY THE @PG NAME THAT EXTERNAL TOOLS LOOK FOR IN THE BAM HEADER - public static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag - private static final String PROGRAM_FILENAME_EXTENSION = ".reduced.bam"; - - /** - * Basic generic initialization of the readNameHash and the intervalList. 
Output initialization - * is done at the reduceInit method - */ - @Override - public void initialize() { - super.initialize(); - - if ( !nwayout && out == null ) - throw new UserException.MissingArgument("out", "the output must be provided and is optional only for certain debugging modes"); - - if ( nwayout && out != null ) - throw new UserException.CommandLineException("--out and --nwayout cannot be used simultaneously; please use one or the other"); - - if ( minAltPValueToTriggerVariant < 0.0 || minAltPValueToTriggerVariant > 1.0 ) - throw new UserException.BadArgumentValue("--minimum_alt_pvalue_to_trigger_variant", "must be a value between 0 and 1 (inclusive)"); - - if ( minAltProportionToTriggerVariant < 0.0 || minAltProportionToTriggerVariant > 1.0 ) - throw new UserException.BadArgumentValue("--minimum_alt_proportion_to_trigger_variant", "must be a value between 0 and 1 (inclusive)"); - - if ( SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()).size() > 1 && !ALLOW_MULTIPLE_SAMPLES ) - throw new UserException.BadInput("Reduce Reads is not meant to be run for more than 1 sample at a time except for the specific case of tumor/normal pairs in cancer analysis. If that is what you want to do, use the -cancer_mode flag."); - - if ( known.isEmpty() ) - knownSnpPositions = null; - else - knownSnpPositions = new ObjectAVLTreeSet(); - - GenomeAnalysisEngine toolkit = getToolkit(); - this.resetReadNameHash(); // prepare the read name hash to keep track of what reads have had their read names compressed - intervalList = new ObjectAVLTreeSet(); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode - - if (toolkit.getIntervals() != null) - intervalList.addAll(toolkit.getIntervals()); - - final boolean indexOnTheFly = true; - final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate; - if (nwayout) { - SAMProgramRecord programRecord = NO_PG_TAG ? 
null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME); - writerToUse = new BySampleSAMFileWriter(toolkit, PROGRAM_FILENAME_EXTENSION, sortOrder, false, indexOnTheFly, NO_PG_TAG, programRecord, true); - } - else { - writerToUse = out; - out.setPresorted(false); - if (!NO_PG_TAG) { - Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), false, this, PROGRAM_RECORD_NAME); - } - } - } - - /** Initializer for {@link #readNameHash}. */ - private void resetReadNameHash() { - // If the hash grows large, subsequent clear operations can be very expensive, so trim the hash down if it grows beyond its default. - if (readNameHash == null || readNameHash.size() > READ_NAME_HASH_DEFAULT_SIZE) { - readNameHash = new Object2LongOpenHashMap(READ_NAME_HASH_DEFAULT_SIZE); - } else { - readNameHash.clear(); - } - } - - /** - * Takes in a read and prepares it for the SlidingWindow machinery by performing the - * following optional clipping operations: - * 1. Hard clip adaptor sequences - * 2. Hard clip low quality tails - * 3. Hard clip all remaining soft clipped bases - * 4. 
Hard clip read to the intervals in the interval list (this step may produce multiple reads) - * - * @param ref default map parameter - * @param read default map parameter - * @param metaDataTracker default map parameter - * @return a linked list with all the reads produced by the clipping operations - */ - @Override - public ObjectArrayList map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { - ObjectArrayList mappedReads; - if (!debugRead.isEmpty() && read.getReadName().contains(debugRead)) - System.out.println("Found debug read!"); - - if (debugLevel == 1) - System.out.printf("\nOriginal: %s %s %d %d\n", read, read.getCigar(), read.getAlignmentStart(), read.getAlignmentEnd()); - - // we write the actual alignment starts to their respective alignment shift tags in the temporary - // attribute hash so we can determine later if we need to write down the alignment shift to the reduced BAM file - read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, read.getAlignmentStart()); - read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, read.getAlignmentEnd()); - - // Check if the read goes beyond the boundaries of the chromosome, and hard clip those boundaries. - int chromosomeLength = ref.getGenomeLocParser().getContigInfo(read.getReferenceName()).getSequenceLength(); - if (read.getSoftStart() < 0) - read = ReadClipper.hardClipByReadCoordinates(read, 0, -read.getSoftStart()); - if (read.getSoftEnd() > chromosomeLength) - read = ReadClipper.hardClipByReadCoordinates(read, chromosomeLength - read.getSoftStart() + 1, read.getReadLength() - 1); - - if (!DONT_SIMPLIFY_READS) - read.simplify(); // Clear all unnecessary attributes - if (!DONT_CLIP_ADAPTOR_SEQUENCES) - read = ReadClipper.hardClipAdaptorSequence(read); // Strip away adaptor sequences, if any. 
- if (!DONT_CLIP_LOW_QUAL_TAILS) - read = ReadClipper.hardClipLowQualEnds(read, minTailQuality); // Clip low quality tails - if (!isWholeGenome()) { - if (HARD_CLIP_TO_INTERVAL) - mappedReads = hardClipReadToInterval(read); // Hard clip the remainder of the read to the desired interval - else { - mappedReads = new ObjectArrayList(); - mappedReads.add(read); - } - } - else { - mappedReads = new ObjectArrayList(); - if (!read.isEmpty()) - mappedReads.add(read); - } - - if (!mappedReads.isEmpty() && !DONT_USE_SOFTCLIPPED_BASES) { - ObjectArrayList tempList = new ObjectArrayList(); - for (GATKSAMRecord mRead : mappedReads) { - GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualitySoftClips(mRead, minBaseQual); - if (!clippedRead.isEmpty()) - tempList.add(clippedRead); - } - mappedReads = tempList; - } - - if (debugLevel == 1) - for (GATKSAMRecord mappedRead : mappedReads) - System.out.printf("MAPPED: %s %d %d\n", mappedRead.getCigar(), mappedRead.getAlignmentStart(), mappedRead.getAlignmentEnd()); - - // add the SNPs to the list of known positions - populateKnownSNPs(metaDataTracker); - - return mappedReads; - } - - /* - * Add the positions of known SNPs to the set so that we can keep track of it - * - * @param metaDataTracker the ref meta data tracker - */ - protected void populateKnownSNPs(final RefMetaDataTracker metaDataTracker) { - for ( final VariantContext vc : metaDataTracker.getValues(known) ) { - if ( vc.isSNP() ) - knownSnpPositions.add(getToolkit().getGenomeLocParser().createGenomeLoc(vc)); - } - } - - /** - * Initializes the ReduceReadsStash that keeps track of all reads that are waiting to - * enter the SlidingWindow machinery. The stash makes sure reads are served in order - * even though map() may generate reads that are only supposed to enter the machinery - * in the future. 
- * - * @return the empty stash - */ - @Override - public ReduceReadsStash reduceInit() { - return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); - } - - /** - * Takes the list of reads produced by map(), adds them to the stash (which keeps them sorted) and process - * all reads that come before the original read (the read that was passed to map) including the original - * read. This is where we send reads, in order, to the SlidingWindow machinery. - * - * @param mappedReads the list of reads sent by map - * @param stash the stash that keeps the reads in order for processing - * @return the stash with all reads that have not been processed yet - */ - public ReduceReadsStash reduce(ObjectArrayList mappedReads, ReduceReadsStash stash) { - if (debugLevel == 1) - stash.print(); - - boolean firstRead = true; - for (GATKSAMRecord read : mappedReads) { - boolean originalRead = firstRead && isOriginalRead(mappedReads, read); - - if (read.getReadLength() == 0) - throw new ReviewedStingException("Empty read sent to reduce, this should never happen! 
" + read.getReadName() + " -- " + read.getCigar() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd()); - - if (originalRead) { - ObjectArrayList readsReady = new ObjectArrayList(); - readsReady.addAll(stash.getAllReadsBefore(read)); - readsReady.add(read); - - for (GATKSAMRecord readReady : readsReady) { - if (debugLevel == 1) - System.out.println("REDUCE: " + readReady.getCigar() + " " + readReady.getAlignmentStart() + " " + readReady.getAlignmentEnd()); - - for (GATKSAMRecord compressedRead : stash.compress(readReady, knownSnpPositions)) - outputRead(compressedRead); - - // We only care about maintaining the link between read pairs if they are in the same variant - // region. Since an entire variant region's worth of reads is returned in a single call to - // stash.compress(), the readNameHash can be cleared after the for() loop above. - // The advantage of clearing the hash is that otherwise it holds all reads that have been encountered, - // which can use a lot of memory and cause RR to slow to a crawl and/or run out of memory. - this.resetReadNameHash(); - - } - } else - stash.add(read); - - firstRead = false; - } - - // reduce memory requirements by removing old positions - if ( !mappedReads.isEmpty() ) - clearStaleKnownPositions(mappedReads.get(0)); - - return stash; - } - - /** - * Now that now more reads will come, we process all the remaining reads in the stash, in order. - * - * @param stash the ReduceReadsStash with all unprocessed reads (from reduce) - */ - @Override - public void onTraversalDone(ReduceReadsStash stash) { - - // output any remaining reads in the compressor - for (GATKSAMRecord read : stash.close(knownSnpPositions)) - outputRead(read); - - if (nwayout) - writerToUse.close(); - } - - /** - * Removes known positions that are no longer relevant for use with het compression. 
- * - * @param read the current read, used for checking whether there are stale positions we can remove - */ - protected void clearStaleKnownPositions(final GATKSAMRecord read) { - // nothing to clear if not used or empty - if ( knownSnpPositions == null || knownSnpPositions.isEmpty() ) - return; - - // not ready to be cleared until we encounter a read from a different contig - final int contigIndexOfRead = read.getReferenceIndex(); - if ( knownSnpPositions.first().getContigIndex() == contigIndexOfRead ) - return; - - // because we expect most elements to be stale, it's not going to be efficient to remove them one at a time - final ObjectAVLTreeSet goodLocs = new ObjectAVLTreeSet(); - for ( final GenomeLoc loc : knownSnpPositions ) { - if ( loc.getContigIndex() == contigIndexOfRead ) - goodLocs.add(loc); - } - knownSnpPositions.clear(); - knownSnpPositions.addAll(goodLocs); - } - - /** - * Hard clips away all parts of the read that doesn't agree with the intervals selected. - * - * Note: If read overlaps more than one interval, it will be hard clipped to all - * the intervals it overlaps with - * - * @param read the read to be hard clipped to the interval. 
- * @return a shallow copy of the read hard clipped to the interval - */ - private ObjectArrayList hardClipReadToInterval(GATKSAMRecord read) { - ObjectArrayList clippedReads = new ObjectArrayList(); - - GenomeLoc intervalOverlapped = null; // marks the interval to which the original read overlapped (so we can cut all previous intervals from the list) - - boolean originalRead = true; // false if this is the right tail of the original read - boolean overlap; // keeps track of the interval that overlapped the original read - boolean doneClipping; // triggers an early exit if we are done clipping this read - - if (isWholeGenome()) - clippedReads.add(read); // if we don't have intervals (wgs) the read goes in unchanged - - for (GenomeLoc interval : intervalList) { - - if (read.isEmpty()) // nothing to do with an empty read (could have been fully clipped before) - break; - - GATKSAMRecord clippedRead = null; // this will hold the read clipped to the interval to be added in the end of the switch - - switch (ReadUtils.getReadAndIntervalOverlapType(read, interval)) { - case NO_OVERLAP_RIGHT: // no reads on this interval, check the next interval if this is the original read - if (!originalRead) // something went wrong if this is the tail of the read - throw new ReviewedStingException("tail of the read should never NO_OVERLAP_RIGHT the following interval. " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString()); - overlap = false; - doneClipping = false; - break; - - - case NO_OVERLAP_HARDCLIPPED_RIGHT: // read used to overlap but got hard clipped and doesn't overlap anymore - if (originalRead) { - overlap = true; // effectively, we have found the read's location and now we are going to try and match it's tail (which happens to be the entire read). 
- clippedRead = GATKSAMRecord.emptyRead(read); - } else - overlap = false; - - doneClipping = false; - break; - - case NO_OVERLAP_CONTIG: // read is in a different contig - if (originalRead) { // the original read can be in a bigger contig, but not on a smaller one. - if (read.getReferenceIndex() < interval.getContigIndex()) - throw new ReviewedStingException("read is behind interval list. (contig) " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString()); - else { - overlap = false; - doneClipping = false; - } - } // tail read CANNOT be in a different contig. - else { - if (read.getReferenceIndex() < interval.getContigIndex()) { - overlap = false; - doneClipping = true; - } else - throw new ReviewedStingException("Tail read is in bigger contig than interval traversal. " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString()); - - } - break; - - case NO_OVERLAP_LEFT: - if (originalRead) // if this is the first read this should never happen. - throw new ReviewedStingException("original read cannot be behind the first interval. (position) " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString()); - - overlap = false; - doneClipping = true; - break; - - case NO_OVERLAP_HARDCLIPPED_LEFT: // read used to overlap but got hard clipped and doesn't overlap anymore - overlap = originalRead; // if this is the original read, we should not advance the interval list, the original overlap was here. 
- doneClipping = true; - break; - - case OVERLAP_LEFT: // clip the left tail of the read - clippedRead = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStart() - 1); - - overlap = true; - doneClipping = true; - break; - - case OVERLAP_RIGHT: // clip the right tail of the read and try to match it to the next interval - clippedRead = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, interval.getStop() + 1); - read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStop()); - - overlap = true; - doneClipping = false; - break; - - case OVERLAP_LEFT_AND_RIGHT: // clip both left and right ends of the read - clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, interval.getStart() - 1, interval.getStop() + 1); - read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStop()); - - overlap = true; - doneClipping = false; - break; - - case OVERLAP_CONTAINED: // don't do anything to the read - clippedRead = read; - - overlap = true; - doneClipping = true; - break; - - default: - throw new ReviewedStingException("interval overlap returned an unknown / unhandled state. 
If new state was added to intervalOverlap, it should be handled by hardClipReadToInterval."); - } - - if (overlap && originalRead) - intervalOverlapped = interval; - - if (clippedRead != null) { - originalRead = false; - - if (!clippedRead.isEmpty()) - clippedReads.add(clippedRead); // if the read overlaps the interval entirely within a deletion, it will be entirely clipped off - } - - if (doneClipping) - break; - } - - if (intervalOverlapped != null) - intervalList = intervalList.tailSet(intervalOverlapped); - - return clippedReads; - } - - /** - * Compresses the read name and adds it to output BAM file (reduced BAM) - * after performing some quality control - * - * @param read any read - */ - private void outputRead(GATKSAMRecord read) { - if (debugLevel == 2) { - checkForHighMismatch(read); - checkCigar(read); - } - - if (read.isReducedRead()) - nCompressedReads++; - else { - int originalAlignmentStart = (Integer) read.getTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT); - int originalAlignmentEnd = (Integer) read.getTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT); - - int startShift = originalAlignmentStart - read.getUnclippedStart(); // we annotate the shifts for better compression - int endShift = read.getUnclippedEnd() - originalAlignmentEnd; // we annotate the shifts for better compression - - if (startShift > 0) - read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, startShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (start) - if (endShift > 0) - read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, endShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (end) - } - - if (debugLevel == 1) - System.out.println("BAM: " + read.getCigar() + " " + read.getAlignmentStart() + " " + read.getAlignmentEnd()); - - if 
(!DONT_COMPRESS_READ_NAMES) - nextReadNumber = compressReadName(readNameHash, read, nextReadNumber); - - writerToUse.addAlignment(read); - } - - /** - * Quality control procedure that checks if the consensus reads contains too many - * mismatches with the reference. This should never happen and is a good trigger for - * errors with the algorithm. - * - * @param read any read - */ - private void checkForHighMismatch(GATKSAMRecord read) { - final int start = read.getAlignmentStart(); - final int stop = read.getAlignmentEnd(); - final byte[] ref = getToolkit().getReferenceDataSource().getReference().getSubsequenceAt(read.getReferenceName(), start, stop).getBases(); - final int nm = SequenceUtil.countMismatches(read, ref, start - 1); - final int readLen = read.getReadLength(); - final double nmFraction = nm / (1.0 * readLen); - if (nmFraction > 0.4 && readLen > 20 && read.getAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG) != null && read.getReadName().startsWith("Consensus")) - throw new ReviewedStingException("BUG: High mismatch fraction found in read " + read.getReadName() + " position: " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd()); - } - - private void checkCigar (GATKSAMRecord read) { - if (read.getCigar().isValid(null, -1) != null) { - throw new ReviewedStingException("BUG: cigar string is not valid: " + read.getCigarString()); - } - - } - - - /** - * Compresses the read name using the readNameHash if we have already compressed - * this read name before. 
- * - * @param hash the hash table containing the read name to compressed read name map - * @param read any read - * @param nextReadNumber the number to use in the compressed read name in case this is a new read name - * @return the next number to use in the compressed read name - */ - protected static long compressReadName(final Object2LongOpenHashMap hash, final GATKSAMRecord read, final long nextReadNumber) { - final String name = read.getReadName(); - final StringBuilder compressedName = new StringBuilder(); - long result = nextReadNumber; - if (read.isReducedRead()) { - compressedName.append("C"); - } - final Long readNumber = hash.get(name); - if (readNumber != null) { - compressedName.append(readNumber); - } else { - hash.put(name, nextReadNumber); - compressedName.append(nextReadNumber); - result++; - } - read.setReadName(compressedName.toString()); - return result; - } - - /** - * Returns true if the read is the original read that went through map(). - * - * This is important to know so we can decide what reads to pull from the stash. Only reads that came before the original read should be pulled. - * - * @param list the list - * @param read the read - * @return Returns true if the read is the original read that went through map(). - */ - private boolean isOriginalRead(ObjectArrayList list, GATKSAMRecord read) { - return isWholeGenome() || list.get(0).equals(read); - } - - /** - * Checks whether or not the intervalList is empty, meaning we're running in WGS mode. - * - * @return whether or not we're running in WGS mode. 
- */ - private boolean isWholeGenome() { - return intervalList.isEmpty(); - } - -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java deleted file mode 100644 index 52c5f0903..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java +++ /dev/null @@ -1,160 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import it.unimi.dsi.fastutil.objects.ObjectSortedSet; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -import java.util.LinkedList; -import java.util.List; -import java.util.SortedSet; -import java.util.TreeSet; - -/** - * This class implements a "read stash" that keeps reads always sorted in alignment order. Useful - * for read walkers that alter the alignment information of the incoming reads, but need to - * maintain the reads sorted for the reduce step. (e.g. ReduceReads) - */ - -public class ReduceReadsStash { - protected MultiSampleCompressor compressor; - SortedSet outOfOrderReads; - - /** - * Creates a stash with the default sorting order (read alignment) - * @param compressor the MultiSampleCompressor object to be used with this stash (for stash.close()) - */ - public ReduceReadsStash(MultiSampleCompressor compressor) { - this.compressor = compressor; - this.outOfOrderReads = new TreeSet(new AlignmentStartWithNoTiesComparator()); - } - - /** - * Get all reads before a given read (for processing) - * - * @param read the original read - * @return all reads that have alignment start before the original read. 
- */ - public List getAllReadsBefore(GATKSAMRecord read) { - List result = new LinkedList(); - GATKSAMRecord newHead = null; - - for (GATKSAMRecord stashedRead : outOfOrderReads) { - if (ReadUtils.compareSAMRecords(stashedRead, read) <= 0) - result.add(stashedRead); - else { - newHead = stashedRead; - break; - } - } - - if (result.size() > 0) { - if (result.size() == outOfOrderReads.size()) - outOfOrderReads.clear(); - else - outOfOrderReads = new TreeSet(outOfOrderReads.tailSet(newHead)); - } - - return result; - } - - /** - * sends the read to the MultiSampleCompressor - * - * @param read the read to be compressed - * @param knownSnpPositions the set of known SNP positions - * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window) - */ - public Iterable compress(final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions) { - return compressor.addAlignment(read, knownSnpPositions); - } - - /** - * Add a read to the stash - * - * @param read any read - */ - public void add(GATKSAMRecord read) { - outOfOrderReads.add(read); - } - - /** - * Close the stash, processing all remaining reads in order - * - * @param knownSnpPositions the set of known SNP positions - * @return a list of all the reads produced by the SlidingWindow machinery) - */ - public Iterable close(final ObjectSortedSet knownSnpPositions) { - LinkedList result = new LinkedList(); - - // compress all the stashed reads (in order) - for (GATKSAMRecord read : outOfOrderReads) - for (GATKSAMRecord compressedRead : compressor.addAlignment(read, knownSnpPositions)) - result.add(compressedRead); - - // output any remaining reads from the compressor - for (GATKSAMRecord read : compressor.close(knownSnpPositions)) - result.add(read); - - return result; - } - - /** - * Useful debug functionality, outputs all elements in the stash - */ - public void print() { - int i = 1; - System.out.println("Stash Contents:"); - for (GATKSAMRecord read : 
outOfOrderReads) - System.out.println(String.format("%3d: %s %d %d", i++, read.getCigarString(), read.getAlignmentStart(), read.getAlignmentEnd())); - System.out.println(); - } - -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java deleted file mode 100644 index 61c34b6a0..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java +++ /dev/null @@ -1,153 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import com.google.java.contract.Ensures; -import it.unimi.dsi.fastutil.objects.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -/** - * - * @author carneiro, depristo - * @version 3.0 - */ -public class SingleSampleCompressor { - final private int contextSize; - final private int downsampleCoverage; - final private int minMappingQuality; - final private double minAltPValueToTriggerVariant; - final private double minAltProportionToTriggerVariant; - final private double minIndelProportionToTriggerVariant; - final private int minBaseQual; - final private ReduceReads.DownsampleStrategy downsampleStrategy; - - private SlidingWindow slidingWindow; - private int slidingWindowCounter; - - public static Pair, CompressionStash> emptyPair = new Pair,CompressionStash>(new ObjectAVLTreeSet(), new CompressionStash()); - - public SingleSampleCompressor(final int contextSize, - final int downsampleCoverage, - final int minMappingQuality, - final double minAltPValueToTriggerVariant, - final double minAltProportionToTriggerVariant, - final double minIndelProportionToTriggerVariant, - final int minBaseQual, - final ReduceReads.DownsampleStrategy downsampleStrategy) { - this.contextSize = contextSize; - this.downsampleCoverage = downsampleCoverage; - this.minMappingQuality = minMappingQuality; - this.slidingWindowCounter = 0; - this.minAltPValueToTriggerVariant = minAltPValueToTriggerVariant; - this.minAltProportionToTriggerVariant = minAltProportionToTriggerVariant; - this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant; - this.minBaseQual = minBaseQual; - this.downsampleStrategy = downsampleStrategy; - } - - /** - * Add an alignment to the compressor - * - * @param read the read to 
be added - * @param knownSnpPositions the set of known SNP positions - * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window) - */ - public Pair, CompressionStash> addAlignment( final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions ) { - ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); - CompressionStash stash = new CompressionStash(); - int readOriginalStart = read.getUnclippedStart(); - - // create a new window if: - if ((slidingWindow != null) && - ( ( read.getReferenceIndex() != slidingWindow.getContigIndex() ) || // this is a brand new contig - (readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window - - // close the current sliding window - Pair, CompressionStash> readsAndStash = slidingWindow.close(knownSnpPositions); - reads = readsAndStash.getFirst(); - stash = readsAndStash.getSecond(); - slidingWindow = null; // so we create a new one on the next if - } - - if ( slidingWindow == null) { // this is the first read - slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), - slidingWindowCounter, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, - minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities()); - slidingWindowCounter++; - } - - stash.addAll(slidingWindow.addRead(read)); - return new Pair, CompressionStash>(reads, stash); - } - - /** - * Properly closes the compressor. - * - * @param knownSnpPositions the set of known SNP positions - * @return A non-null set/list of all reads generated - */ - @Ensures("result != null") - public Pair, CompressionStash> close(final ObjectSortedSet knownSnpPositions) { - return (slidingWindow != null) ? 
slidingWindow.close(knownSnpPositions) : emptyPair; - } - - /** - * Finalizes current variant regions. - * - * @param knownSnpPositions the set of known SNP positions - * @return A non-null set/list of all reads generated - */ - @Ensures("result != null") - public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) { - return slidingWindow == null ? ObjectSets.EMPTY_SET : slidingWindow.closeVariantRegions(regions, knownSnpPositions); - } - -} - diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java deleted file mode 100644 index d5aa8f944..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ /dev/null @@ -1,1110 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import it.unimi.dsi.fastutil.bytes.Byte2IntArrayMap; -import it.unimi.dsi.fastutil.bytes.Byte2IntMap; -import it.unimi.dsi.fastutil.objects.*; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -import java.util.*; - - -/** - * Created by IntelliJ IDEA. 
- * User: roger - * Date: 8/3/11 - * Time: 2:24 PM - */ -public class SlidingWindow { - - // Sliding Window data - final protected PriorityQueue readsInWindow; - final protected LinkedList windowHeader; - protected int contextSize; // the largest context size (between mismatches and indels) - protected String contig; - protected int contigIndex; - protected SAMFileHeader samHeader; - protected GATKSAMReadGroupRecord readGroupAttribute; - protected int downsampleCoverage; - - // Running consensus data - protected int consensusCounter; - protected String consensusReadName; - - // Filtered Data Consensus data - protected int filteredDataConsensusCounter; - protected String filteredDataReadName; - - // Additional parameters - protected double MIN_ALT_PVALUE_TO_TRIGGER_VARIANT; // pvalue has to be greater than this value to trigger variant region due to mismatches - protected double MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to mismatches - protected double MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to deletions - protected int MIN_BASE_QUAL_TO_COUNT; // qual has to be greater than or equal to this value - protected int MIN_MAPPING_QUALITY; - - protected ReduceReads.DownsampleStrategy downsampleStrategy; - private boolean hasIndelQualities; - - private static CompressionStash emptyRegions = new CompressionStash(); - - /** - * The types of synthetic reads - */ - protected enum ConsensusType { - POSITIVE_CONSENSUS, - NEGATIVE_CONSENSUS, - FILTERED - } - - public int getStopLocation() { - return getStopLocation(windowHeader); - } - - private int getStopLocation(final LinkedList header) { - return header.isEmpty() ? 
-1 : header.peekLast().getLocation(); - } - - public String getContig() { - return contig; - } - - public int getContigIndex() { - return contigIndex; - } - - public int getStartLocation(final LinkedList header) { - return header.isEmpty() ? -1 : header.peek().getLocation(); - } - - // for testing only - protected SlidingWindow(final String contig, final int contigIndex, final int startLocation) { - this.contig = contig; - this.contigIndex = contigIndex; - - contextSize = 10; - - this.windowHeader = new LinkedList<>(); - windowHeader.addFirst(new HeaderElement(startLocation)); - this.readsInWindow = new PriorityQueue<>(100, new Comparator() { - @Override - public int compare(GATKSAMRecord read1, GATKSAMRecord read2) { - return read1.getSoftEnd() - read2.getSoftEnd(); - } - }); - } - - public SlidingWindow(final String contig, final int contigIndex, final int contextSize, final SAMFileHeader samHeader, - final GATKSAMReadGroupRecord readGroupAttribute, final int windowNumber, - final double minAltPValueToTriggerVariant, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, - final int minBaseQual, final int minMappingQuality, final int downsampleCoverage, - final ReduceReads.DownsampleStrategy downsampleStrategy, final boolean hasIndelQualities) { - this.contextSize = contextSize; - this.downsampleCoverage = downsampleCoverage; - - this.MIN_ALT_PVALUE_TO_TRIGGER_VARIANT = minAltPValueToTriggerVariant; - this.MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT = minAltProportionToTriggerVariant; - this.MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT = minIndelProportionToTriggerVariant; - this.MIN_BASE_QUAL_TO_COUNT = minBaseQual; - this.MIN_MAPPING_QUALITY = minMappingQuality; - - this.windowHeader = new LinkedList<>(); - this.readsInWindow = new PriorityQueue<>(1000, new Comparator() { - @Override - public int compare(GATKSAMRecord read1, GATKSAMRecord read2) { - return read1.getSoftEnd() - read2.getSoftEnd(); - } - }); - - this.contig = 
contig; - this.contigIndex = contigIndex; - this.samHeader = samHeader; - this.readGroupAttribute = readGroupAttribute; - - this.consensusCounter = 0; - this.consensusReadName = "Consensus-" + windowNumber + "-"; - - this.filteredDataConsensusCounter = 0; - this.filteredDataReadName = "Filtered-" + windowNumber + "-"; - - this.downsampleStrategy = downsampleStrategy; - this.hasIndelQualities = hasIndelQualities; - } - - /** - * Add a read to the sliding window and slides the window accordingly. - * - * Reads are assumed to be in order, therefore, when a read is added the sliding window can - * assume that no more reads will affect read.getUnclippedStart() - contextSizeMismatches. The window - * slides forward to that position and returns all reads that may have been finalized in the - * sliding process. - * - * @param read the read - * @return a non-null list of reads (in the CompressionStash) that have been finished by sliding the window. - */ - @Requires({"read != null"}) - @Ensures("result != null") - public CompressionStash addRead(GATKSAMRecord read) { - addToHeader(windowHeader, read); // update the window header counts - // no need to track low mapping quality reads - if ( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) - readsInWindow.add(read); // add read to sliding reads - return slideWindow(read.getUnclippedStart()); - } - - /** - * Returns the next complete (or incomplete if closeLastRegion is true) variant region between 'from' (inclusive) and 'to' (exclusive) - * but converted to global coordinates. 
- * - * @param from beginning window header index of the search window (inclusive) in local (to the windowHeader) coordinates - * @param to end window header index of the search window (exclusive) in local (to the windowHeader) coordinates - * @param variantSite boolean array with true marking variant regions - * @param closeLastRegion if the last index is variant (so it's an incomplete region), should we close (and return as an interval) the location or ignore it? - * @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region. All coordinates returned are global. - */ - @Requires({"from >= 0", "from <= to", "to <= variantSite.length"}) - private FinishedGenomeLoc findNextVariantRegion(int from, int to, boolean[] variantSite, boolean closeLastRegion) { - boolean foundStart = false; - final int windowHeaderStart = getStartLocation(windowHeader); - int variantRegionStartIndex = 0; - for (int i=from; i= 0", "from <= to", "to <= variantSite.length"}) - @Ensures("result != null") - protected CompressionStash findVariantRegions(int from, int to, boolean[] variantSite, boolean closeLastRegion) { - final int windowHeaderStart = getStartLocation(windowHeader); - - CompressionStash regions = new CompressionStash(); - int index = from; - while(index < to) { - // returns results in global coordinates - FinishedGenomeLoc result = findNextVariantRegion(index, to, variantSite, closeLastRegion); - if (result == null) - break; - - regions.add(result); - if (!result.isFinished()) - break; - - index = result.getStop() - windowHeaderStart + 1; // go back to local coordinates - } - return regions; - } - - /** - * Determines if the window can be slid given the new incoming read. - * - * We check from the start of the window to the (unclipped) start of the new incoming read if there - * is any variant. - * If there are variant sites, we check if it's time to close the variant region. 
- * - * @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start! - * @return all reads that have fallen to the left of the sliding window after the slide - */ - protected CompressionStash slideWindow(final int incomingReadUnclippedStart) { - final int windowHeaderStartLocation = getStartLocation(windowHeader); - CompressionStash regions = emptyRegions; - boolean forceClose = true; - - if (incomingReadUnclippedStart - contextSize > windowHeaderStartLocation) { - markSites(incomingReadUnclippedStart); - int readStartHeaderIndex = incomingReadUnclippedStart - windowHeaderStartLocation; - int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive) - - regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose); - } - - while (!readsInWindow.isEmpty() && readsInWindow.peek().getSoftEnd() < windowHeaderStartLocation) { - readsInWindow.poll(); - } - - return regions; - } - - - protected final class MarkedSites { - - private boolean[] siteIsVariant = new boolean[0]; - private int startLocation = 0; - - public MarkedSites() {} - - public boolean[] getVariantSiteBitSet() { return siteIsVariant; } - - protected int getStartLocation() { return startLocation; } - - /** - * Updates the variant site bitset given the new startlocation and size of the region to mark. - * - * @param newStartLocation the new start location of the bitset - * @param sizeOfRegion the new size of the region to be represented - * - * @return the end position (newStartLocation + index) of the region marked by this method; the calling method is responsible for the remainder. 
- */ - public int updateRegion(final int newStartLocation, final int sizeOfRegion) { - int lastPositionMarked = sizeOfRegion; - - // if this is the first time we set the array and we can't reuse anything, just create a new array from scratch - if ( newStartLocation >= this.startLocation + siteIsVariant.length || newStartLocation < this.startLocation ) { - siteIsVariant = new boolean[sizeOfRegion]; - lastPositionMarked = 0; - } - // if the dimensions change, copy what we can and continue - else if ( newStartLocation != this.startLocation || sizeOfRegion != siteIsVariant.length ) { - final boolean[] tempArray = new boolean[sizeOfRegion]; - final int differenceInStartPositions = newStartLocation - this.startLocation; - lastPositionMarked = Math.min(siteIsVariant.length - differenceInStartPositions, sizeOfRegion); - System.arraycopy(siteIsVariant, differenceInStartPositions, tempArray, 0, lastPositionMarked); - siteIsVariant = null; // explicitly allow garbage collection - siteIsVariant = tempArray; - } - - this.startLocation = newStartLocation; - - return lastPositionMarked + newStartLocation; - } - } - - private final MarkedSites markedSites = new MarkedSites(); - - /** - * returns the MarkedSites object so that it can be tested after adding data to the Sliding Window - * - * @return the Marked Sites object used by this Sliding Window - */ - protected MarkedSites getMarkedSitesForTesting() { return markedSites; } - - /** - * returns an array marked with variant and non-variant regions (it uses markVariantRegion to make the marks) - * - * @param stop check the window from start to stop (not-inclusive); given in global coordinates - */ - protected void markSites(final int stop) { - - final int windowHeaderStartLocation = getStartLocation(windowHeader); - final int sizeOfMarkedRegion = stop - windowHeaderStartLocation + contextSize + 1; - - // copy over as many bits as we can from the previous calculation. 
Note that we can't trust the - // last (contextSize - 1) worth of bits because we may not have actually looked at variant regions there. - final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion) - contextSize - 1; - final int locationToProcess = Math.max(windowHeaderStartLocation, Math.min(lastPositionMarked, stop - contextSize)); - - final ListIterator headerElementIterator = windowHeader.listIterator(locationToProcess - windowHeaderStartLocation); - - // process a contextSize worth of region from scratch in case there's a variant there - for (int i = locationToProcess; i < stop; i++) { - if (headerElementIterator.hasNext()) { - HeaderElement headerElement = headerElementIterator.next(); - - if (headerElement.isVariant(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT)) - markVariantRegion(i - windowHeaderStartLocation); - - } else - break; - } - } - - /** - * Marks the sites around the variant site (as true) - * - * @param variantSiteLocation the location where a variant site was found - */ - protected void markVariantRegion(final int variantSiteLocation) { - int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize; - int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? 
markedSites.getVariantSiteBitSet().length - 1 : variantSiteLocation + contextSize; - markRegionAs(from, to, true); - } - - /** - * Marks the sites around the variant site (as true) - * - * @param from the start index (inclusive) to mark - * @param to the end index (inclusive) to mark - * @param isVariant mark the region with this boolean value - */ - private void markRegionAs(final int from, final int to, final boolean isVariant) { - for (int i = from; i <= to; i++) - markedSites.getVariantSiteBitSet()[i] = isVariant; - } - - /** - * Adds bases to the running consensus - * - * If adding a sequence with gaps, it will finalize multiple consensus reads and keep the last running consensus - * - * @param header the header to use - * @param start the first header index to add to consensus - * @param end the first header index NOT TO add to consensus - * @param consensusType the consensus type to use - * @return a non-null list of consensus reads generated by this call. Empty list if no consensus was generated. - */ - @Requires({"start >= 0 && (end >= start || end == 0)"}) - @Ensures("result != null") - protected ObjectArrayList addToSyntheticReads(final LinkedList header, final int start, final int end, final ConsensusType consensusType) { - final ObjectArrayList reads = new ObjectArrayList<>(); - - SyntheticRead consensus = null; - final ListIterator headerElementIterator = header.listIterator(start); - boolean wasInConsensus = false; - - for ( int currentPosition = start; currentPosition < end; currentPosition++ ) { - - if ( ! 
headerElementIterator.hasNext() ) - throw new IllegalStateException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, windowHeader.size(), end)); - final HeaderElement headerElement = headerElementIterator.next(); - - if ( headerElement.hasConsensusData(consensusType) ) { - wasInConsensus = true; - - // add to running consensus - if ( consensus == null ) - consensus = createNewConsensus(consensusType, headerElement.getLocation()); - - genericAddBaseToConsensus(consensus, headerElement.getBaseCounts(consensusType)); - - } else { - - // add any outstanding consensus data - if ( wasInConsensus ) { - reads.addAll(finalizeAndAdd(consensus, consensusType)); - consensus = null; - } - - wasInConsensus = false; - } - } - - // add any outstanding consensus data - reads.addAll(finalizeAndAdd(consensus, consensusType)); - - return reads; - } - - private SyntheticRead createNewConsensus(final ConsensusType consensusType, final int start) { - if ( consensusType == ConsensusType.FILTERED ) - return new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, start, hasIndelQualities, SyntheticRead.StrandType.STRANDLESS); - return new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, start, hasIndelQualities, consensusType == ConsensusType.POSITIVE_CONSENSUS ? SyntheticRead.StrandType.POSITIVE : SyntheticRead.StrandType.NEGATIVE); - } - - /** - * Finalizes a synthetic read. 
- * - * @param consensus the consensus to finalize - * @param type the synthetic reads you want to close - * @return a possibly empty list of GATKSAMRecords generated by finalizing the synthetic reads - */ - private ObjectArrayList finalizeAndAdd(final SyntheticRead consensus, final ConsensusType type) { - - final ObjectArrayList list = new ObjectArrayList<>(); - - final GATKSAMRecord read; - if ( type == ConsensusType.FILTERED ) - read = finalizeFilteredDataConsensus(consensus); - else - read = finalizeRunningConsensus(consensus); - - if ( read != null ) - list.add(read); - - return list; - } - - /** - * Generic accessor to add base and qualities to a synthetic read - * - * @param syntheticRead the synthetic read to add to - * @param baseCounts the base counts object in the header element - */ - private void genericAddBaseToConsensus(final SyntheticRead syntheticRead, final BaseAndQualsCounts baseCounts) { - final BaseIndex base = baseCounts.baseIndexWithMostProbability(); - final int count = baseCounts.countOfBase(base); - final byte qual = baseCounts.averageQualsOfBase(base); - final byte insQual = baseCounts.averageInsertionQualsOfBase(base); - final byte delQual = baseCounts.averageDeletionQualsOfBase(base); - syntheticRead.add(base, count, qual, insQual, delQual, baseCounts.getRMS()); - } - - /** - * Method to compress a variant region and return the associated reduced reads - * - * @param start the first window header index in the variant region (inclusive) - * @param stop the last window header index of the variant region (inclusive) - * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere) - * @return a non-null object representing all reads contained in the variant region - */ - @Requires({"start >= 0 && (stop >= start || stop == 0)"}) - @Ensures("result != null") - protected CloseVariantRegionResult compressVariantRegion(final int start, 
final int stop, final ObjectSortedSet knownSnpPositions) { - final CloseVariantRegionResult allReads = new CloseVariantRegionResult(stop); - - // Try to compress into a polyploid consensus - // Optimization: don't bother if there are no known SNPs here - final int hetRefPosition = (knownSnpPositions != null && knownSnpPositions.isEmpty()) ? -1 : findSinglePolyploidCompressiblePosition(start, stop); - - // Note that using the hetRefPosition protects us from trying to compress variant regions that are created by - // insertions (which we don't want because we can't confirm that they represent the same allele). - // Also, we only allow polyploid consensus creation at known sites if provided. - if ( hetRefPosition != -1 && matchesKnownPosition(windowHeader.get(hetRefPosition).getLocation(), knownSnpPositions) ) { - // try to create the polyploid consensus - allReads.reads.addAll(createPolyploidConsensus(hetRefPosition)); - allReads.stopPerformed = hetRefPosition; // we stopped at the het position - } - // if we can't create a polyploid consensus here, return all reads that overlap the variant region and remove them - // from the window header entirely; also remove all reads preceding the variant region (since they will be output - // as consensus right after compression) - else { - final int refStart = windowHeader.get(start).getLocation(); - final int refStop = windowHeader.get(stop).getLocation(); - - final ObjectList toRemoveFromWindow = new ObjectArrayList<>(); - final ObjectList toEmit = new ObjectArrayList<>(); - for ( final GATKSAMRecord read : readsInWindow ) { - if ( read.getSoftStart() <= refStop ) { - if ( read.getAlignmentEnd() >= refStart ) { - toEmit.add(read); - removeFromHeader(windowHeader, read); - } - toRemoveFromWindow.add(read); - } - } - - // remove all used reads - for ( final GATKSAMRecord read : toRemoveFromWindow ) - readsInWindow.remove(read); - - // down-sample the unreduced reads if needed - allReads.reads.addAll(downsampleCoverage > 0 ? 
downsampleVariantRegion(toEmit) : toEmit); - } - - return allReads; - } - - /** - * Determines whether the given position match one of the known sites - * - * @param targetPosition the position of the het site - * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere) - * @return true if the targetPosition matches a known SNP position, false otherwise - */ - @Requires({"targetPosition >= 1 && knownSnpPositions != null"}) - protected boolean matchesKnownPosition(final int targetPosition, final ObjectSortedSet knownSnpPositions) { - final GenomeLoc targetLoc = new UnvalidatingGenomeLoc(contig, contigIndex, targetPosition, targetPosition); - return knownSnpPositions == null || knownSnpPositions.contains(targetLoc); - } - - /* - * Finds the het variant position located within start and stop (inclusive) if one exists. - * - * @param start the first header index in the region to check (inclusive) - * @param stop the last header index of the region to check (inclusive) - * @return the window header index of the single het position or -1 if either none or more than one exists - */ - @Requires("start >= 0 && (stop >= start || stop == 0)") - protected int findSinglePolyploidCompressiblePosition(final int start, final int stop) { - int hetRefPosition = -1; - - for ( int i = start; i <= stop; i++ ) { - - final int nAlleles = windowHeader.get(i).getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT); - - // we will only work on diploid non-indel cases because we just don't want to handle/test other scenarios - if ( nAlleles > 2 || nAlleles == -1 ) - return -1; - - if ( nAlleles == 2 ) { - - // make sure that there is only 1 site in the region that contains more than one allele - if ( hetRefPosition != -1 ) - return -1; - - hetRefPosition = i; - } - } - - return hetRefPosition; - } - - /* - * Checks whether there's a position 
in the header with a significant number of softclips or a variant. - * - * @param header the window header to examine - * @param positionToSkip the global position to skip in the examination (use negative number if you don't want to make use of this argument) - * @return true if there exists a position with significant softclips, false otherwise - */ - @Requires("header != null") - protected boolean hasPositionWithSignificantSoftclipsOrVariant(final List header, final int positionToSkip) { - - for ( final HeaderElement headerElement : header ) { - - if ( headerElement.getLocation() == positionToSkip ) - continue; - - if ( headerElement.hasSignificantSoftclips(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) || - headerElement.getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) != 1 ) - return true; - } - - return false; - } - - /** - * Finalizes a variant region, any adjacent synthetic reads. - * - * @param start the first window header index in the variant region (inclusive) - * @param stop the last window header index of the variant region (inclusive) - * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere) - * @return a non-null object representing all reads contained in the variant region plus any adjacent synthetic reads - */ - @Requires({"start >= 0 && (stop >= start || stop == 0)"}) - @Ensures("result != null") - protected CloseVariantRegionResult closeVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) { - final CloseVariantRegionResult allReads = compressVariantRegion(start, stop, knownSnpPositions); - allReads.reads.addAll(addAllSyntheticReadTypes(0, allReads.stopPerformed + 1)); - return allReads; - } - - /** - * Adds reads for all possible strands (positive, negative, filtered) from the global windowHeader object - * - * @param start 
the start position (inclusive) - * @param end the end position (exclusive) - * @return non-null but possibly empty array list with reduced reads - */ - private ObjectArrayList addAllSyntheticReadTypes(final int start, final int end) { - final ObjectArrayList reads = new ObjectArrayList<>(); - reads.addAll(addToSyntheticReads(windowHeader, start, end, ConsensusType.POSITIVE_CONSENSUS)); - reads.addAll(addToSyntheticReads(windowHeader, start, end, ConsensusType.NEGATIVE_CONSENSUS)); - reads.addAll(addToSyntheticReads(windowHeader, start, end, ConsensusType.FILTERED)); - return reads; - } - - /* - * @see #closeVariantRegions(CompressionStash, ObjectSortedSet, boolean) with forceCloseFullRegions set to false - */ - public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) { - return closeVariantRegions(regions, knownSnpPositions, false); - } - - private static final class CloseVariantRegionResult { - final private ObjectList reads = new ObjectArrayList<>(); - private int stopPerformed; - - public CloseVariantRegionResult(final int stopPerformed) { this.stopPerformed = stopPerformed; } - } - - /* - * Finalizes the list of regions requested (and any regions preceding them) - * - * @param regions the list of regions to finalize - * @param knownSnpPositions the set of known SNP positions; can be null (to allow polyploid consensus anywhere) - * @param forceCloseFullRegions if true, requires this method to make sure all regions are fully closed; otherwise, we may decide not to close up to the very end (e.g. 
during het compression) - * @return a non-null set of reduced reads representing the finalized regions - */ - public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions, final boolean forceCloseFullRegions) { - final ObjectAVLTreeSet allReads = new ObjectAVLTreeSet<>(new AlignmentStartWithNoTiesComparator()); - if ( !regions.isEmpty() ) { - - int windowHeaderStart = getStartLocation(windowHeader); - HeaderElement lastCleanedElement = null; - - for ( final GenomeLoc region : regions ) { - if (((FinishedGenomeLoc)region).isFinished() && region.getContig().equals(contig) && region.getStart() >= windowHeaderStart && region.getStop() < windowHeaderStart + windowHeader.size()) { - final int start = region.getStart() - windowHeaderStart; - int stop = region.getStop() - windowHeaderStart; - - // make sure the bitset is complete given the region (it might not be in multi-sample mode) - if ( region.getStop() > markedSites.getStartLocation() + markedSites.getVariantSiteBitSet().length - 1 ) - markSites(region.getStop()); - - CloseVariantRegionResult closeVariantRegionResult = closeVariantRegion(start, stop, knownSnpPositions); - allReads.addAll(closeVariantRegionResult.reads); - - // check whether we didn't close the whole region that was requested - if ( stop > 0 && closeVariantRegionResult.stopPerformed < stop ) { - // we should update the variant sites bitset because the context size's worth of bases after the variant position are no longer "variant" - markRegionAs(closeVariantRegionResult.stopPerformed + 1, stop, false); - - // if the calling method said that it didn't care then we are okay so update the stop - if ( !forceCloseFullRegions ) { - stop = closeVariantRegionResult.stopPerformed; - } - // otherwise, we need to forcibly push the stop that we originally requested - else { - while ( closeVariantRegionResult.stopPerformed < stop ) { - // first clean up used header elements so they don't get reused - for ( int i = 0; 
i <= closeVariantRegionResult.stopPerformed; i++ ) - windowHeader.remove(); - stop -= (closeVariantRegionResult.stopPerformed + 1); - - closeVariantRegionResult = closeVariantRegion(0, stop, knownSnpPositions); - allReads.addAll(closeVariantRegionResult.reads); - } - } - } - - // We need to clean up the window header elements up until the end of the requested region so that they don't get used for future regions. - // Note that this cleanup used to happen outside the above for-loop, but that was causing an occasional doubling of the reduced reads - // (in the case where there are multiple regions to close we'd reuse the reads for each region). - if ( stop >= 0 ) { - for ( int i = 0; i < stop; i++ ) - windowHeader.remove(); - lastCleanedElement = windowHeader.remove(); - windowHeaderStart = getStartLocation(windowHeader); - } - } - } - - // we need to keep the last element of the last cleaned region in the event that the following element has a read that starts with an insertion. - if ( lastCleanedElement != null && lastCleanedElement.hasInsertionToTheRight() ) - windowHeader.addFirst(new HeaderElement(lastCleanedElement.getLocation(), lastCleanedElement.numInsertionsToTheRight())); - } - - return allReads; - } - - /** - * Downsamples a variant region to the downsample coverage of the sliding window. 
- * - * It will use the downsampling strategy defined by the SlidingWindow - * - * @param allReads a non-null list of reads to select from (all reads that cover the window) - * @return a non-null list of reads selected by the downsampler to cover the window to at least the desired coverage - */ - @Requires({"allReads != null"}) - @Ensures("result != null") - protected ObjectList downsampleVariantRegion(final ObjectList allReads) { - int nReads = allReads.size(); - if (nReads == 0) - return allReads; - - if (downsampleCoverage >= nReads) - return allReads; - - ReservoirDownsampler downsampler = new ReservoirDownsampler<>(downsampleCoverage); - downsampler.submit(allReads); - return new ObjectArrayList<>(downsampler.consumeFinalizedItems()); - } - - - /** - * Properly closes a Sliding Window, finalizing all consensus and variant - * regions that still exist regardless of being able to fulfill the - * context size requirement in the end. - * - * @param knownSnpPositions the set of known SNP positions; can be null (to allow polyploid consensus anywhere) - * @return A non-null set/list of all reads generated - */ - @Ensures("result != null") - public Pair, CompressionStash> close(final ObjectSortedSet knownSnpPositions) { - // mark variant regions - ObjectSet finalizedReads = new ObjectAVLTreeSet<>(new AlignmentStartWithNoTiesComparator()); - CompressionStash regions = new CompressionStash(); - - if (!windowHeader.isEmpty()) { - markSites(getStopLocation(windowHeader) + 1); - regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), true); - finalizedReads = closeVariantRegions(regions, knownSnpPositions, true); - - if (!windowHeader.isEmpty()) - finalizedReads.addAll(addAllSyntheticReadTypes(0, windowHeader.size())); - } - - return new Pair<>(finalizedReads, regions); - } - - /** - * generates the SAM record for the running consensus read and resets it (to null) - * - * @param runningConsensus the consensus to finalize - * @return the 
read contained in the running consensus or null - */ - protected GATKSAMRecord finalizeRunningConsensus(final SyntheticRead runningConsensus) { - GATKSAMRecord finalizedRead = null; - - if ( runningConsensus != null ) { - if ( runningConsensus.size() > 0 ) - finalizedRead = runningConsensus.close(); - else - consensusCounter--; - } - - return finalizedRead; - } - - /** - * generates the SAM record for the filtered data consensus and resets it (to null) - * - * @param filteredDataConsensus the consensus to finalize - * @return the read contained in the running consensus or null - */ - protected GATKSAMRecord finalizeFilteredDataConsensus(final SyntheticRead filteredDataConsensus) { - GATKSAMRecord finalizedRead = null; - if (filteredDataConsensus != null) { - if (filteredDataConsensus.size() > 0) - finalizedRead = filteredDataConsensus.close(); - else - filteredDataConsensusCounter--; - } - return finalizedRead; - } - - // define this so that we can use Java generics below - private final static class HeaderElementList extends LinkedList {} - - private final static class SingleStrandConsensusData { - final HeaderElementList consensus = new HeaderElementList(); - final ObjectList reads = new ObjectArrayList<>(); - } - - /** - * Finalizes a variant region - and any adjacent synthetic reads - for point mutations (indel sites are not - * supported) with polyploid compression. - * - * @param hetRefPosition window header index of the het site; MUST NOT BE AN INDEL SITE! 
- * @return a non-null list of all reads contained in the variant region as a polyploid consensus - */ - @Requires({"start >= 0 && (stop >= start || stop == 0)"}) - @Ensures({"result != null"}) - protected ObjectList createPolyploidConsensus(final int hetRefPosition) { - // we will create two (positive strand, negative strand) headers for each haplotype - final SingleStrandConsensusData[] headersPosStrand = new SingleStrandConsensusData[2]; - final SingleStrandConsensusData[] headersNegStrand = new SingleStrandConsensusData[2]; - - final int globalHetRefPosition = windowHeader.get(hetRefPosition).getLocation(); - - // initialize the mapping from base (allele) to header - final Byte2IntMap alleleHeaderMap = new Byte2IntArrayMap(2); - alleleHeaderMap.defaultReturnValue(-1); - for ( final BaseIndex allele : windowHeader.get(hetRefPosition).getAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) ) { - final int currentIndex = alleleHeaderMap.size(); - if ( currentIndex > 1 ) - throw new IllegalStateException("There are more than 2 alleles present when creating a diploid consensus"); - - alleleHeaderMap.put(allele.b, currentIndex); - headersPosStrand[currentIndex] = new SingleStrandConsensusData(); - headersNegStrand[currentIndex] = new SingleStrandConsensusData(); - } - - // sanity check that we saw 2 alleles - if ( alleleHeaderMap.size() != 2 ) - throw new IllegalStateException("We expected to see 2 alleles when creating a diploid consensus but saw " + alleleHeaderMap.size()); - - final ObjectList readsToRemove = new ObjectArrayList<>(); - - for ( final GATKSAMRecord read : readsInWindow ) { - - // if the read falls after the het position, just skip it for now (we'll get to it later) - if ( read.getSoftStart() > globalHetRefPosition ) - continue; - - // remove all other reads from the read cache since we're going to use them here - readsToRemove.add(read); - - // if the read falls before the het position or has low MQ, we don't need to 
look at it - if ( read.getSoftEnd() < globalHetRefPosition || read.getMappingQuality() < MIN_MAPPING_QUALITY) - continue; - - // remove all spanning reads from the consensus header since we're going to incorporate them into a consensus here instead - removeFromHeader(windowHeader, read); - - // where on the read is the het position? - final int readPosOfHet = ReadUtils.getReadCoordinateForReferenceCoordinate(read, globalHetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL); - - // this is safe because indels are not supported - final byte base = read.getReadBases()[readPosOfHet]; - - // check which allele this read represents - final int allele = alleleHeaderMap.get(base); - - // ignore the read if it represents a base that's not part of the consensus - if ( allele != -1 ) { - // add to the appropriate polyploid header - final SingleStrandConsensusData header = read.getReadNegativeStrandFlag() ? headersNegStrand[allele] : headersPosStrand[allele]; - header.reads.add(read); - addToHeader(header.consensus, read); - } - } - - for ( final GATKSAMRecord read : readsToRemove ) - readsInWindow.remove(read); - - // create the polyploid synthetic reads if we can - final ObjectList hetReads = new ObjectArrayList<>(); - - // sanity check that no new "variant region" exists on just a single consensus strand due to softclips - // or multi-allelic sites now that we've broken everything out into their component parts. if one does - // exist then we need to back out the consensus for that strand only. 
- for ( final SingleStrandConsensusData header : headersPosStrand ) { - if ( hasPositionWithSignificantSoftclipsOrVariant(header.consensus, globalHetRefPosition) ) - hetReads.addAll(header.reads); - else - finalizeHetConsensus(header.consensus, false, hetReads); - } - for ( final SingleStrandConsensusData header : headersNegStrand ) { - if ( hasPositionWithSignificantSoftclipsOrVariant(header.consensus, globalHetRefPosition) ) - hetReads.addAll(header.reads); - else - finalizeHetConsensus(header.consensus, true, hetReads); - } - - return hetReads; - } - - /* - * Finalizes a particular het consensus for the given header representation - * - * @param header the list of header elements representing the header for the consensus - * @param isNegativeStrand does this header represent reads on the negative strand? - * @param result list in which to store results - */ - protected void finalizeHetConsensus(final LinkedList header, final boolean isNegativeStrand, final ObjectList result) { - if ( header.size() > 0 ) { - if ( isNegativeStrand ) - result.addAll(addToSyntheticReads(header, 0, header.size(), ConsensusType.NEGATIVE_CONSENSUS)); - else - result.addAll(addToSyntheticReads(header, 0, header.size(), ConsensusType.POSITIVE_CONSENSUS)); - } - } - - private void addToHeader(LinkedList header, GATKSAMRecord read) { - updateHeaderCounts(header, read, false); - } - - private void removeFromHeader(LinkedList header, GATKSAMRecord read) { - updateHeaderCounts(header, read, true); - } - - /** - * Updates the sliding window's header counts with the incoming read bases, insertions - * and deletions. 
- * - * @param header the sliding window header to use - * @param read the incoming read to be added to the sliding window - * @param removeRead if we are removing the read from the header or adding - */ - protected void updateHeaderCounts(final LinkedList header, final GATKSAMRecord read, final boolean removeRead) { - final int readStart = read.getSoftStart(); - final int headerStart = getStartLocation(header); - int locationIndex = headerStart < 0 ? 0 : readStart - headerStart; - - if ( removeRead && locationIndex < 0 ) - throw new IllegalStateException("Provided read is behind the Sliding Window! Read = " + read + ", readStart = " + readStart + ", cigar = " + read.getCigarString() + ", window = " + headerStart + "-" + getStopLocation(header)); - - // we only need to create new header elements if we are adding the read, not when we're removing it - if ( !removeRead ) - locationIndex = createNewHeaderElements(header, read, locationIndex); - - actuallyUpdateHeaderForRead(header, read, removeRead, locationIndex); - } - - /* - * Creates new header elements if needed for the given read. - * - * @param header the sliding window header to use - * @param read the incoming read to be added to the sliding window - * @param startIndex the start location index into the header for this read - * - * @return an updated index into the modified header - */ - @Requires("header != null && read != null") - protected int createNewHeaderElements(final LinkedList header, final GATKSAMRecord read, final int startIndex) { - - int headerStart = getStartLocation(header); - int locationIndex = startIndex; - - // Do we need to add extra elements before the start of the header? 
This could happen if the previous read was - // clipped and this alignment starts before the beginning of the window - final int readStart = read.getSoftStart(); - if ( startIndex < 0 ) { - for ( int i = 1; i <= -startIndex; i++ ) - header.addFirst(new HeaderElement(headerStart - i)); - - // update the start location accordingly - headerStart = readStart; - locationIndex = 0; - } - - // Do we need to add extra elements to the end of the header? - final int headerStop = getStopLocation(header); - final int readEnd = read.getSoftEnd(); - if ( headerStop < readEnd ) { - final int elementsToAdd = (headerStop < 0) ? readEnd - readStart + 1 : readEnd - headerStop; - for ( int i = elementsToAdd - 1; i >= 0; i-- ) - header.addLast(new HeaderElement(readEnd - i)); - } - - // Special case for leading insertions before the beginning of the sliding read - if ( (readStart == headerStart || headerStart < 0) && ReadUtils.readStartsWithInsertion(read.getCigar(), false) != null ) { - // create a new first element to the window header with no bases added - header.addFirst(new HeaderElement(readStart - 1)); - // this allows the first element (I) to look at locationIndex - 1 when we update the header and do the right thing - locationIndex = 1; - } - - return locationIndex; - } - - /* - * Actually updates the sliding window's header counts with the incoming read bases and quals (including insertion and deletion quals). 
- * - * @param header the sliding window header to use - * @param read the incoming read to be added to the sliding window - * @param removeRead if we are removing the read from the header or adding - * @param startIndex the start location index into the header for this read - */ - @Requires("header != null && read != null && startIndex >= 0") - protected void actuallyUpdateHeaderForRead(final LinkedList header, final GATKSAMRecord read, final boolean removeRead, final int startIndex) { - - final Iterator headerElementIterator = header.listIterator(startIndex); - final int mappingQuality = read.getMappingQuality(); - final boolean isNegativeStrand = read.getReadNegativeStrandFlag(); - - // iterator variables - int locationIndex = startIndex; - int readBaseIndex = 0; - HeaderElement headerElement; - - for ( final CigarElement cigarElement : read.getCigar().getCigarElements() ) { - switch ( cigarElement.getOperator() ) { - case H: - break; - case I: - readBaseIndex += cigarElement.getLength(); - - // special case, if we don't have the previous header element anymore, don't worry about it. - if ( locationIndex == 0 ) - break; - - // insertions are added to the base to the left (previous element) - headerElement = header.get(locationIndex - 1); - - if ( removeRead ) - headerElement.removeInsertionToTheRight(); - else - headerElement.addInsertionToTheRight(); - - break; - case D: - // deletions are added to the baseCounts with the read mapping quality as its quality score - final int nDeletionBases = cigarElement.getLength(); - final byte MQbyte = mappingQuality > Byte.MAX_VALUE ? 
Byte.MAX_VALUE : (byte)mappingQuality; - for ( int i = 0; i < nDeletionBases; i++ ) { - headerElement = headerElementIterator.next(); - if (removeRead) - headerElement.removeBase(BaseUtils.Base.D.base, MQbyte, MQbyte, MQbyte, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false, isNegativeStrand); - else - headerElement.addBase(BaseUtils.Base.D.base, MQbyte, MQbyte, MQbyte, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false, isNegativeStrand); - } - locationIndex += nDeletionBases; - break; - case S: - case M: - case P: - case EQ: - case X: - final int nBasesToAdd = cigarElement.getLength(); - final boolean isSoftClip = cigarElement.getOperator() == CigarOperator.S; - final byte[] readBases = read.getReadBases(); - final byte[] readQuals = read.getBaseQualities(); - final boolean readHasIndelQuals = read.hasBaseIndelQualities(); - final byte[] insertionQuals = readHasIndelQuals ? read.getBaseInsertionQualities() : null; - final byte[] deletionQuals = readHasIndelQuals ? read.getBaseDeletionQualities() : null; - - for ( int i = 0; i < nBasesToAdd; i++ ) { - headerElement = headerElementIterator.next(); - final byte insertionQuality = readHasIndelQuals ? insertionQuals[readBaseIndex] : -1; - final byte deletionQuality = readHasIndelQuals ? 
deletionQuals[readBaseIndex] : -1; - - if ( removeRead ) - headerElement.removeBase(readBases[readBaseIndex], readQuals[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip, isNegativeStrand); - else - headerElement.addBase(readBases[readBaseIndex], readQuals[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip, isNegativeStrand); - - readBaseIndex++; - } - locationIndex += nBasesToAdd; - break; - default: - break; - } - } - } -} - diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java deleted file mode 100644 index 9d16ea06f..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java +++ /dev/null @@ -1,369 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import com.google.java.contract.Requires; -import it.unimi.dsi.fastutil.objects.ObjectArrayList; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.recalibration.EventType; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.Iterator; - - -/** - * Running Consensus is a read that is compressed as a sliding window travels over the reads - * and keeps track of all the bases that are outside of variant regions. - * - * Consensus reads have qual fields that correspond to the number of reads that had the base - * and passed the minimum quality threshold. - * - * The mapping quality of a consensus read is the average RMS of the mapping qualities of all reads - * that compose the consensus - * - * @author Mauricio Carneiro - * @since 8/26/11 - */ -public class SyntheticRead { - - /** - * The types of strandedness for synthetic reads - */ - public enum StrandType { - POSITIVE, - NEGATIVE, - STRANDLESS - } - - // Rather than storing a separate list for each attribute in SingleBaseInfo, store one list to reduce memory footprint. - private static class SingleBaseInfo { - byte baseIndexOrdinal; // enum BaseIndex.ordinal - int count; - byte qual; - byte insertionQual; - byte deletionQual; - - SingleBaseInfo(byte baseIndexOrdinal, int count, byte qual, byte insertionQual, byte deletionQual) { - this.baseIndexOrdinal = baseIndexOrdinal; - this.count = count; - this.qual = qual; - this.insertionQual = insertionQual; - this.deletionQual = deletionQual; - } - } - - // This class is merely sharing of code for convertVariableGivenBases(). 
- private abstract class SingleBaseInfoIterator implements Iterator { - final Iterator it; - - SingleBaseInfoIterator() { - this.it = basesCountsQuals.iterator(); - } - - public boolean hasNext() { - return it.hasNext(); - } - - public void remove() { - throw new UnsupportedOperationException(); - } - } - - - // Map from ordinal to enum value. - private static final BaseIndex[] BaseIndexByOrdinal = new BaseIndex[BaseIndex.values().length]; - static - { - for (final BaseIndex baseIndex : BaseIndex.values()) { - BaseIndexByOrdinal[baseIndex.ordinal()] = baseIndex; - } - } - - - private final ObjectArrayList basesCountsQuals; - private double mappingQuality; - - // Information to produce a GATKSAMRecord - private SAMFileHeader header; - private GATKSAMReadGroupRecord readGroupRecord; - private String contig; - private int contigIndex; - private String readName; - private int refStart; - private boolean hasIndelQualities = false; - private StrandType strandType = StrandType.STRANDLESS; - - /** - * Full initialization of the running consensus if you have all the information and are ready to - * start adding to the running consensus. 
- * - * @param header GATKSAMRecord file header - * @param readGroupRecord Read Group for the GATKSAMRecord - * @param contig the read's contig name - * @param contigIndex the read's contig index - * @param readName the read's name - * @param refStart the alignment start (reference based) - */ - public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, StrandType strandType) { - final int initialCapacity = 10000; - basesCountsQuals = new ObjectArrayList(initialCapacity); - mappingQuality = 0.0; - - this.header = header; - this.readGroupRecord = readGroupRecord; - this.contig = contig; - this.contigIndex = contigIndex; - this.readName = readName; - this.refStart = refStart; - this.hasIndelQualities = hasIndelQualities; - this.strandType = strandType; - } - - /** - * Easy access to keep adding to a running consensus that has already been - * initialized with the correct read name and refStart - * - * @param base the base to add - * @param count number of reads with this base - */ - @Requires("count <= Byte.MAX_VALUE") - public void add(BaseIndex base, int count, byte qual, byte insQual, byte delQual, double mappingQuality) { - basesCountsQuals.add(new SingleBaseInfo(base.getOrdinalByte(), count, qual, insQual, delQual)); - this.mappingQuality += mappingQuality; - } - - public BaseIndex getBase(final int readCoordinate) { - return BaseIndexByOrdinal[basesCountsQuals.get(readCoordinate).baseIndexOrdinal]; - } - - public int getRefStart() { - return refStart; - } - - /** - * Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid. 
- * - * Invalid reads are : - * - exclusively composed of deletions - * - * @return a GATKSAMRecord or null - */ - public GATKSAMRecord close () { - if (isAllDeletions()) - return null; - - GATKSAMRecord read = new GATKSAMRecord(header); - read.setReferenceName(contig); - read.setReferenceIndex(contigIndex); - read.setReadPairedFlag(false); - read.setReadUnmappedFlag(false); - if ( strandType != StrandType.STRANDLESS ) { - read.setAttribute(GATKSAMRecord.REDUCED_READ_STRANDED_TAG, '1'); // must come before next line - read.setReadNegativeStrandFlag(strandType == StrandType.NEGATIVE); - } - read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions) - read.setAlignmentStart(refStart); - read.setReadName(readName); - read.setBaseQualities(convertBaseQualities(), EventType.BASE_SUBSTITUTION); - read.setReadBases(convertReadBases()); - read.setMappingQuality((int) Math.ceil(mappingQuality / basesCountsQuals.size())); - read.setReadGroup(readGroupRecord); - read.setReducedReadCountsTag(convertBaseCounts()); - - if (hasIndelQualities) { - read.setBaseQualities(convertInsertionQualities(), EventType.BASE_INSERTION); - read.setBaseQualities(convertDeletionQualities(), EventType.BASE_DELETION); - } - - return read; - } - - /** - * Checks if the synthetic read is composed exclusively of deletions - * - * @return true if it is, false if it isn't. 
- */ - private boolean isAllDeletions() { - for (SingleBaseInfo b : basesCountsQuals) - if (b.baseIndexOrdinal != BaseIndex.D.getOrdinalByte()) - return false; - return true; - } - - public int size () { - return basesCountsQuals.size(); - } - - private byte [] convertBaseQualities() { - return convertVariableGivenBases(new SingleBaseInfoIterator() { - public Byte next() { - return it.next().qual; - } - }); - } - - private byte [] convertInsertionQualities() { - return convertVariableGivenBases(new SingleBaseInfoIterator() { - public Byte next() { - return it.next().insertionQual; - } - }); - } - - private byte [] convertDeletionQualities() { - return convertVariableGivenBases(new SingleBaseInfoIterator() { - public Byte next() { - return it.next().deletionQual; - } - }); - } - - protected int[] convertBaseCounts() { - int[] variableArray = new int[getReadLengthWithNoDeletions()]; - int i = 0; - for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) { - if (singleBaseInfo.baseIndexOrdinal != BaseIndex.D.getOrdinalByte()) - variableArray[i++] = singleBaseInfo.count; - } - return variableArray; - } - - private byte [] convertReadBases() { - byte [] readArray = new byte[getReadLengthWithNoDeletions()]; - int i = 0; - for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) { - final BaseIndex baseIndex = BaseIndexByOrdinal[singleBaseInfo.baseIndexOrdinal]; - if (baseIndex != BaseIndex.D) - readArray[i++] = baseIndex.getByte(); - } - - return readArray; - } - - /** - * Builds the cigar string for the synthetic read - * - * Warning: if the synthetic read has leading deletions, it will shift the refStart (alignment start) of the read. 
- * - * @return the cigar string for the synthetic read - */ - private Cigar buildCigar() { - ObjectArrayList cigarElements = new ObjectArrayList(); - CigarOperator cigarOperator = null; - int length = 0; - for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) { - final BaseIndex b = BaseIndexByOrdinal[singleBaseInfo.baseIndexOrdinal]; - CigarOperator op; - switch (b) { - case D: - op = CigarOperator.DELETION; - break; - case I: - throw new ReviewedStingException("Trying to create an insertion in a synthetic read. This operation is currently unsupported."); - default: - op = CigarOperator.MATCH_OR_MISMATCH; - break; - } - if (cigarOperator == null) { - if (op == CigarOperator.D) // read cannot start with a deletion - refStart++; // if it does, we need to move the reference start forward - else - cigarOperator = op; - } - else if (cigarOperator != op) { // if this is a new operator, we need to close the previous one - cigarElements.add(new CigarElement(length, cigarOperator)); // close previous operator - cigarOperator = op; - length = 0; - } - - if (cigarOperator != null) // only increment the length of the cigar element if we really added it to the read (no leading deletions) - length++; - } - if (length > 0 && cigarOperator != CigarOperator.D) // read cannot end with a deletion - cigarElements.add(new CigarElement(length, cigarOperator)); // add the last cigar element - - return new Cigar(cigarElements); - } - - /** - * Shared functionality for all conversion utilities - * - * @param variableIterator the list to convert - * @return a converted variable given the bases and skipping deletions - */ - - private byte [] convertVariableGivenBases (Iterator variableIterator) { - byte [] variableArray = new byte[getReadLengthWithNoDeletions()]; - int i = 0; - for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) { - byte count = variableIterator.next(); - if (singleBaseInfo.baseIndexOrdinal != BaseIndex.D.getOrdinalByte()) - variableArray[i++] = count; - } 
- return variableArray; - } - - /** - * Shared functionality for all conversion utilities - * - * @return the length of the read with no deletions - */ - private int getReadLengthWithNoDeletions() { - int readLength = basesCountsQuals.size(); - for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) - if (singleBaseInfo.baseIndexOrdinal == BaseIndex.D.getOrdinalByte()) - readLength--; - return readLength; - } - - -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java deleted file mode 100644 index 6f16a704f..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java +++ /dev/null @@ -1,292 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.variant.variantcontext.*; - -import java.util.*; - -/** - * Code for determining which indels are segregating among the samples. - * - * This code is just a refactor of the original code from Guillermo in the UG. 
- * - * @author Mark DePristo - * @since 3/26/12 - */ -public class ConsensusAlleleCounter { - final protected static Logger logger = Logger.getLogger(ConsensusAlleleCounter.class); - private final int minIndelCountForGenotyping; - private final boolean doMultiAllelicCalls; - private final double minFractionInOneSample; - - public ConsensusAlleleCounter(final boolean doMultiAllelicCalls, - final int minIndelCountForGenotyping, - final double minFractionInOneSample) { - this.minIndelCountForGenotyping = minIndelCountForGenotyping; - this.doMultiAllelicCalls = doMultiAllelicCalls; - this.minFractionInOneSample = minFractionInOneSample; - } - - /** - * Returns a list of Alleles at this locus that may be segregating - * - * @param ref - * @param contexts - * @param contextType - * @return - */ - public List computeConsensusAlleles(ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType) { - final Map consensusIndelStrings = countConsensusAlleles(ref, contexts, contextType); - return consensusCountsToAlleles(ref, consensusIndelStrings); - } - - // - // TODO -- WARNING DOESN'T WORK WITH REDUCED READS - // - private Map countConsensusAlleles(ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType) { - final GenomeLoc loc = ref.getLocus(); - HashMap consensusIndelStrings = new HashMap(); - - int insCount = 0, delCount = 0; - // quick check of total number of indels in pileup - for ( Map.Entry sample : contexts.entrySet() ) { - final AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); - - final ReadBackedPileup indelPileup = context.getBasePileup(); - insCount += indelPileup.getNumberOfInsertionsAfterThisElement(); - delCount += indelPileup.getNumberOfDeletionsAfterThisElement(); - } - - if ( insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping ) - return Collections.emptyMap(); - - for (Map.Entry sample : contexts.entrySet()) { - // todo -- 
warning, can be duplicating expensive partition here - AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); - - final ReadBackedPileup indelPileup = context.getBasePileup(); - - final int nIndelReads = indelPileup.getNumberOfInsertionsAfterThisElement() + indelPileup.getNumberOfDeletionsAfterThisElement(); - final int nReadsOverall = indelPileup.getNumberOfElements(); - - if ( nIndelReads == 0 || (nIndelReads / (1.0 * nReadsOverall)) < minFractionInOneSample ) { - continue; - } - - for (PileupElement p : indelPileup) { - final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); - if (read == null) - continue; - if (ReadUtils.is454Read(read)) { - continue; - } - - if ( p.isBeforeInsertion() ) { - final String insertionBases = p.getBasesOfImmediatelyFollowingInsertion(); - // edge case: ignore a deletion immediately preceding an insertion as p.getBasesOfImmediatelyFollowingInsertion() returns null [EB] - if ( insertionBases == null ) - continue; - - boolean foundKey = false; - // copy of hashmap into temp arrayList - ArrayList> cList = new ArrayList>(); - for (Map.Entry s : consensusIndelStrings.entrySet()) { - cList.add(new Pair(s.getKey(), s.getValue())); - } - - if (read.getAlignmentEnd() == loc.getStart()) { - // first corner condition: a read has an insertion at the end, and we're right at the insertion. - // In this case, the read could have any of the inserted bases and we need to build a consensus - - for (int k=0; k < cList.size(); k++) { - String s = cList.get(k).getFirst(); - int cnt = cList.get(k).getSecond(); - // case 1: current insertion is prefix of indel in hash map - if (s.startsWith(insertionBases)) { - cList.set(k,new Pair(s,cnt+1)); - foundKey = true; - } - else if (insertionBases.startsWith(s)) { - // case 2: indel stored in hash table is prefix of current insertion - // In this case, new bases are new key. 
- foundKey = true; - cList.set(k,new Pair(insertionBases,cnt+1)); - } - } - if (!foundKey) - // none of the above: event bases not supported by previous table, so add new key - cList.add(new Pair(insertionBases,1)); - - } - else if (read.getAlignmentStart() == loc.getStart()+1) { - // opposite corner condition: read will start at current locus with an insertion - for (int k=0; k < cList.size(); k++) { - String s = cList.get(k).getFirst(); - int cnt = cList.get(k).getSecond(); - if (s.endsWith(insertionBases)) { - // case 1: current insertion (indelString) is suffix of indel in hash map (s) - cList.set(k,new Pair(s,cnt+1)); - foundKey = true; - } - else if (insertionBases.endsWith(s)) { - // case 2: indel stored in hash table is prefix of current insertion - // In this case, new bases are new key. - foundKey = true; - cList.set(k,new Pair(insertionBases,cnt+1)); - } - } - if (!foundKey) - // none of the above: event bases not supported by previous table, so add new key - cList.add(new Pair(insertionBases,1)); - - - } - else { - // normal case: insertion somewhere in the middle of a read: add count to arrayList - int cnt = consensusIndelStrings.containsKey(insertionBases)? consensusIndelStrings.get(insertionBases):0; - cList.add(new Pair(insertionBases,cnt+1)); - } - - // copy back arrayList into hashMap - consensusIndelStrings.clear(); - for (Pair pair : cList) { - consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); - } - - } - else if ( p.isBeforeDeletionStart() ) { - final String deletionString = String.format("D%d",p.getLengthOfImmediatelyFollowingIndel()); - int cnt = consensusIndelStrings.containsKey(deletionString)? 
consensusIndelStrings.get(deletionString):0; - consensusIndelStrings.put(deletionString,cnt+1); - } - } - } - - return consensusIndelStrings; - } - - private List consensusCountsToAlleles(final ReferenceContext ref, - final Map consensusIndelStrings) { - final GenomeLoc loc = ref.getLocus(); - final Collection vcs = new ArrayList(); - int maxAlleleCnt = 0; - Allele refAllele, altAllele; - - for (final Map.Entry elt : consensusIndelStrings.entrySet()) { - final String s = elt.getKey(); - final int curCnt = elt.getValue(); - int stop = 0; - - // if observed count if above minimum threshold, we will genotype this allele - if (curCnt < minIndelCountForGenotyping) - continue; - - if (s.startsWith("D")) { - // get deletion length - final int dLen = Integer.valueOf(s.substring(1)); - // get ref bases of accurate deletion - final int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart(); - stop = loc.getStart() + dLen; - final byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference - 1, startIdxInReference + dLen); // add reference padding - - if (Allele.acceptableAlleleBases(refBases, false)) { - refAllele = Allele.create(refBases, true); - altAllele = Allele.create(ref.getBase(), false); - } - else continue; // don't go on with this allele if refBases are non-standard - } else { - // insertion case - final String insertionBases = (char)ref.getBase() + s; // add reference padding - if (Allele.acceptableAlleleBases(insertionBases, false)) { // don't allow N's in insertions - refAllele = Allele.create(ref.getBase(), true); - altAllele = Allele.create(insertionBases, false); - stop = loc.getStart(); - } - else continue; // go on to next allele if consensus insertion has any non-standard base. 
- } - - - final VariantContextBuilder builder = new VariantContextBuilder().source(""); - builder.loc(loc.getContig(), loc.getStart(), stop); - builder.alleles(Arrays.asList(refAllele, altAllele)); - builder.noGenotypes(); - if (doMultiAllelicCalls) { - vcs.add(builder.make()); - if (vcs.size() >= GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) - break; - } else if (curCnt > maxAlleleCnt) { - maxAlleleCnt = curCnt; - vcs.clear(); - vcs.add(builder.make()); - } - } - - if (vcs.isEmpty()) - return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion - - final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false, false); - return mergedVC.getAlleles(); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java deleted file mode 100644 index f3b26f295..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ /dev/null @@ -1,512 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import net.sf.samtools.SAMUtils; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.fragments.FragmentCollection; -import org.broadinstitute.sting.utils.fragments.FragmentUtils; -import org.broadinstitute.sting.utils.genotyper.DiploidGenotype; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.util.List; - -import static java.lang.Math.log10; -import static java.lang.Math.pow; - -/** - * Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors, - * and posteriors given a pile of bases and quality scores - * - * Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. 
This object - * calculates: - * - * P(G | D) = P(G) * P(D | G) - * - * where - * - * P(D | G) = sum_i log10 P(bi | G) - * - * and - * - * P(bi | G) = 1 - P(error | q1) if bi is in G - * = P(error | q1) / 3 if bi is not in G - * - * for homozygous genotypes and for heterozygous genotypes: - * - * P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G - * = P(error | q1) / 3 if bi is not in G - * - * for each of the 10 unique diploid genotypes AA, AC, AG, .., TT - * - * Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space. - * - * The priors contain the relative probabilities of each genotype, and must be provided at object creation. - * From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above - * model. - */ -public class DiploidSNPGenotypeLikelihoods implements Cloneable { - - public final static double DEFAULT_PCR_ERROR_RATE = FragmentUtils.DEFAULT_PCR_ERROR_RATE; - - protected final static int FIXED_PLOIDY = 2; - protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1; - protected final static double ploidyAdjustment = log10(FIXED_PLOIDY); - protected final static double log10_3 = log10(3.0); - - protected boolean VERBOSE = false; - - // - // The fundamental data arrays associated with a Genotype Likelihoods object - // - protected double[] log10Likelihoods = null; - - // TODO: don't calculate this each time through - protected double log10_PCR_error_3; - protected double log10_1_minus_PCR_error; - - /** - * Create a new GenotypeLikelhoods object with given PCR error rate for each diploid genotype - * - * @param PCR_error_rate the PCR error rate - */ - public DiploidSNPGenotypeLikelihoods(double PCR_error_rate) { - log10_PCR_error_3 = log10(PCR_error_rate) - log10_3; - log10_1_minus_PCR_error = log10(1.0 - PCR_error_rate); - setToZero(); - } - - /** - * Cloning of the object - * @return clone - * @throws CloneNotSupportedException - */ - protected Object clone() 
throws CloneNotSupportedException { - DiploidSNPGenotypeLikelihoods c = (DiploidSNPGenotypeLikelihoods)super.clone(); - c.log10Likelihoods = log10Likelihoods.clone(); - return c; - } - - protected void setToZero() { - log10Likelihoods = genotypeZeros.clone(); // likelihoods are all zeros - } - - /** - * Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values() - * @return likelihoods array - */ - public double[] getLikelihoods() { - return log10Likelihoods; - } - - // ------------------------------------------------------------------------------------- - // - // add() routines. These are the workhorse routines for calculating the overall genotype - // likelihoods given observed bases and reads. Includes high-level operators all the - // way down to single base and qual functions. - // - // ------------------------------------------------------------------------------------- - - /** - * Updates likelihoods and posteriors to reflect the additional observations contained within the - * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the - * pileup - * - * @param pileup read pileup - * @param ignoreBadBases should we ignore bad bases? - * @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality? 
- * @param minBaseQual the minimum base quality at which to consider a base valid - * @return the number of good bases found in the pileup - */ - public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { - int n = 0; - - // for each fragment, add to the likelihoods - FragmentCollection fpile = pileup.toFragments(); - - for ( PileupElement p : fpile.getSingletonReads() ) - n += add(p, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - - for ( List overlappingPair : fpile.getOverlappingPairs() ) - n += add(overlappingPair, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - - return n; - } - - public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { - byte obsBase = elt.getBase(); - byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - if ( qual == 0 ) - return 0; - - if ( elt.getRead().isReducedRead() ) { - // reduced read representation - if ( BaseUtils.isRegularBase( obsBase )) { - int representativeCount = elt.getRepresentativeCount(); - add(obsBase, qual, (byte)0, (byte)0, representativeCount); // fast calculation of n identical likelihoods - return representativeCount; // we added nObs bases here - } - - // odd bases or deletions => don't use them - return 0; - } - - return add(obsBase, qual, (byte)0, (byte)0, 1); - } - - public int add(List overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { - final PileupElement p1 = overlappingPair.get(0); - final PileupElement p2 = overlappingPair.get(1); - - final byte observedBase1 = p1.getBase(); - final byte qualityScore1 = qualToUse(p1, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - final byte observedBase2 = p2.getBase(); - final byte qualityScore2 = qualToUse(p2, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - - if ( qualityScore1 == 0 ) { - if ( qualityScore2 == 0 ) // abort early if we 
didn't see any good bases - return 0; - else { - return add(observedBase2, qualityScore2, (byte)0, (byte)0); - } - } else { - return add(observedBase1, qualityScore1, observedBase2, qualityScore2); - } - } - - /** - * - * @param obsBase1 first observed base - * @param qual1 base qual of first observed base - * @param obsBase2 second observed base - * @param qual2 base qual of second observed base; can be 0, indicating no second base was observed for this fragment - * @param nObs the number of times this quad of values was seen. Generally 1, but reduced reads can have nObs > 1 for synthetic reads - * @return 0 if the base is bad, 1 otherwise - */ - private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2, int nObs) { - // TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine - // TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future. - // TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here. - - // Just look up the cached result if it's available, or compute and store it - DiploidSNPGenotypeLikelihoods gl; - if ( ! 
inCache(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY) ) { - gl = calculateCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY); - } else { - gl = getCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY); - } - - // for bad bases, there are no likelihoods - if ( gl == null ) - return 0; - - double[] likelihoods = gl.getLikelihoods(); - - for ( DiploidGenotype g : DiploidGenotype.values() ) { - double likelihood = likelihoods[g.ordinal()]; - log10Likelihoods[g.ordinal()] += likelihood * nObs; - } - - return 1; - } - - private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) { - return add(obsBase1, qual1, obsBase2, qual2, 1); - } - - // ------------------------------------------------------------------------------------- - // - // Dealing with the cache routines - // - // ------------------------------------------------------------------------------------- - - static DiploidSNPGenotypeLikelihoods[][][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseUtils.BASES.length][QualityUtils.MAX_SAM_QUAL_SCORE +1][BaseUtils.BASES.length+1][QualityUtils.MAX_SAM_QUAL_SCORE +1][MAX_PLOIDY]; - - protected boolean inCache(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { - return getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy) != null; - } - - protected DiploidSNPGenotypeLikelihoods getCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { - DiploidSNPGenotypeLikelihoods gl = getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy); - if ( gl == null ) - throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base1=%c, qual1=%d, base2=%c, qual2=%d, ploidy=%d", - observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy)); - return gl; - } - - protected DiploidSNPGenotypeLikelihoods 
calculateCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { - DiploidSNPGenotypeLikelihoods gl = calculateGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2); - setCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy, gl); - return gl; - } - - protected void setCache( DiploidSNPGenotypeLikelihoods[][][][][] cache, - byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy, - DiploidSNPGenotypeLikelihoods val ) { - int i = BaseUtils.simpleBaseToBaseIndex(observedBase1); - int j = qualityScore1; - int k = qualityScore2 != 0 ? BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length; - int l = qualityScore2; - int m = ploidy; - - cache[i][j][k][l][m] = val; - } - - protected DiploidSNPGenotypeLikelihoods getCache(DiploidSNPGenotypeLikelihoods[][][][][] cache, - byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { - int i = BaseUtils.simpleBaseToBaseIndex(observedBase1); - int j = qualityScore1; - int k = qualityScore2 != 0 ? BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length; - int l = qualityScore2; - int m = ploidy; - return cache[i][j][k][l][m]; - } - - protected DiploidSNPGenotypeLikelihoods calculateGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) { - double[] log10FourBaseLikelihoods = computeLog10Likelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2); - - try { - - DiploidSNPGenotypeLikelihoods gl = (DiploidSNPGenotypeLikelihoods)this.clone(); - gl.setToZero(); - - // we need to adjust for ploidy. We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space - for ( DiploidGenotype g : DiploidGenotype.values() ) { - - // todo assumes ploidy is 2 -- should be generalized. 
Obviously the below code can be turned into a loop - double p_base = 0.0; - p_base += pow(10, log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base1)] - ploidyAdjustment); - p_base += pow(10, log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base2)] - ploidyAdjustment); - - final double likelihood = log10(p_base); - gl.log10Likelihoods[g.ordinal()] += likelihood; - } - - if ( VERBOSE ) { - for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); } - System.out.println(); - for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", gl.log10Likelihoods[g.ordinal()]); } - System.out.println(); - } - - return gl; - - } catch ( CloneNotSupportedException e ) { - throw new RuntimeException(e); - } - } - - /** - * Updates likelihoods and posteriors to reflect an additional observation of observedBase with - * qualityScore. - * - * @param observedBase1 the base observed on the 1st read of the fragment - * @param qualityScore1 the qual of the base on the 1st read of the fragment, or zero if NA - * @param observedBase2 the base observed on the 2nd read of the fragment - * @param qualityScore2 the qual of the base on the 2nd read of the fragment, or zero if NA - * @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example) - */ - protected double[] computeLog10Likelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) { - double[] log10FourBaseLikelihoods = baseZeros.clone(); - - for ( byte trueBase : BaseUtils.BASES ) { - double likelihood = 0.0; - - for ( byte fragmentBase : BaseUtils.BASES ) { - double log10FragmentLikelihood = (trueBase == fragmentBase ? 
log10_1_minus_PCR_error : log10_PCR_error_3); - if ( qualityScore1 != 0 ) { - log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase1, fragmentBase, qualityScore1); - } - if ( qualityScore2 != 0 ) { - log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase2, fragmentBase, qualityScore2); - } - - //if ( VERBOSE ) { - // System.out.printf(" L(%c | b=%s, Q=%d) = %f / %f%n", - // observedBase, trueBase, qualityScore, pow(10,likelihood) * 100, likelihood); - //} - - likelihood += pow(10, log10FragmentLikelihood); - } - - log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(trueBase)] = log10(likelihood); - } - - return log10FourBaseLikelihoods; - } - - /** - * - * @param observedBase observed base - * @param chromBase target base - * @param qual base quality - * @return log10 likelihood - */ - protected double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual) { - - double logP; - - if ( observedBase == chromBase ) { - // the base is consistent with the chromosome -- it's 1 - e - //logP = oneMinusData[qual]; - double e = pow(10, (qual / -10.0)); - logP = log10(1.0 - e); - } else { - // the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error) - logP = qual / -10.0 + (-log10_3); - } - - //System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP); - return logP; - } - - /** - * Helper function that returns the phred-scaled base quality score we should use for calculating - * likelihoods for a pileup element. May return 0 to indicate that the observation is bad, and may - * cap the quality score by the mapping quality of the read itself. - * - * @param p Pileup element - * @param ignoreBadBases Should we ignore bad bases? - * @param capBaseQualsAtMappingQual Should we cap the base qualities at the mapping quality of the read? 
- * @param minBaseQual Minimum allowed base quality - * @return the actual base quality to use - */ - private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { - if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) ) - return 0; - - byte qual = p.getQual(); - - if ( qual > SAMUtils.MAX_PHRED_SCORE ) - throw new UserException.MisencodedBAM(p.getRead(), "we encountered an extremely high quality score (" + (int)qual + ")"); - if ( capBaseQualsAtMappingQual ) - qual = (byte) Math.min( 0xff & qual, p.getMappingQual()); - if ( (int)qual < minBaseQual ) - qual = (byte)0; - - return qual; - } - - // ----------------------------------------------------------------------------------------------------------------- - // - // - // helper routines - // - // - // ----------------------------------------------------------------------------------------------------------------- - - /** - * Return a string representation of this object in a moderately usable form - * - * @return string representation - */ - public String toString() { - double sum = 0; - StringBuilder s = new StringBuilder(); - for (DiploidGenotype g : DiploidGenotype.values()) { - s.append(String.format("%s %.10f ", g, log10Likelihoods[g.ordinal()])); - sum += Math.pow(10,log10Likelihoods[g.ordinal()]); - } - s.append(String.format(" %f", sum)); - return s.toString(); - } - - // ----------------------------------------------------------------------------------------------------------------- - // - // - // Validation routines - // - // - // ----------------------------------------------------------------------------------------------------------------- - - public boolean validate() { - return validate(true); - } - - public boolean validate(boolean throwException) { - try { - for ( DiploidGenotype g : DiploidGenotype.values() ) { - String bad = null; - - int i = g.ordinal(); - if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! 
MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) { - bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]); - } - - if ( bad != null ) { - throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad)); - } - } - } catch ( IllegalStateException e ) { - if ( throwException ) - throw new RuntimeException(e); - else - return false; - } - - return true; - } - - // - // Constant static data - // - private final static double[] genotypeZeros = new double[DiploidGenotype.values().length]; - private final static double[] baseZeros = new double[BaseUtils.BASES.length]; - - static { - for ( DiploidGenotype g : DiploidGenotype.values() ) { - genotypeZeros[g.ordinal()] = 0.0; - } - for ( byte base : BaseUtils.BASES ) { - baseZeros[BaseUtils.simpleBaseToBaseIndex(base)] = 0.0; - } - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java deleted file mode 100644 index 7ce736b0c..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java +++ /dev/null @@ -1,343 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import com.google.java.contract.Requires; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.LinkedHashMap; - -/** - * Created by IntelliJ IDEA. - * User: carneiro - * Date: 7/21/11 - * Time: 2:21 PM - * - * This is a site based implementation of an Error Model. The error model is a probability - * distribution for the site given the phred scaled quality. 
- */ -public class ErrorModel { - private byte maxQualityScore; - private byte minQualityScore; - private byte phredScaledPrior; - private double log10minPower; - private int refDepth; - private boolean hasData = false; - private ProbabilityVector probabilityVector; - private static final boolean compressRange = false; - - private static final double log10MinusE = Math.log10(Math.exp(1.0)); - private static final boolean DEBUG = false; - /** - * Calculates the probability of the data (reference sample reads) given the phred scaled site quality score. - * - * @param UAC Argument Collection - * @param refSamplePileup Reference sample pileup - * @param refSampleVC VC with True alleles in reference sample pileup - */ - public ErrorModel (final UnifiedArgumentCollection UAC, - final ReadBackedPileup refSamplePileup, - VariantContext refSampleVC, final ReferenceContext refContext) { - this.maxQualityScore = UAC.maxQualityScore; - this.minQualityScore = UAC.minQualityScore; - this.phredScaledPrior = UAC.phredScaledPrior; - log10minPower = Math.log10(UAC.minPower); - - PairHMMIndelErrorModel pairModel = null; - LinkedHashMap haplotypeMap = null; - double[][] perReadLikelihoods = null; - - double[] model = new double[maxQualityScore+1]; - Arrays.fill(model,Double.NEGATIVE_INFINITY); - - boolean hasCalledAlleles = false; - - final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); - if (refSampleVC != null) { - - for (Allele allele : refSampleVC.getAlleles()) { - if (allele.isCalled()) { - hasCalledAlleles = true; - break; - } - } - haplotypeMap = new LinkedHashMap(); - if (refSampleVC.isIndel()) { - pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); - IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements - } 
- } - - double p = QualityUtils.qualToErrorProbLog10((byte)(maxQualityScore-minQualityScore)); - if (refSamplePileup == null || refSampleVC == null || !hasCalledAlleles) { - for (byte q=minQualityScore; q<=maxQualityScore; q++) { - // maximum uncertainty if there's no ref data at site - model[q] = p; - } - this.refDepth = 0; - } - else { - hasData = true; - int matches = 0; - int coverage = 0; - - Allele refAllele = refSampleVC.getReference(); - - if ( refSampleVC.isIndel()) { - final int readCounts[] = new int[refSamplePileup.getNumberOfElements()]; - //perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()]; - final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles()); - if (!haplotypeMap.isEmpty()) - perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, perReadAlleleLikelihoodMap, readCounts); - } - int idx = 0; - for (PileupElement refPileupElement : refSamplePileup) { - if (DEBUG) - System.out.println(refPileupElement.toString()); - boolean isMatch = false; - for (Allele allele : refSampleVC.getAlleles()) { - boolean m = pileupElementMatches(refPileupElement, allele, refAllele, refContext.getBase()); - if (DEBUG) System.out.println(m); - isMatch |= m; - } - if (refSampleVC.isIndel() && !haplotypeMap.isEmpty()) { - // ignore match/mismatch if reads, as determined by their likelihood, are not informative - double[] perAlleleLikelihoods = perReadLikelihoods[idx++]; - if (!isInformativeElement(perAlleleLikelihoods)) - matches++; - else - matches += (isMatch?1:0); - - } else { - matches += (isMatch?1:0); - } - coverage++; - } - - int mismatches = coverage - matches; - //System.out.format("Cov:%d match:%d mismatch:%d\n",coverage, matches, mismatches); - for (byte q=minQualityScore; q<=maxQualityScore; q++) { - if (coverage==0) - model[q] = p; - else - model[q] = log10PoissonProbabilitySiteGivenQual(q,coverage, mismatches); 
- } - this.refDepth = coverage; - } - - // compress probability vector - this.probabilityVector = new ProbabilityVector(model, compressRange); - } - - - @Requires("likelihoods.length>0") - private boolean isInformativeElement(double[] likelihoods) { - // if likelihoods are the same, they're not informative - final double thresh = 0.1; - int maxIdx = MathUtils.maxElementIndex(likelihoods); - int minIdx = MathUtils.minElementIndex(likelihoods); - if (likelihoods[maxIdx]-likelihoods[minIdx]< thresh) - return false; - else - return true; - } - /** - * Simple constructor that just takes a given log-probability vector as error model. - * Only intended for unit testing, not general usage. - * @param pvector Given vector of log-probabilities - * - */ - public ErrorModel(double[] pvector) { - this.maxQualityScore = (byte)(pvector.length-1); - this.minQualityScore = 0; - this.probabilityVector = new ProbabilityVector(pvector, compressRange); - this.hasData = true; - - } - - public static boolean pileupElementMatches(PileupElement pileupElement, Allele allele, Allele refAllele, byte refBase) { - if (DEBUG) - System.out.format("PE: base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d Allele:%s RefAllele:%s\n", - pileupElement.getBase(), pileupElement.isBeforeDeletionStart(), - pileupElement.isBeforeInsertion(),pileupElement.getBasesOfImmediatelyFollowingInsertion(),pileupElement.getLengthOfImmediatelyFollowingIndel(), allele.toString(), refAllele.toString()); - - //pileupElement. - // if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch - if (allele.isReference()) { - // for a ref allele, any base mismatch or new indel is a mismatch. - if(allele.getBases().length>0) - // todo - can't check vs. 
allele because allele is not padded so it doesn't include the reference base at this location - // could clean up/simplify this when unpadding is removed - return (pileupElement.getBase() == refBase && !pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart()); - else - // either null allele to compare, or ref/alt lengths are different (indel by definition). - // if we have an indel that we are comparing against a REF allele, any indel presence (of any length/content) is a mismatch - return (!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart()); - } - - // for non-ref alleles to compare: - if (refAllele.getBases().length == allele.getBases().length) - // alleles have the same length (eg snp or mnp) - return pileupElement.getBase() == allele.getBases()[0]; - - // for non-ref alleles, - byte[] alleleBases = allele.getBases(); - int eventLength = alleleBases.length - refAllele.getBases().length; - if (eventLength < 0 && pileupElement.isBeforeDeletionStart() && pileupElement.getLengthOfImmediatelyFollowingIndel() == -eventLength) - return true; - - if (eventLength > 0 && pileupElement.isBeforeInsertion() && - Arrays.equals(pileupElement.getBasesOfImmediatelyFollowingInsertion().getBytes(),Arrays.copyOfRange(alleleBases,1,alleleBases.length))) // allele contains ref byte, but pileupElement's event bases doesn't - return true; - - return false; - } - - - /** - * What's the log-likelihood that a site's quality is equal to q? If we see N observations and n mismatches, - * and assuming each match is independent of each other and that the match probability is just dependent of - * the site quality, so p = 10.^-q/10. - * Since we'll normally have relatively high Q sites and deep coverage in reference samples (ie p small, N high), - * to avoid underflows we'll use the Poisson approximation with lambda = N*p. - * Hence, the log-likelihood of q i.e. Pr(Nmismatches = n | SiteQ = q) ~ Poisson(n | lambda = p*N) with p as above. 
- * @param q Desired q to get likelihood from - * @param coverage Total coverage - * @param mismatches Number of mismatches - * @return Likelihood of observations as a function of q - */ - @Requires({ - "q >= minQualityScore", - "q <= maxQualityScore", - "coverage >= 0", - "mismatches >= 0", - "mismatches <= coverage" - }) - private double log10PoissonProbabilitySiteGivenQual(byte q, int coverage, int mismatches) { - // same as log10ProbabilitySiteGivenQual but with Poisson approximation to avoid numerical underflows - double lambda = QualityUtils.qualToErrorProb(q) * (double )coverage; - // log10(e^-lambda*lambda^k/k!) = -lambda + k*log10(lambda) - log10factorial(k) - return Math.log10(lambda)*mismatches - lambda*log10MinusE- MathUtils.log10Factorial(mismatches); - } - - @Requires({"qual-minQualityScore <= maxQualityScore"}) - public double getSiteLogErrorProbabilityGivenQual (int qual) { - return probabilityVector.getLogProbabilityForIndex(qual); - } - - public byte getMaxQualityScore() { - return maxQualityScore; - } - - public byte getMinQualityScore() { - return minQualityScore; - } - - public int getMinSignificantQualityScore() { - return new ProbabilityVector(probabilityVector,true).getMinVal(); - } - - public int getMaxSignificantQualityScore() { - return new ProbabilityVector(probabilityVector,true).getMaxVal(); - } - - public int getReferenceDepth() { - return refDepth; - } - public boolean hasData() { - return hasData; - } - - public ProbabilityVector getErrorModelVector() { - return probabilityVector; - } - - public String toString() { - StringBuilder result = new StringBuilder("("); - boolean skipComma = true; - for (double v : probabilityVector.getProbabilityVector()) { - if (skipComma) { - skipComma = false; - } - else { - result.append(","); - } - result.append(String.format("%.4f", v)); - } - result.append(")"); - return result.toString(); - } - - public static int getTotalReferenceDepth(HashMap perLaneErrorModels) { - int n=0; - for (ErrorModel e 
: perLaneErrorModels.values()) { - n += e.getReferenceDepth(); - } - return n; - } - - /* -@Requires({"maxAlleleCount >= 0"}) -//todo -- memoize this function - public boolean hasPowerForMaxAC (int maxAlleleCount) { - int siteQ = (int) Math.ceil(MathUtils.probabilityToPhredScale((double) 1/maxAlleleCount)); - double log10CumSum = getCumulativeSum(siteQ); - return log10CumSum < log10minPower; - } */ -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java deleted file mode 100644 index 2f2a93fa4..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java +++ /dev/null @@ -1,270 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset; -import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.variant.variantcontext.Allele; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: delangel - * Date: 5/18/12 - * Time: 10:06 AM - * To change this template use File | Settings | File Templates. - */ -public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotypeLikelihoods { - final PairHMMIndelErrorModel pairModel; - final LinkedHashMap haplotypeMap; - final ReferenceContext refContext; - final int eventLength; - double[][] readHaplotypeLikelihoods; - - final byte refBase; - final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap; - - public GeneralPloidyIndelGenotypeLikelihoods(final List alleles, - final double[] logLikelihoods, - final int ploidy, - final HashMap perLaneErrorModels, - final boolean ignoreLaneInformation, - final PairHMMIndelErrorModel pairModel, - final LinkedHashMap haplotypeMap, - final ReferenceContext referenceContext, - final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) { - super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation); - this.pairModel = pairModel; - this.haplotypeMap = haplotypeMap; - this.refContext = referenceContext; - this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles); - // todo - not needed if indel alleles have base at current 
position - this.refBase = referenceContext.getBase(); - this.perReadAlleleLikelihoodMap = perReadAlleleLikelihoodMap; - } - - // ------------------------------------------------------------------------------------- - // - // add() routines. These are the workhorse routines for calculating the overall genotype - // likelihoods given observed bases and reads. Includes high-level operators all the - // way down to single base and qual functions. - // - // ------------------------------------------------------------------------------------- - - /** - * Updates likelihoods and posteriors to reflect the additional observations contained within the - * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the - * pileup - * - * @param pileup read pileup - * @param UAC the minimum base quality at which to consider a base valid - * @return the number of good bases found in the pileup - */ - public int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC) { - int n = 0; - - if (!hasReferenceSampleData) { - // no error models - return add(pileup, (ErrorModel)null); - } - for (String laneID : perLaneErrorModels.keySet() ) { - // get pileup for this lane - ReadBackedPileup perLanePileup; - if (ignoreLaneInformation) - perLanePileup = pileup; - else - perLanePileup = pileup.getPileupForLane(laneID); - - if (perLanePileup == null || perLanePileup.isEmpty()) - continue; - - ErrorModel errorModel = perLaneErrorModels.get(laneID); - n += add(perLanePileup, errorModel); - if (ignoreLaneInformation) - break; - - } - - return n; - } - - /** - * Calculates the pool's probability for all possible allele counts for all indel alleles observed. - * Calculation is based on the error model - * generated by the reference sample on the same lane. The probability is given by : - * - * Pr(ac = j1,j2,.. | pool, errorModel) = sum_over_all_Qs ( Pr(j1,j2,.. * Pr(errorModel_q) * - * Pr(ac=j1,j2,..| pool, errorModel) = sum_over_all_Qs ( Pr(ac=j1,j2,..) 
* Pr(errorModel_q) * - * [j1 * (1-eq)/2n + eq/3*(2*N-j1) - * [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC * - * jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT - * - * log Pr(ac=jA,jC,jG,jT| pool, errorModel) = logsum( Pr(ac=jA,jC,jG,jT) * Pr(errorModel_q) * - * [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC * - * jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT) - * = logsum(logPr(ac=jA,jC,jG,jT) + log(Pr(error_Model(q) - * )) + nA*log(jA/2N(1-eq)+eq/3*(2N-jA)/2N) + nC*log(jC/2N(1-eq)+eq/3*(2N-jC)/2N) - * + log(jG/2N(1-eq)+eq/3*(2N-jG)/2N) + log(jT/2N(1-eq)+eq/3*(2N-jT)/2N) - * - * Let Q(j,k) = log(j/2N*(1-e[k]) + (2N-j)/2N*e[k]/3) - * - * Then logPr(ac=jA,jC,jG,jT|D,errorModel) = logPR(ac=Ja,jC,jG,jT) + logsum_k( logPr (errorModel[k], - * nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k)) - * - * If pileup data comes from several error models (because lanes can have different error models), - * Pr(Ac=j|D,E1,E2) = sum(Pr(AC1=j1|D,E1,E2) * Pr(AC2=j-j2|D,E1,E2)) - * = sum(Pr(AC1=j1|D,E1)*Pr(AC2=j-j1|D,E2)) from j=0..2N - * - * So, for each lane, build error model and combine lanes. - * To store model, can do - * for jA=0:2N - * for jC = 0:2N-jA - * for jG = 0:2N-jA-jC - * for jT = 0:2N-jA-jC-jG - * Q(jA,jC,jG,jT) - * for k = minSiteQual:maxSiteQual - * likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k)) - * - * - * - * where: nA,nC,nG,nT = counts of bases observed in pileup. 
- * - * - * @param pileup Base pileup - * @param errorModel Site error model - * @return Number of bases added - */ - private int add(ReadBackedPileup pileup, ErrorModel errorModel) { - int n=0; - - // Number of alleless in pileup, in that order - List numSeenBases = new ArrayList(this.alleles.size()); - - if (!hasReferenceSampleData) { - - final int readCounts[] = new int[pileup.getNumberOfElements()]; - readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, perReadAlleleLikelihoodMap, readCounts); - n = readHaplotypeLikelihoods.length; - } else { - Allele refAllele = null; - for (Allele a:alleles) { - numSeenBases.add(0); - if (a.isReference()) - refAllele = a; - } - - if (refAllele == null) - throw new ReviewedStingException("BUG: no ref alleles in passed in allele list!"); - - // count number of elements in pileup - for (PileupElement elt : pileup) { - if (VERBOSE) - System.out.format("base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d\n",elt.getBase(), elt.isBeforeDeletionStart(),elt.isBeforeInsertion(),elt.getBasesOfImmediatelyFollowingInsertion(),elt.getLengthOfImmediatelyFollowingIndel()); - int idx =0; - for (Allele allele : alleles) { - int cnt = numSeenBases.get(idx); - numSeenBases.set(idx++,cnt + (ErrorModel.pileupElementMatches(elt, allele, refAllele, refBase)?1:0)); - } - - n++; - - } - } - computeLikelihoods(errorModel, alleles, numSeenBases, pileup); - return n; - } - - - - /** - * Compute likelihood of current conformation - * - * @param ACset Count to compute - * @param errorModel Site-specific error model object - * @param alleleList List of alleles - * @param numObservations Number of observations for each allele in alleleList - */ - public void getLikelihoodOfConformation(final ExactACset ACset, - final ErrorModel errorModel, - final List alleleList, - final List numObservations, - final ReadBackedPileup pileup) { - final int[] currentCnt = 
Arrays.copyOf(ACset.getACcounts().getCounts(), alleleList.size()); - double p1 = 0.0; - - if (!hasReferenceSampleData) { - // no error model: use pair HMM likelihoods - for (int i=0; i < readHaplotypeLikelihoods.length; i++) { - double acc[] = new double[alleleList.size()]; - for (int k=0; k < acc.length; k++ ) - acc[k] = readHaplotypeLikelihoods[i][k] + MathUtils.log10Cache[currentCnt[k]]-LOG10_PLOIDY; - p1 += MathUtils.log10sumLog10(acc); - } - - } else { - final int minQ = errorModel.getMinSignificantQualityScore(); - final int maxQ = errorModel.getMaxSignificantQualityScore(); - final double[] acVec = new double[maxQ - minQ + 1]; - - - for (int k=minQ; k<=maxQ; k++) { - int idx=0; - for (int n : numObservations) - acVec[k-minQ] += n*logMismatchProbabilityArray[currentCnt[idx++]][k]; - } - p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec); - } - ACset.getLog10Likelihoods()[0] = p1; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java deleted file mode 100644 index f48ae81cf..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ /dev/null @@ -1,141 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.List; -import java.util.Map; - - -/** - * The model representing how we calculate genotype likelihoods - */ -public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { - - public static final String DUMMY_LANE = "Lane1"; - public static final String DUMMY_SAMPLE_NAME = "DummySample1"; - - /* public enum Model { - SNP, - INDEL, - BOTH - } - */ - public enum Model { - SNP, - INDEL, - GENERALPLOIDYSNP, - GENERALPLOIDYINDEL, - BOTH - } - - public enum GENOTYPING_MODE 
{ - /** the Unified Genotyper will choose the most likely alternate allele */ - DISCOVERY, - /** only the alleles passed in from a VCF rod bound to the -alleles argument will be used for genotyping */ - GENOTYPE_GIVEN_ALLELES - } - - protected final UnifiedArgumentCollection UAC; - protected Logger logger; - - /** - * Create a new object - * @param logger logger - * @param UAC unified arg collection - */ - protected GenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { - if ( logger == null || UAC == null ) throw new ReviewedStingException("Bad arguments"); - this.UAC = UAC; - this.logger = logger; - } - - /** - * Can be overridden by concrete subclasses - * - * @param tracker rod data - * @param ref reference context - * @param contexts stratified alignment contexts - * @param contextType stratified context type - * @param allAllelesToUse the alternate allele to use, null if not set - * @param useBAQedPileup should we use the BAQed pileup or the raw one? - * @param locParser Genome Loc Parser - * @return variant context where genotypes are no-called but with GLs - */ - public abstract VariantContext getLikelihoods(final RefMetaDataTracker tracker, - final ReferenceContext ref, - final Map contexts, - final AlignmentContextUtils.ReadOrientation contextType, - final List allAllelesToUse, - final boolean useBAQedPileup, - final GenomeLocParser locParser, - final Map perReadAlleleLikelihoodMap); - - - protected int getFilteredDepth(ReadBackedPileup pileup) { - int count = 0; - for ( PileupElement p : pileup ) { - if ( BaseUtils.isRegularBase( p.getBase() ) ) - count += p.getRepresentativeCount(); - } - - return count; - } - -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java deleted file mode 100644 index 
4a3231b3e..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ /dev/null @@ -1,262 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. 
ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.variant.variantcontext.*; - -import java.util.*; - -public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { - private static final int 
HAPLOTYPE_SIZE = 80; - - private boolean DEBUG = false; - private boolean ignoreSNPAllelesWhenGenotypingIndels = false; - private PairHMMIndelErrorModel pairModel; - - - private LinkedHashMap haplotypeMap; - - private List alleleList = new ArrayList(); - - - protected IndelGenotypeLikelihoodsCalculationModel(final UnifiedArgumentCollection UAC, - final Logger logger) { - super(UAC, logger); - pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); - DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; - haplotypeMap = new LinkedHashMap(); - ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; - } - - protected static List computeConsensusAlleles(final ReferenceContext ref, - final Map contexts, - final AlignmentContextUtils.ReadOrientation contextType, - final UnifiedArgumentCollection UAC) { - ConsensusAlleleCounter counter = new ConsensusAlleleCounter(true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); - return counter.computeConsensusAlleles(ref, contexts, contextType); - } - - private final static EnumSet allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED); - - - public VariantContext getLikelihoods(final RefMetaDataTracker tracker, - final ReferenceContext ref, - final Map contexts, - final AlignmentContextUtils.ReadOrientation contextType, - final List allAllelesToUse, - final boolean useBAQedPileup, - final GenomeLocParser locParser, - final Map perReadAlleleLikelihoodMap) { - - GenomeLoc loc = ref.getLocus(); -// if (!ref.getLocus().equals(lastSiteVisited)) { - if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) { - // starting a new site: clear allele list - haplotypeMap.clear(); - perReadAlleleLikelihoodMap.clear(); // clean mapping sample-> per read, per allele likelihoods - alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, UAC, ignoreSNPAllelesWhenGenotypingIndels); - if 
(alleleList.isEmpty()) - return null; - } - - getHaplotypeMapFromAlleles(alleleList, ref, loc, haplotypeMap); // will update haplotypeMap adding elements - if (haplotypeMap == null || haplotypeMap.isEmpty()) - return null; - - // start making the VariantContext - // For all non-snp VC types, VC end location is just startLocation + length of ref allele including padding base. - final int endLoc = loc.getStart() + alleleList.get(0).length() - 1; - final int eventLength = getEventLength(alleleList); - - final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleleList); - - // create the genotypes; no-call everyone for now - GenotypesContext genotypes = GenotypesContext.create(); - final List noCall = new ArrayList(); - noCall.add(Allele.NO_CALL); - - // For each sample, get genotype likelihoods based on pileup - // compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them. - - for (Map.Entry sample : contexts.entrySet()) { - AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); - - if (!perReadAlleleLikelihoodMap.containsKey(sample.getKey())){ - // no likelihoods have been computed for this sample at this site - perReadAlleleLikelihoodMap.put(sample.getKey(), new PerReadAlleleLikelihoodMap()); - } - final ReadBackedPileup pileup = context.getBasePileup(); - if (pileup != null) { - final GenotypeBuilder b = new GenotypeBuilder(sample.getKey()); - final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey())); - b.PL(genotypeLikelihoods); - b.DP(getFilteredDepth(pileup)); - genotypes.add(b.make()); - - if (DEBUG) { - System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString()); - for (int k = 0; k < genotypeLikelihoods.length; k++) - 
System.out.format("%1.4f ", genotypeLikelihoods[k]); - System.out.println(); - } - } - } - - return builder.genotypes(genotypes).make(); - } - - public static void getHaplotypeMapFromAlleles(final List alleleList, - final ReferenceContext ref, - final GenomeLoc loc, - final LinkedHashMap haplotypeMap) { - // protect against having an indel too close to the edge of a contig - if (loc.getStart() <= HAPLOTYPE_SIZE) - haplotypeMap.clear(); - // check if there is enough reference window to create haplotypes (can be an issue at end of contigs) - else if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE) - haplotypeMap.clear(); - else if (alleleList.isEmpty()) - haplotypeMap.clear(); - else { - final int eventLength = getEventLength(alleleList); - final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1; - final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1; - - if (hsize <= 0) // protect against event lengths larger than ref window sizes - haplotypeMap.clear(); - else - haplotypeMap.putAll(Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), - ref, hsize, numPrefBases)); - } - } - - public static int getEventLength(List alleleList) { - Allele refAllele = alleleList.get(0); - Allele altAllele = alleleList.get(1); - // look for alt allele that has biggest length distance to ref allele - int maxLenDiff = 0; - for (Allele a : alleleList) { - if (a.isNonReference()) { - int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length()); - if (lenDiff > maxLenDiff) { - maxLenDiff = lenDiff; - altAllele = a; - } - } - } - - return altAllele.getBaseString().length() - refAllele.getBaseString().length(); - - } - - public static List getInitialAlleleList(final RefMetaDataTracker tracker, - final ReferenceContext ref, - final Map contexts, - final AlignmentContextUtils.ReadOrientation contextType, - final UnifiedArgumentCollection UAC, - final boolean ignoreSNPAllelesWhenGenotypingIndels) { - - 
List alleles = new ArrayList(); - if (UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { - VariantContext vc = null; - for (final VariantContext vc_input : tracker.getValues(UAC.alleles, ref.getLocus())) { - if (vc_input != null && - allowableTypes.contains(vc_input.getType()) && - ref.getLocus().getStart() == vc_input.getStart()) { - vc = vc_input; - break; - } - } - // ignore places where we don't have a variant - if (vc == null) - return alleles; - - if (ignoreSNPAllelesWhenGenotypingIndels) { - // if there's an allele that has same length as the reference (i.e. a SNP or MNP), ignore it and don't genotype it - for (Allele a : vc.getAlleles()) - if (a.isNonReference() && a.getBases().length == vc.getReference().getBases().length) - continue; - else - alleles.add(a); - - } else { - alleles.addAll(vc.getAlleles()); - } - - } else { - alleles = computeConsensusAlleles(ref, contexts, contextType, UAC); - } - return alleles; - } - - // Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup, - // so that per-sample DP will include deletions covering the event. 
- protected int getFilteredDepth(ReadBackedPileup pileup) { - int count = 0; - for (PileupElement p : pileup) { - if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase())) - count += p.getRepresentativeCount(); - } - - return count; - } - -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java deleted file mode 100644 index 5c6e9dc01..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ /dev/null @@ -1,847 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc; -import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; -import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.variant.variantcontext.*; - -import java.io.PrintStream; -import java.lang.reflect.Constructor; -import java.util.*; - -public class UnifiedGenotyperEngine { - public static final String LOW_QUAL_FILTER_NAME = "LowQual"; - private static final String GPSTRING = "GENERALPLOIDY"; - - public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA"; - public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL"; - - public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3; - public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4; - - private static final int 
SNP_MODEL = 0; - private static final int INDEL_MODEL = 1; - - public enum OUTPUT_MODE { - /** produces calls only at variant sites */ - EMIT_VARIANTS_ONLY, - /** produces calls at variant sites and confident reference sites */ - EMIT_ALL_CONFIDENT_SITES, - /** produces calls at any callable site regardless of confidence; this argument is intended only for point - * mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by - * no means produce a comprehensive set of indels in DISCOVERY mode */ - EMIT_ALL_SITES - } - - // the unified argument collection - private final UnifiedArgumentCollection UAC; - public UnifiedArgumentCollection getUAC() { return UAC; } - - // the annotation engine - private final VariantAnnotatorEngine annotationEngine; - - // the model used for calculating genotypes - private ThreadLocal> glcm = new ThreadLocal>(); - private final List modelsToUse = new ArrayList(2); - - // the model used for calculating p(non-ref) - private ThreadLocal afcm = new ThreadLocal(); - - // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything - private final double[] log10AlleleFrequencyPriorsSNPs; - private final double[] log10AlleleFrequencyPriorsIndels; - - // samples in input - private final Set samples; - - // the various loggers and writers - private final Logger logger; - private final PrintStream verboseWriter; - - // number of chromosomes (ploidy * samples) in input - private final int ploidy; - private final int N; - - // the standard filter to use for calls below the confidence threshold but above the emit threshold - private static final Set filter = new HashSet(1); - - private final GenomeLocParser genomeLocParser; - private final boolean BAQEnabledOnCMDLine; - - // --------------------------------------------------------------------------------------------------------- - // - // Public interface functions - // - // 
--------------------------------------------------------------------------------------------------------- - @Requires({"toolkit != null", "UAC != null"}) - public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) { - this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), GATKVariantContextUtils.DEFAULT_PLOIDY); - } - - protected UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, Set samples, UnifiedArgumentCollection UAC) { - this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); - } - - @Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0","ploidy>0"}) - public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set samples, int ploidy) { - this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF; - genomeLocParser = toolkit.getGenomeLocParser(); - this.samples = new TreeSet(samples); - // note that, because we cap the base quality by the mapping quality, minMQ cannot be less than minBQ - this.UAC = UAC; - - this.logger = logger; - this.verboseWriter = verboseWriter; - this.annotationEngine = engine; - - this.ploidy = ploidy; - this.N = samples.size() * ploidy; - log10AlleleFrequencyPriorsSNPs = new double[N+1]; - log10AlleleFrequencyPriorsIndels = new double[N+1]; - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity,UAC.inputPrior); - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.inputPrior); - - filter.add(LOW_QUAL_FILTER_NAME); - - determineGLModelsToUse(); - - // do argument checking - if (UAC.annotateAllSitesWithPLs) { - if (!modelsToUse.contains(GenotypeLikelihoodsCalculationModel.Model.SNP)) - throw new 
IllegalArgumentException("Invalid genotype likelihood model specification: Only diploid SNP model can be used in conjunction with option allSitePLs"); - - } - } - - /** - * @see #calculateLikelihoodsAndGenotypes(org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker, org.broadinstitute.sting.gatk.contexts.ReferenceContext, org.broadinstitute.sting.gatk.contexts.AlignmentContext, java.util.Set) - * - * same as the full call but with allSamples == null - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @return the VariantCallContext object - */ - public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext) { - return calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, null); - } - - - /** - * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper. - * - * If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype - * for every sample in allSamples. If it's null there's no such guarentee. Providing this - * argument is critical when the resulting calls will be written to a VCF file. 
- * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @param allSamples set of all sample names that we might call (i.e., those in the VCF header) - * @return the VariantCallContext object - */ - public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext, - final Set allSamples) { - final List results = new ArrayList(2); - - final List models = getGLModelsToUse(tracker, refContext, rawContext); - - final Map perReadAlleleLikelihoodMap = new HashMap(); - - if ( models.isEmpty() ) { - results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, null, rawContext) : null); - } - else { - for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) { - perReadAlleleLikelihoodMap.clear(); - final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); - if ( stratifiedContexts == null ) { - results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? 
generateEmptyContext(tracker, refContext, null, rawContext) : null); - } - else { - final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); - if ( vc != null ) - results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); -// todo - uncomment if we want to also emit a null ref call (with no QUAL) if there's no evidence for REF and if EMIT_ALL_SITES is set -// else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) -// results.add(generateEmptyContext(tracker, refContext, null, rawContext)); - - } - } - } - - return results; - } - - /** - * Compute GLs at a given locus. Entry point for engine calls from UGCalcLikelihoods. - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @param perReadAlleleLikelihoodMap Map to store per-sample, per-read, per-allele likelihoods (only used for indels) - * @return the VariantContext object - */ - public VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext, - final Map perReadAlleleLikelihoodMap) { - final List models = getGLModelsToUse(tracker, refContext, rawContext); - if ( models.isEmpty() ) { - return null; - } - - for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) { - final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); - // return the first valid one we encounter - if ( stratifiedContexts != null ) - return calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); - - } - - return null; - } - - /** - * Compute genotypes at a given locus. 
Entry point for engine calls from UGCallVariants. - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @param vc the GL-annotated variant context - * @return the VariantCallContext object - */ - public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext, - final VariantContext vc) { - final List models = getGLModelsToUse(tracker, refContext, rawContext); - if ( models.isEmpty() ) { - return null; - } - - // return the first one - final GenotypeLikelihoodsCalculationModel.Model model = models.get(0); - final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, null); - } - - /** - * Compute genotypes at a given locus. - * - * @param vc the GL-annotated variant context - * @return the VariantCallContext object - */ - public VariantCallContext calculateGenotypes(VariantContext vc) { - return calculateGenotypes(null, null, null, null, vc, GenotypeLikelihoodsCalculationModel.Model.valueOf("SNP"), null); - } - - - // --------------------------------------------------------------------------------------------------------- - // - // Private implementation helpers - // - // --------------------------------------------------------------------------------------------------------- - - // private method called by both UnifiedGenotyper and UGCalcLikelihoods entry points into the engine - private VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final Map stratifiedContexts, - final AlignmentContextUtils.ReadOrientation type, - final List alternateAllelesToUse, - final boolean useBAQedPileup, - final GenotypeLikelihoodsCalculationModel.Model model, - final Map perReadAlleleLikelihoodMap) { - - // 
initialize the data for this thread if that hasn't been done yet - if ( glcm.get() == null ) { - glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); - } - - return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap); - } - - private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { - VariantContext vc; - if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles); - if ( vcInput == null ) - return null; - vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles()).make(); - } else { - // deal with bad/non-standard reference bases - if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) ) - return null; - - Set alleles = new HashSet(); - alleles.add(Allele.create(ref.getBase(), true)); - vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make(); - } - - if ( annotationEngine != null ) { - // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations - final ReadBackedPileup pileup = rawContext.getBasePileup(); - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); - - vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc); - } - - return new VariantCallContext(vc, false); - } - - public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap) { - return calculateGenotypes(null, null, null, null, vc, model, 
perReadAlleleLikelihoodMap); - } - - public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { - return calculateGenotypes(null, null, null, null, vc, model, null); - } - - public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext, - final Map stratifiedContexts, - final VariantContext vc, - final GenotypeLikelihoodsCalculationModel.Model model, - final Map perReadAlleleLikelihoodMap) { - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false, perReadAlleleLikelihoodMap); - } - - /** - * Main entry function to calculate genotypes of a given VC with corresponding GL's - * @param tracker Tracker - * @param refContext Reference context - * @param rawContext Raw context - * @param stratifiedContexts Stratified alignment contexts - * @param vc Input VC - * @param model GL calculation model - * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc - * @return VC with assigned genotypes - */ - public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, final ReferenceContext refContext, - final AlignmentContext rawContext, Map stratifiedContexts, - final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, - final boolean inheritAttributesFromInputVC, - final Map perReadAlleleLikelihoodMap) { - - boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; - - // TODO TODO TODO TODO - // REFACTOR THIS FUNCTION, TOO UNWIELDY!! 
- - // initialize the data for this thread if that hasn't been done yet - if ( afcm.get() == null ) { - afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger)); - } - - // if input VC can't be genotyped, exit with either null VCC or, in case where we need to emit all sites, an empty call - if (!canVCbeGenotyped(vc)) { - if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && !limitedContext) - return generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext); - else - return null; - - } - - // estimate our confidence in a reference call and return - if ( vc.getNSamples() == 0 ) { - if ( limitedContext ) - return null; - return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ? - estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), false, 1.0) : - generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); - } - - AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model)); - - // is the most likely frequency conformation AC=0 for all alternate alleles? 
- boolean bestGuessIsRef = true; - - // determine which alternate alleles have AF>0 - final List myAlleles = new ArrayList(vc.getAlleles().size()); - final List alleleCountsofMLE = new ArrayList(vc.getAlleles().size()); - myAlleles.add(vc.getReference()); - for ( int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++ ) { - final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i); - if ( alternateAllele.isReference() ) - continue; - - // Compute if the site is considered polymorphic with sufficient confidence relative to our - // phred-scaled emission QUAL - final boolean isNonRef = AFresult.isPolymorphicPhredScaledQual(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); - - // if the most likely AC is not 0, then this is a good alternate allele to use - if ( isNonRef ) { - myAlleles.add(alternateAllele); - alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); - bestGuessIsRef = false; - } - // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele - else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || - UAC.annotateAllSitesWithPLs) { - myAlleles.add(alternateAllele); - alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); - } - } - - final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0()); - - // note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice - final double phredScaledConfidence = - Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || UAC.annotateAllSitesWithPLs - ? 
-10 * AFresult.getLog10PosteriorOfAFEq0() - : -10 * AFresult.getLog10PosteriorOfAFGT0()); - - // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero - if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) { - // technically, at this point our confidence in a reference call isn't accurately estimated - // because it didn't take into account samples with no data, so let's get a better estimate - return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0); - } - - // start constructing the resulting VC - final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc); - final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles); - builder.log10PError(phredScaledConfidence/-10.0); - if ( ! passesCallThreshold(phredScaledConfidence) ) - builder.filters(filter); - - // create the genotypes - final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true,ploidy); - builder.genotypes(genotypes); - - // print out stats if we have a writer - if ( verboseWriter != null && !limitedContext ) - printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model); - - // *** note that calculating strand bias involves overwriting data structures, so we do that last - final HashMap attributes = new HashMap(); - - // inherit attributed from input vc if requested - if (inheritAttributesFromInputVC) - attributes.putAll(vc.getAttributes()); - // if the site was downsampled, record that fact - if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) - attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); - - if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) - attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size()); - - // add the MLE AC and AF annotations - if ( alleleCountsofMLE.size() 
> 0 ) { - attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE); - final int AN = builder.make().getCalledChrCount(); - final ArrayList MLEfrequencies = new ArrayList(alleleCountsofMLE.size()); - // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT is ./. but the exact model may arbitrarily choose an AC>1) - for ( int AC : alleleCountsofMLE ) - MLEfrequencies.add(Math.min(1.0, (double)AC / (double)AN)); - attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies); - } - - if ( UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) { - //final boolean DEBUG_SLOD = false; - - // the overall lod - //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; - double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); - //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); - - List allAllelesToUse = builder.make().getAlleles(); - - // the forward lod - VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); - AFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model)); - //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double forwardLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0(); - double forwardLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); - //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); - - // the reverse lod - VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); - AFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model)); - //normalizedLog10Posteriors = 
MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double reverseLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0(); - double reverseLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); - //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); - - double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; - double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF; - //if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); - - // strand score is max bias between forward and reverse strands - double strandScore = Math.max(forwardLod, reverseLod); - // rescale by a factor of 10 - strandScore *= 10.0; - //logger.debug(String.format("SLOD=%f", strandScore)); - - if ( !Double.isNaN(strandScore) ) - attributes.put("SB", strandScore); - } - - // finish constructing the resulting VC - builder.attributes(attributes); - VariantContext vcCall = builder.make(); - - if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine - // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations - final ReadBackedPileup pileup = rawContext.getBasePileup(); - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); - - vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap); - } - - // if we are subsetting alleles (either because there were too many or because some were not polymorphic) - // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). 
- if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync - vcCall = GATKVariantContextUtils.reverseTrimAlleles(vcCall); - - return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0)); - } - - /** - * Determine whether input VC to calculateGenotypes() can be genotyped and AF can be computed. - * @param vc Input VC - * @return Status check - */ - @Requires("vc != null") - protected boolean canVCbeGenotyped(final VariantContext vc) { - // protect against too many alternate alleles that we can't even run AF on: - if (vc.getNAlleles()> GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) { - logger.warn("Attempting to genotype more than "+GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED + - " alleles. Site will be skipped at location "+vc.getChr()+":"+vc.getStart()); - return false; - } - else return true; - - } - - private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) { - - if ( !BaseUtils.isRegularBase(refContext.getBase()) ) - return null; - - Map stratifiedContexts = null; - - if ( model.name().contains("INDEL") ) { - - final ReadBackedPileup pileup = rawContext.getBasePileup().getMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE); - // don't call when there is no coverage - if ( pileup.getNumberOfElements() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) - return null; - - // stratify the AlignmentContext and cut by sample - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); - - } else if ( model.name().contains("SNP") ) { - - // stratify the AlignmentContext and cut by sample - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup()); - - if ( !(UAC.OutputMode == 
OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ) { - int numDeletions = 0; - for ( final PileupElement p : rawContext.getBasePileup() ) { - if ( p.isDeletion() ) - numDeletions += p.getRepresentativeCount(); - } - if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().depthOfCoverage()) > UAC.MAX_DELETION_FRACTION ) { - return null; - } - } - } - - return stratifiedContexts; - } - - private final double getRefBinomialProbLog10(final int depth) { - return MathUtils.log10BinomialProbability(depth, 0); - } - - private VariantCallContext estimateReferenceConfidence(VariantContext vc, Map contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) { - if ( contexts == null ) - return null; - - double log10POfRef = Math.log10(initialPofRef); - - // for each sample that we haven't examined yet - for ( String sample : samples ) { - final AlignmentContext context = contexts.get(sample); - if ( ignoreCoveredSamples && context != null ) - continue; - final int depth = context == null ? 
0 : context.getBasePileup().depthOfCoverage(); - log10POfRef += estimateLog10ReferenceConfidenceForOneSample(depth, theta); - } - - return new VariantCallContext(vc, QualityUtils.phredScaleLog10CorrectRate(log10POfRef) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false); - } - - /** - * Compute the log10 probability of a sample with sequencing depth and no alt allele is actually truly homozygous reference - * - * Assumes the sample is diploid - * - * @param depth the depth of the sample - * @param theta the heterozygosity of this species (between 0 and 1) - * @return a valid log10 probability of the sample being hom-ref - */ - @Requires({"depth >= 0", "theta >= 0.0 && theta <= 1.0"}) - @Ensures("MathUtils.goodLog10Probability(result)") - protected double estimateLog10ReferenceConfidenceForOneSample(final int depth, final double theta) { - final double log10PofNonRef = Math.log10(theta / 2.0) + getRefBinomialProbLog10(depth); - return MathUtils.log10OneMinusX(Math.pow(10.0, log10PofNonRef)); - } - - protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, final GenotypeLikelihoodsCalculationModel.Model model) { - Allele refAllele = null, altAllele = null; - for ( Allele allele : vc.getAlleles() ) { - if ( allele.isReference() ) - refAllele = allele; - else - altAllele = allele; - } - - for (int i = 0; i <= N; i++) { - StringBuilder AFline = new StringBuilder("AFINFO\t"); - AFline.append(pos); - AFline.append("\t"); - AFline.append(refAllele); - AFline.append("\t"); - if ( altAllele != null ) - AFline.append(altAllele); - else - AFline.append("N/A"); - AFline.append("\t"); - AFline.append(i + "/" + N + "\t"); - AFline.append(String.format("%.2f\t", ((float)i)/N)); - AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); - verboseWriter.println(AFline.toString()); - } - - verboseWriter.println("P(f>0) = " + PofF); - verboseWriter.println("Qscore = " + phredScaledConfidence); - verboseWriter.println(); - 
} - - protected boolean passesEmitThreshold(double conf, boolean bestGuessIsRef) { - return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_CONFIDENT_SITES || !bestGuessIsRef) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); - } - - protected boolean passesCallThreshold(double conf) { - return conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING; - } - - protected boolean confidentlyCalled(double conf, double PofF) { - return conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING || - (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && QualityUtils.phredScaleErrorRate(PofF) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING); - } - - private void determineGLModelsToUse() { - String modelPrefix = ""; - if ( !UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY ) - modelPrefix = GPSTRING; - - // GGA mode => must initialize both the SNP and indel models - if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || - UAC.GLmodel.name().toUpperCase().contains("BOTH") ) { - modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP")); - modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); - } - else { - modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+UAC.GLmodel.name().toUpperCase())); - } - } - - // decide whether we are currently processing SNPs, indels, neither, or both - private List getGLModelsToUse(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext) { - if ( UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) - return modelsToUse; - - if ( modelsToUse.size() != 2 ) - throw new IllegalStateException("GGA mode assumes that we have initialized both the SNP and indel models but found " + modelsToUse); - - // if we're genotyping given 
alleles then we need to choose the model corresponding to the variant type requested - final VariantContext vcInput = getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles); - - if ( vcInput == null ) { - return Collections.emptyList(); // no work to be done - } else if ( vcInput.isSNP() ) { - return Collections.singletonList(modelsToUse.get(SNP_MODEL)); - } else if ( vcInput.isIndel() || vcInput.isMixed() ) { - return Collections.singletonList(modelsToUse.get(INDEL_MODEL)); - } else { - return Collections.emptyList(); // No support for other types yet - } - } - - /** - * Function that fills vector with allele frequency priors. By default, infinite-sites, neutral variation prior is used, - * where Pr(AC=i) = theta/i where theta is heterozygosity - * @param N Number of chromosomes - * @param priors (output) array to be filled with priors - * @param heterozygosity default heterozygosity to use, if inputPriors is empty - * @param inputPriors Input priors to use (in which case heterozygosity is ignored) - */ - public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List inputPriors) { - - - double sum = 0.0; - - if (!inputPriors.isEmpty()) { - // user-specified priors - if (inputPriors.size() != N) - throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 "); - - int idx = 1; - for (final double prior: inputPriors) { - if (prior < 0.0) - throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior"); - priors[idx++] = Math.log10(prior); - sum += prior; - } - } - else { - // for each i - for (int i = 1; i <= N; i++) { - final double value = heterozygosity / (double)i; - priors[i] = Math.log10(value); - sum += value; - } - } - - // protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics 
assumptions) - if (sum > 1.0) { - throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors."); - } - // null frequency for AF=0 is (1 - sum(all other frequencies)) - priors[0] = Math.log10(1.0 - sum); - } - - protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { - if (model.name().toUpperCase().contains("SNP")) - return log10AlleleFrequencyPriorsSNPs; - else if (model.name().toUpperCase().contains("INDEL")) - return log10AlleleFrequencyPriorsIndels; - else - throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); - - } - - protected double getTheta( final GenotypeLikelihoodsCalculationModel.Model model ) { - if( model.name().contains("SNP") ) - return HUMAN_SNP_HETEROZYGOSITY; - if( model.name().contains("INDEL") ) - return HUMAN_INDEL_HETEROZYGOSITY; - else throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); - } - - private static Map getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) { - - final Map glcm = new HashMap(); - final List> glmClasses = new PluginManager(GenotypeLikelihoodsCalculationModel.class).getPlugins(); - - for (int i = 0; i < glmClasses.size(); i++) { - final Class glmClass = glmClasses.get(i); - final String key = glmClass.getSimpleName().replaceAll("GenotypeLikelihoodsCalculationModel","").toUpperCase(); - try { - final Object args[] = new Object[]{UAC,logger}; - final Constructor c = glmClass.getDeclaredConstructor(UnifiedArgumentCollection.class, Logger.class); - glcm.put(key, (GenotypeLikelihoodsCalculationModel)c.newInstance(args)); - } - catch (Exception e) { - throw new UserException("The likelihoods model provided for the -glm argument (" + UAC.GLmodel + ") is not a valid option: " + 
e.getMessage()); - } - } - - return glcm; - } - - public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding allelesBinding) { - if ( tracker == null || ref == null || logger == null ) - return null; - VariantContext vc = null; - - // search for usable record - for ( final VariantContext vc_input : tracker.getValues(allelesBinding, loc) ) { - if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) { - if ( vc == null ) { - vc = vc_input; - } else { - logger.warn("Multiple valid VCF records detected in the alleles input file at site " + ref.getLocus() + ", only considering the first record"); - } - } - } - - return vc; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java deleted file mode 100644 index 2ece18002..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java +++ /dev/null @@ -1,360 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.*; - -import java.util.*; - -public abstract class DiploidExactAFCalc extends ExactAFCalc { - public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { - super(nSamples, maxAltAlleles, ploidy); - if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy); - } - - @Override - protected AFCalcResult computeLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors) { - final int numAlternateAlleles = vc.getNAlleles() - 1; - final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), true); - final int numSamples = genotypeLikelihoods.size()-1; - final int numChr = 2*numSamples; - - // queue of AC conformations to process - final LinkedList ACqueue = new LinkedList(); - - // mapping of ExactACset indexes to the objects - final HashMap indexesToACset = new HashMap(numChr+1); - - // add AC=0 to the queue - final int[] zeroCounts = new int[numAlternateAlleles]; - ExactACset zeroSet = new ExactACset(numSamples+1, new 
ExactACcounts(zeroCounts)); - ACqueue.add(zeroSet); - indexesToACset.put(zeroSet.getACcounts(), zeroSet); - - while ( !ACqueue.isEmpty() ) { - getStateTracker().incNEvaluations(); // keep track of the number of evaluations - - // compute log10Likelihoods - final ExactACset set = ACqueue.remove(); - - final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors); - - // clean up memory - indexesToACset.remove(set.getACcounts()); - //if ( DEBUG ) - // System.out.printf(" *** removing used set=%s%n", set.ACcounts); - } - - return getResultFromFinalState(vc, log10AlleleFrequencyPriors); - } - - @Override - protected VariantContext reduceScope(final VariantContext vc) { - // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > getMaxAltAlleles() ) { - logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); - - VariantContextBuilder builder = new VariantContextBuilder(vc); - List alleles = new ArrayList(getMaxAltAlleles() + 1); - alleles.add(vc.getReference()); - alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles())); - builder.alleles(alleles); - builder.genotypes(GATKVariantContextUtils.subsetDiploidAlleles(vc, alleles, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL)); - return builder.make(); - } else { - return vc; - } - } - - private static final int PL_INDEX_OF_HOM_REF = 0; - private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) { - final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); - final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; - for ( int i = 0; i 
< numOriginalAltAlleles; i++ ) - likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); - - // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype - final ArrayList GLs = getGLs(vc.getGenotypes(), true); - for ( final double[] likelihoods : GLs ) { - final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); - if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) { - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL); - if ( alleles.alleleIndex1 != 0 ) - likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; - // don't double-count it - if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 ) - likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; - } - } - - // sort them by probability mass and choose the best ones - Collections.sort(Arrays.asList(likelihoodSums)); - final ArrayList bestAlleles = new ArrayList(numAllelesToChoose); - for ( int i = 0; i < numAllelesToChoose; i++ ) - bestAlleles.add(likelihoodSums[i].allele); - - final ArrayList orderedBestAlleles = new ArrayList(numAllelesToChoose); - for ( Allele allele : vc.getAlternateAlleles() ) { - if ( bestAlleles.contains(allele) ) - orderedBestAlleles.add(allele); - } - - return orderedBestAlleles; - } - - private static final class DependentSet { - public final int[] ACcounts; - public final int PLindex; - - public DependentSet(final int[] ACcounts, final int PLindex) { - this.ACcounts = ACcounts; - this.PLindex = PLindex; - } - } - - private double calculateAlleleCountConformation(final ExactACset set, - final ArrayList genotypeLikelihoods, - final int numChr, - final LinkedList ACqueue, - final HashMap indexesToACset, - final double[] log10AlleleFrequencyPriors) { - - //if ( DEBUG ) - // System.out.printf(" *** computing LofK for set=%s%n", 
set.ACcounts); - - // compute the log10Likelihoods - computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors); - - final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; - - // can we abort early because the log10Likelihoods are so small? - if ( getStateTracker().abort(log10LofK, set.getACcounts(), true) ) { - //if ( DEBUG ) - // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); - return log10LofK; - } - - // iterate over higher frequencies if possible - final int ACwiggle = numChr - set.getACsum(); - if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies - return log10LofK; - - final int numAltAlleles = set.getACcounts().getCounts().length; - - // add conformations for the k+1 case - for ( int allele = 0; allele < numAltAlleles; allele++ ) { - final int[] ACcountsClone = set.getACcounts().getCounts().clone(); - ACcountsClone[allele]++; - // to get to this conformation, a sample would need to be AB (remember that ref=0) - final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); - updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); - } - - // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different - if ( ACwiggle > 1 ) { - final ArrayList differentAlleles = new ArrayList(numAltAlleles * numAltAlleles); - final ArrayList sameAlleles = new ArrayList(numAltAlleles); - - for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { - for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { - final int[] ACcountsClone = set.getACcounts().getCounts().clone(); - ACcountsClone[allele_i]++; - ACcountsClone[allele_j]++; - - // to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index) - final int PLindex = 
GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1); - if ( allele_i == allele_j ) - sameAlleles.add(new DependentSet(ACcountsClone, PLindex)); - else - differentAlleles.add(new DependentSet(ACcountsClone, PLindex)); - } - } - - // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering - for ( DependentSet dependent : differentAlleles ) - updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); - for ( DependentSet dependent : sameAlleles ) - updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); - } - - return log10LofK; - } - - // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and - // also pushes its value to the given callingSetIndex. - private void updateACset(final int[] newSetCounts, - final int numChr, - final ExactACset dependentSet, - final int PLsetIndex, - final Queue ACqueue, - final HashMap indexesToACset, - final ArrayList genotypeLikelihoods) { - final ExactACcounts index = new ExactACcounts(newSetCounts); - if ( !indexesToACset.containsKey(index) ) { - ExactACset set = new ExactACset(numChr/2 +1, index); - indexesToACset.put(index, set); - ACqueue.add(set); - } - - // push data from the dependency to the new set - //if ( DEBUG ) - // System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts); - pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods); - } - - private void computeLofK(final ExactACset set, - final ArrayList genotypeLikelihoods, - final double[] log10AlleleFrequencyPriors) { - - set.getLog10Likelihoods()[0] = 0.0; // the zero case - final int totalK = set.getACsum(); - - // special case for k = 0 over all k - if ( totalK == 0 ) { - for ( int j = 1; j < set.getLog10Likelihoods().length; j++ ) - set.getLog10Likelihoods()[j] = 
set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; - - final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; - getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); - getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); - return; - } - - // if we got here, then k > 0 for at least one k. - // the non-AA possible conformations were already dealt with by pushes from dependent sets; - // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value - for ( int j = 1; j < set.getLog10Likelihoods().length; j++ ) { - - if ( totalK < 2*j-1 ) { - final double[] gl = genotypeLikelihoods.get(j); - final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.getLog10Likelihoods()[j-1] + gl[HOM_REF_INDEX]; - set.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[j], conformationValue); - } - - final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; - set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j] - logDenominator; - } - - double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; - - // update the MLE if necessary - getStateTracker().updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); - - // apply the priors over each alternate allele - for ( final int ACcount : set.getACcounts().getCounts() ) { - if ( ACcount > 0 ) - log10LofK += log10AlleleFrequencyPriors[ACcount]; - } - - getStateTracker().updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); - } - - private void pushData(final ExactACset targetSet, - final ExactACset dependentSet, - final int PLsetIndex, - final ArrayList genotypeLikelihoods) { - final int totalK = targetSet.getACsum(); - - for ( int j = 1; j < targetSet.getLog10Likelihoods().length; j++ ) { - - if ( totalK <= 2*j ) { // skip impossible conformations - 
final double[] gl = genotypeLikelihoods.get(j); - final double conformationValue = - determineCoefficient(PLsetIndex, j, targetSet.getACcounts().getCounts(), totalK) + dependentSet.getLog10Likelihoods()[j-1] + gl[PLsetIndex]; - targetSet.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(targetSet.getLog10Likelihoods()[j], conformationValue); - } - } - } - - private double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { - // the closed form representation generalized for multiple alleles is as follows: - // AA: (2j - totalK) * (2j - totalK - 1) - // AB: 2k_b * (2j - totalK) - // AC: 2k_c * (2j - totalK) - // BB: k_b * (k_b - 1) - // BC: 2 * k_b * k_c - // CC: k_c * (k_c - 1) - - // find the 2 alleles that are represented by this PL index - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - - // *** note that throughout this method we subtract one from the alleleIndex because ACcounts *** - // *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. *** - - // the AX het case - if ( alleles.alleleIndex1 == 0 ) - return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK]; - - final int k_i = ACcounts[alleles.alleleIndex1-1]; - - // the hom var case (e.g. BB, CC, DD) - final double coeff; - if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) { - coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; - } - // the het non-ref case (e.g. BC, BD, CD) - else { - final int k_j = ACcounts[alleles.alleleIndex2-1]; - coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; - } - - return coeff; - } - - public GenotypesContext subsetAlleles(final VariantContext vc, - final List allelesToUse, - final boolean assignGenotypes, - final int ploidy) { - return allelesToUse.size() == 1 - ? 
GATKVariantContextUtils.subsetToRefOnly(vc, ploidy) - : GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, - assignGenotypes ? GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN : GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java deleted file mode 100644 index 3d28db159..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java +++ /dev/null @@ -1,102 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypesContext; - -import java.util.ArrayList; - -/** - * Uses the Exact calculation of Heng Li - */ -abstract class ExactAFCalc extends AFCalc { - protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first - - protected ExactAFCalc(final int nSamples, int maxAltAlleles, final int ploidy) { - super(nSamples, maxAltAlleles, ploidy); - } - - /** - * Wrapper class that compares two likelihoods associated with two alleles - */ - protected static final class LikelihoodSum implements Comparable { - public double sum = 0.0; - public Allele allele; - - public LikelihoodSum(Allele allele) { this.allele = allele; } - - public int compareTo(LikelihoodSum other) { - final double diff = sum - other.sum; - return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? 
-1 : 0; - } - } - - /** - * Unpack GenotypesContext into arraylist of doubel values - * @param GLs Input genotype context - * @return ArrayList of doubles corresponding to GL vectors - */ - protected static ArrayList getGLs(final GenotypesContext GLs, final boolean includeDummy) { - ArrayList genotypeLikelihoods = new ArrayList(GLs.size() + 1); - - if ( includeDummy ) genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy - for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { - if ( sample.hasLikelihoods() ) { - double[] gls = sample.getLikelihoods().getAsVector(); - - if ( MathUtils.sum(gls) < GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) - genotypeLikelihoods.add(gls); - } - } - - return genotypeLikelihoods; - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java deleted file mode 100644 index f8c364e82..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ /dev/null @@ -1,636 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.variant.variantcontext.*; - -import java.util.*; - -public class GeneralPloidyExactAFCalc extends ExactAFCalc { - static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them - - private final int ploidy; - private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - private final static boolean VERBOSE = false; - - protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { - super(nSamples, maxAltAlleles, ploidy); - this.ploidy = ploidy; - } - - @Override - protected VariantContext reduceScope(VariantContext vc) { - // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > getMaxAltAlleles()) { - logger.warn("this tool is currently set to genotype at most " + 
getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); - - final List alleles = new ArrayList(getMaxAltAlleles() + 1); - alleles.add(vc.getReference()); - alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles(), ploidy)); - - VariantContextBuilder builder = new VariantContextBuilder(vc); - builder.alleles(alleles); - builder.genotypes(subsetAlleles(vc, alleles, false, ploidy)); - return builder.make(); - } else { - return vc; - } - } - - @Override - public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { - combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors); - return getResultFromFinalState(vc, log10AlleleFrequencyPriors); - } - - /** - * Simple wrapper class to hold values of combined pool likelihoods. - * For fast hashing and fast retrieval, there's a hash map that shadows main list. 
- * - */ - static class CombinedPoolLikelihoods { - private LinkedList alleleCountSetList; - private HashMap conformationMap; - private double maxLikelihood; - - - public CombinedPoolLikelihoods() { - // final int numElements = GenotypeLikelihoods.numLikelihoods(); - alleleCountSetList = new LinkedList(); - conformationMap = new HashMap(); - maxLikelihood = Double.NEGATIVE_INFINITY; - } - - public void add(ExactACset set) { - alleleCountSetList.add(set); - conformationMap.put(set.getACcounts(), set); - final double likelihood = set.getLog10Likelihoods()[0]; - - if (likelihood > maxLikelihood ) - maxLikelihood = likelihood; - - } - - public boolean hasConformation(int[] ac) { - return conformationMap.containsKey(new ExactACcounts(ac)); - - } - - public double getLikelihoodOfConformation(int[] ac) { - return conformationMap.get(new ExactACcounts(ac)).getLog10Likelihoods()[0]; - } - - public double getGLOfACZero() { - return alleleCountSetList.get(0).getLog10Likelihoods()[0]; // AC 0 is always at beginning of list - } - - public int getLength() { - return alleleCountSetList.size(); - } - } - - /** - * - * Chooses N most likely alleles in a set of pools (samples) based on GL sum over alt alleles - * @param vc Input variant context - * @param numAllelesToChoose Number of alleles to choose - * @param ploidy Ploidy per pool - * @return list of numAllelesToChoose most likely alleles - */ - - private static final int PL_INDEX_OF_HOM_REF = 0; - private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose, int ploidy) { - final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); - final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; - for ( int i = 0; i < numOriginalAltAlleles; i++ ) - likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); - - // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype - final ArrayList GLs = 
getGLs(vc.getGenotypes(), false); - for ( final double[] likelihoods : GLs ) { - - final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); - final int[] acCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(1 + numOriginalAltAlleles, ploidy, PLindexOfBestGL); - // by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele - for (int k=1; k < acCount.length;k++) { - if (acCount[k] > 0) - likelihoodSums[k-1].sum += acCount[k] * (likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]); - - } - } - - // sort them by probability mass and choose the best ones - Collections.sort(Arrays.asList(likelihoodSums)); - final ArrayList bestAlleles = new ArrayList(numAllelesToChoose); - for ( int i = 0; i < numAllelesToChoose; i++ ) - bestAlleles.add(likelihoodSums[i].allele); - - final ArrayList orderedBestAlleles = new ArrayList(numAllelesToChoose); - for ( Allele allele : vc.getAlternateAlleles() ) { - if ( bestAlleles.contains(allele) ) - orderedBestAlleles.add(allele); - } - - return orderedBestAlleles; - } - - - /** - * Simple non-optimized version that combines GLs from several pools and produces global AF distribution. - * @param GLs Inputs genotypes context with per-pool GLs - * @param numAlleles Number of alternate alleles - * @param ploidyPerPool Number of samples per pool - * @param log10AlleleFrequencyPriors Frequency priors - */ - protected void combineSinglePools(final GenotypesContext GLs, - final int numAlleles, - final int ploidyPerPool, - final double[] log10AlleleFrequencyPriors) { - - final ArrayList genotypeLikelihoods = getGLs(GLs, true); - - - int combinedPloidy = 0; - - // Combine each pool incrementally - likelihoods will be renormalized at each step - CombinedPoolLikelihoods combinedPoolLikelihoods = new CombinedPoolLikelihoods(); - - // first element: zero ploidy, e.g. 
trivial degenerate distribution - final int[] zeroCounts = new int[numAlleles]; - final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts)); - set.getLog10Likelihoods()[0] = 0.0; - - combinedPoolLikelihoods.add(set); - - if ( genotypeLikelihoods.size() <= 1 ) { - // no meaningful GLs at all, just set the tracker to non poly values - getStateTracker().reset(); // just mimic-ing call below - getStateTracker().setLog10LikelihoodOfAFzero(0.0); - } else { - for (int p=1; p ACqueue = new LinkedList(); - // mapping of ExactACset indexes to the objects - final HashMap indexesToACset = new HashMap(); - final CombinedPoolLikelihoods newPool = new CombinedPoolLikelihoods(); - - // add AC=0 to the queue - final int[] zeroCounts = new int[numAlleles]; - final int newPloidy = originalPloidy + newGLPloidy; - zeroCounts[0] = newPloidy; - - ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts)); - - ACqueue.add(zeroSet); - indexesToACset.put(zeroSet.getACcounts(), zeroSet); - - // keep processing while we have AC conformations that need to be calculated - while ( !ACqueue.isEmpty() ) { - getStateTracker().incNEvaluations(); - // compute log10Likelihoods - final ExactACset ACset = ACqueue.remove(); - final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset); - - // clean up memory - indexesToACset.remove(ACset.getACcounts()); - if ( VERBOSE ) - System.out.printf(" *** removing used set=%s%n", ACset.getACcounts()); - - } - return newPool; - } - - // todo - refactor, function almost identical except for log10LofK computation in GeneralPloidyGenotypeLikelihoods - /** - * - * @param set ExactACset holding conformation to be computed - * @param newPool New pool likelihood holder - * @param originalPool Original likelihood holder - * @param newGL New pool GL vector to combine - * @param log10AlleleFrequencyPriors Prior object - * @param 
originalPloidy Total ploidy of original combined pool - * @param newGLPloidy Ploidy of GL vector - * @param ACqueue Queue of conformations to compute - * @param indexesToACset AC indices of objects in queue - * @return max log likelihood - */ - private double calculateACConformationAndUpdateQueue(final ExactACset set, - final CombinedPoolLikelihoods newPool, - final CombinedPoolLikelihoods originalPool, - final double[] newGL, - final double[] log10AlleleFrequencyPriors, - final int originalPloidy, - final int newGLPloidy, - final LinkedList ACqueue, - final HashMap indexesToACset) { - - // compute likeihood in "set" of new set based on original likelihoods - final int numAlleles = set.getACcounts().getCounts().length; - final int newPloidy = set.getACsum(); - final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy); - - - // add to new pool - if (!Double.isInfinite(log10LofK)) - newPool.add(set); - - // TODO -- change false to true this correct line when the implementation of this model is optimized (it's too slow now to handle this fix) - if ( getStateTracker().abort(log10LofK, set.getACcounts(), false) ) { - return log10LofK; - } - - // iterate over higher frequencies if possible - // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count. - // so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space - final int ACwiggle = set.getACcounts().getCounts()[0]; - if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies - return log10LofK; - - - // add conformations for other cases - for ( int allele = 1; allele < numAlleles; allele++ ) { - final int[] ACcountsClone = set.getACcounts().getCounts().clone(); - ACcountsClone[allele]++; - // is this a valid conformation? 
- int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0]; - ACcountsClone[0] = newPloidy - altSum; - if (ACcountsClone[0] < 0) - continue; - - - GeneralPloidyGenotypeLikelihoods.updateACset(ACcountsClone, ACqueue, indexesToACset); - } - - - return log10LofK; - } - - -// /** -// * Naive combiner of two multiallelic pools - number of alt alleles must be the same. -// * Math is generalization of biallelic combiner. -// * -// * For vector K representing an allele count conformation, -// * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K) -// * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...]) -// * @param originalPool First log-likelihood pool GL vector -// * @param yy Second pool GL vector -// * @param ploidy1 Ploidy of first pool (# of chromosomes in it) -// * @param ploidy2 Ploidy of second pool -// * @param numAlleles Number of alleles -// * @param log10AlleleFrequencyPriors Array of biallelic priors -// * @param resultTracker Af calculation result object -// */ -// public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, -// final double[] log10AlleleFrequencyPriors, -// final AFCalcResultTracker resultTracker) { -///* -// final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); -// final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); -// -// if (dim1 != originalPool.getLength() || dim2 != yy.length) -// throw new ReviewedStingException("BUG: Inconsistent vector length"); -// -// if (ploidy2 == 0) -// return; -// -// final int newPloidy = ploidy1 + ploidy2; -// -// // Say L1(K) = Pr(D|AC1=K) * choose(m1,K) -// // and L2(K) = Pr(D|AC2=K) * choose(m2,K) -// GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); -// final double[] x = originalPool.getLikelihoodsAsVector(true); -// while(firstIterator.hasNext()) { -// 
x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); -// firstIterator.next(); -// } -// -// GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); -// final double[] y = yy.clone(); -// while(secondIterator.hasNext()) { -// y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); -// secondIterator.next(); -// } -// -// // initialize output to -log10(choose(m1+m2,[k1 k2...]) -// final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); -// final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); -// -// -// // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K -// while(outputIterator.hasNext()) { -// final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector())); -// double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result); -// -// originalPool.add(likelihood, set, outputIterator.getLinearIndex()); -// outputIterator.next(); -// } -//*/ -// } - - /** - * Compute likelihood of a particular AC conformation and update AFresult object - * @param set Set of AC counts to compute - * @param firstGLs Original pool likelihoods before combining - * @param secondGL New GL vector with additional pool - * @param log10AlleleFrequencyPriors Allele frequency priors - * @param numAlleles Number of alleles (including ref) - * @param ploidy1 Ploidy of original pool (combined) - * @param ploidy2 Ploidy of new pool - * @return log-likehood of requested conformation - */ - private double computeLofK(final ExactACset set, - final CombinedPoolLikelihoods firstGLs, - final double[] secondGL, - final double[] log10AlleleFrequencyPriors, - final int 
numAlleles, final int ploidy1, final int ploidy2) { - - final int newPloidy = ploidy1 + ploidy2; - - // sanity check - int totalAltK = set.getACsum(); - if (newPloidy != totalAltK) - throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values"); - - totalAltK -= set.getACcounts().getCounts()[0]; - // totalAltK has sum of alt alleles of conformation now - - - // special case for k = 0 over all k - if ( totalAltK == 0 ) { // all-ref case - final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX]; - set.getLog10Likelihoods()[0] = log10Lof0; - - getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); - getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); - return log10Lof0; - - } else { - - // initialize result with denominator - // ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy. - // To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i - - int[] currentCount = set.getACcounts().getCounts(); - double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount); - - // for current conformation, get all possible ways to break vector K into two components G1 and G2 - final GeneralPloidyGenotypeLikelihoods.SumIterator innerIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); - set.getLog10Likelihoods()[0] = Double.NEGATIVE_INFINITY; - while (innerIterator.hasNext()) { - // check if breaking current conformation into g1 and g2 is feasible. 
- final int[] acCount2 = innerIterator.getCurrentVector(); - final int[] acCount1 = MathUtils.vectorDiff(currentCount, acCount2); - final int idx2 = innerIterator.getLinearIndex(); - // see if conformation is valid and if original pool had this conformation - // for conformation to be valid, all elements of g2 have to be <= elements of current AC set - if (isValidConformation(acCount1,ploidy1) && firstGLs.hasConformation(acCount1)) { - final double gl2 = secondGL[idx2]; - if (!Double.isInfinite(gl2)) { - final double firstGL = firstGLs.getLikelihoodOfConformation(acCount1); - final double num1 = MathUtils.log10MultinomialCoefficient(ploidy1, acCount1); - final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2); - final double sum = firstGL + gl2 + num1 + num2; - - set.getLog10Likelihoods()[0] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[0], sum); - } - } - innerIterator.next(); - } - - set.getLog10Likelihoods()[0] += denom; - } - - double log10LofK = set.getLog10Likelihoods()[0]; - - // update the MLE if necessary - final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); - // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMLEifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); - - // apply the priors over each alternate allele - for (final int ACcount : altCounts ) { - if ( ACcount > 0 ) - log10LofK += log10AlleleFrequencyPriors[ACcount]; - } - // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMAPifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); - - return log10LofK; - } - - /** - * Small helper routine - is a particular AC conformationv vector valid? ie are all elements non-negative and sum to ploidy? 
- * @param set AC conformation vector - * @param ploidy Ploidy of set - * @return Valid conformation - */ - private static boolean isValidConformation(final int[] set, final int ploidy) { - int sum=0; - for (final int ac: set) { - if (ac < 0) - return false; - sum += ac; - - } - - return (sum == ploidy); - } - - /** - * From a given variant context, extract a given subset of alleles, and update genotype context accordingly, - * including updating the PL's, and assign genotypes accordingly - * @param vc variant context with alleles and genotype likelihoods - * @param allelesToUse alleles to subset - * @param assignGenotypes true: assign hard genotypes, false: leave as no-call - * @param ploidy number of chromosomes per sample (pool) - * @return GenotypesContext with new PLs - */ - public GenotypesContext subsetAlleles(final VariantContext vc, - final List allelesToUse, - final boolean assignGenotypes, - final int ploidy) { - // the genotypes with PLs - final GenotypesContext oldGTs = vc.getGenotypes(); - List NO_CALL_ALLELES = new ArrayList(ploidy); - - for (int k=0; k < ploidy; k++) - NO_CALL_ALLELES.add(Allele.NO_CALL); - - // samples - final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); - - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(); - - // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); - final int numNewAltAlleles = allelesToUse.size() - 1; - - - // create the new genotypes - for ( int k = 0; k < oldGTs.size(); k++ ) { - final Genotype g = oldGTs.get(sampleIndices.get(k)); - if ( !g.hasLikelihoods() ) { - newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); - continue; - } - - // create the new likelihoods array from the alleles we are allowed to use - final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); - double[] newLikelihoods; - - // 
Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization - // and subsetting - if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) { - newLikelihoods = originalLikelihoods; - } else { - newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse); - - // might need to re-normalize - newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); - } - - // if there is no mass on the (new) likelihoods, then just no-call the sample - if ( MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) { - newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); - } - else { - final GenotypeBuilder gb = new GenotypeBuilder(g); - - if ( numNewAltAlleles == 0 ) - gb.noPL(); - else - gb.PL(newLikelihoods); - - // if we weren't asked to assign a genotype, then just no-call the sample - if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) - gb.alleles(NO_CALL_ALLELES); - else - assignGenotype(gb, newLikelihoods, allelesToUse, ploidy); - newGTs.add(gb.make()); - } - } - - return newGTs; - - } - - /** - * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs - * - * @param newLikelihoods the PL array - * @param allelesToUse the list of alleles to choose from (corresponding to the PLs) - * @param numChromosomes Number of chromosomes per pool - * - * @return genotype - */ - private void assignGenotype(final GenotypeBuilder gb, - final double[] newLikelihoods, - final List allelesToUse, - final int numChromosomes) { - final int numNewAltAlleles = allelesToUse.size() - 1; - - - - // find the genotype with maximum likelihoods - final int PLindex = numNewAltAlleles == 0 ? 
0 : MathUtils.maxElementIndex(newLikelihoods); - - final int[] mlAlleleCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(allelesToUse.size(), numChromosomes, PLindex); - final ArrayList alleleFreqs = new ArrayList(); - final ArrayList alleleCounts = new ArrayList(); - - - for (int k=1; k < mlAlleleCount.length; k++) { - alleleCounts.add(mlAlleleCount[k]); - final double freq = (double)mlAlleleCount[k] / (double)numChromosomes; - alleleFreqs.add(freq); - - } - - // per-pool logging of AC and AF - gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts); - gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs); - - // remove PLs if necessary - if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING) - gb.noPL(); - - ArrayList myAlleles = new ArrayList(); - - // add list of called ML genotypes to alleles list - // TODO - too unwieldy? - int idx = 0; - for (int mlind = 0; mlind < mlAlleleCount.length; mlind++) { - for (int k=0; k < mlAlleleCount[mlind]; k++) - myAlleles.add(idx++,allelesToUse.get(mlind)); - } - gb.alleles(myAlleles); - - if ( numNewAltAlleles > 0 ) - gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); - } - -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java deleted file mode 100644 index af5c79230..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ /dev/null @@ -1,426 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between 
the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.variant.variantcontext.*; - -import java.util.*; - -/** - * Computes the conditional bi-allelic exact results - * - * Suppose vc contains 2 alt allele: A* with C and T. This function first computes: - * - * (1) P(D | AF_c > 0 && AF_t == *) [i.e., T can be anything] - * - * it then computes the conditional probability on AF_c == 0: - * - * (2) P(D | AF_t > 0 && AF_c == 0) - * - * Thinking about this visually, we have the following likelihood matrix where each cell is - * the P(D | AF_c == i && AF_t == j): - * - * 0 AF_c > 0 - * ----------------- - * 0 | | - * |--|------------- - * a | | - * f | | - * _ | | - * t | | - * > | | - * 0 | | - * - * What we really want to know how - * - * (3) P(D | AF_c == 0 & AF_t == 0) - * - * compares with - * - * (4) P(D | AF_c > 0 || AF_t > 0) - * - * This is effectively asking for the value in the upper left vs. the sum of all cells. 
- * - * This class implements the conditional likelihoods summation for any number of alt - * alleles, where each alt allele has its EXACT probability of segregating calculated by - * reducing each alt B into the case XB and computing P(D | AF_b > 0 ) as follows: - * - * Suppose we have for a A/B/C site the following GLs: - * - * AA AB BB AC BC CC - * - * and we want to get the bi-allelic GLs for X/B, where X is everything not B - * - * XX = AA + AC + CC (since X = A or C) - * XB = AB + BC - * BB = BB - * - * After each allele has its probability calculated we compute the joint posterior - * as P(D | AF_* == 0) = prod_i P (D | AF_i == 0), after applying the theta^i - * prior for the ith least likely allele. - */ - public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { - /** - * The min. confidence of an allele to be included in the joint posterior. - */ - private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-10); - - private final static int[] BIALLELIC_NON_INFORMATIVE_PLS = new int[]{0,0,0}; - private final static List BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); - - /** - * Sorts AFCalcResults by their posteriors of AF > 0, so the - */ - private final static class CompareAFCalcResultsByPNonRef implements Comparator { - @Override - public int compare(AFCalcResult o1, AFCalcResult o2) { - return -1 * Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0()); - } - } - - private final static CompareAFCalcResultsByPNonRef compareAFCalcResultsByPNonRef = new CompareAFCalcResultsByPNonRef(); - - /** - * The AFCalc model we are using to do the bi-allelic computation - */ - final AFCalc biAlleleExactModel; - - protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) { - super(nSamples, maxAltAlleles, ploidy); - biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, ploidy); - } - - /** - * Trivial subclass that 
helps with debugging by keeping track of the supporting information for this joint call - */ - private static class MyAFCalcResult extends AFCalcResult { - /** - * List of the supporting bi-allelic AFCalcResults that went into making this multi-allelic joint call - */ - final List supporting; - - private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pRefByAllele, List supporting) { - super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pRefByAllele); - this.supporting = supporting; - } - } - - @Override - public AFCalcResult computeLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors) { - final List independentResultTrackers = computeAlleleIndependentExact(vc, log10AlleleFrequencyPriors); - - if ( independentResultTrackers.size() == 0 ) - throw new IllegalStateException("Independent alleles model returned an empty list of results at VC " + vc); - - if ( independentResultTrackers.size() == 1 ) { - // fast path for the very common bi-allelic use case - return independentResultTrackers.get(0); - } else { - // we are a multi-allelic, so we need to actually combine the results - final List withMultiAllelicPriors = applyMultiAllelicPriors(independentResultTrackers); - return combineIndependentPNonRefs(vc, withMultiAllelicPriors); - } - } - - /** - * Compute the conditional exact AFCalcResult for each allele in vc independently, returning - * the result of each, in order of the alt alleles in VC - * - * @param vc the VariantContext we want to analyze, with at least 1 alt allele - * @param log10AlleleFrequencyPriors the priors - * @return a list of the AFCalcResults for each bi-allelic sub context of vc - */ - @Requires({"vc != null", "log10AlleleFrequencyPriors != null"}) - @Ensures("goodIndependentResult(vc, result)") - protected final List computeAlleleIndependentExact(final 
VariantContext vc, - final double[] log10AlleleFrequencyPriors) { - final List results = new LinkedList(); - - for ( final VariantContext subvc : makeAlleleConditionalContexts(vc) ) { - final AFCalcResult resultTracker = biAlleleExactModel.getLog10PNonRef(subvc, log10AlleleFrequencyPriors); - results.add(resultTracker); - } - - return results; - } - - /** - * Helper function to ensure that the computeAlleleIndependentExact is returning reasonable results - */ - private static boolean goodIndependentResult(final VariantContext vc, final List results) { - if ( results.size() != vc.getNAlleles() - 1) return false; - for ( int i = 0; i < results.size(); i++ ) { - if ( results.get(i).getAllelesUsedInGenotyping().size() != 2 ) - return false; - if ( ! results.get(i).getAllelesUsedInGenotyping().contains(vc.getAlternateAllele(i)) ) - return false; - } - - return true; - } - - /** - * Returns the bi-allelic variant context for each alt allele in vc with bi-allelic likelihoods, in order - * - * @param vc the variant context to split. Must have n.alt.alleles > 1 - * @return a bi-allelic variant context for each alt allele in vc - */ - @Requires({"vc != null", "vc.getNAlleles() > 1"}) - @Ensures("result.size() == vc.getNAlleles() - 1") - protected final List makeAlleleConditionalContexts(final VariantContext vc) { - final int nAltAlleles = vc.getNAlleles() - 1; - - if ( nAltAlleles == 1 ) { - // fast path for bi-allelic case. 
- return Collections.singletonList(vc); - } else { - // go through the work of ripping up the VC into its biallelic components - final List vcs = new LinkedList(); - - for ( int altI = 0; altI < nAltAlleles; altI++ ) { - vcs.add(biallelicCombinedGLs(vc, altI + 1)); - } - - return vcs; - } - } - - /** - * Create a single bi-allelic variant context from rootVC with alt allele with index altAlleleIndex - * - * @param rootVC the root (potentially multi-allelic) variant context - * @param altAlleleIndex index of the alt allele, from 0 == first alt allele - * @return a bi-allelic variant context based on rootVC - */ - @Requires({"rootVC.getNAlleles() > 1", "altAlleleIndex < rootVC.getNAlleles()"}) - @Ensures({"result.isBiallelic()"}) - protected final VariantContext biallelicCombinedGLs(final VariantContext rootVC, final int altAlleleIndex) { - if ( rootVC.isBiallelic() ) { - return rootVC; - } else { - final int nAlts = rootVC.getNAlleles() - 1; - final List biallelicGenotypes = new ArrayList(rootVC.getNSamples()); - for ( final Genotype g : rootVC.getGenotypes() ) - biallelicGenotypes.add(combineGLs(g, altAlleleIndex, nAlts)); - - final VariantContextBuilder vcb = new VariantContextBuilder(rootVC); - final Allele altAllele = rootVC.getAlternateAllele(altAlleleIndex - 1); - vcb.alleles(Arrays.asList(rootVC.getReference(), altAllele)); - vcb.genotypes(biallelicGenotypes); - return vcb.make(); - } - } - - /** - * Returns a new Genotype with the PLs of the multi-allelic original reduced to a bi-allelic case - * - * This is handled in the following way: - * - * Suppose we have for a A/B/C site the following GLs: - * - * AA AB BB AC BC CC - * - * and we want to get the bi-allelic GLs for X/B, where X is everything not B - * - * XX = AA + AC + CC (since X = A or C) - * XB = AB + BC - * BB = BB - * - * @param original the original multi-allelic genotype - * @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0 - * @param nAlts the 
total number of alt alleles - * @return a new biallelic genotype with appropriate PLs - */ - @Requires({"original.hasLikelihoods()"}) // TODO -- add ploidy == 2 test "original.getPLs() == null || original.getPLs().length == 3"}) - @Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"}) - protected Genotype combineGLs(final Genotype original, final int altIndex, final int nAlts ) { - if ( original.isNonInformative() ) - return new GenotypeBuilder(original).PL(BIALLELIC_NON_INFORMATIVE_PLS).alleles(BIALLELIC_NOCALL).make(); - - if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts); - - final double[] normalizedPr = MathUtils.normalizeFromLog10(GenotypeLikelihoods.fromPLs(original.getPL()).getAsVector()); - final double[] biAllelicPr = new double[3]; - - for ( int index = 0; index < normalizedPr.length; index++ ) { - final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index); - - if ( pair.alleleIndex1 == altIndex ) { - if ( pair.alleleIndex2 == altIndex ) - // hom-alt case - biAllelicPr[2] = normalizedPr[index]; - else - // het-alt case - biAllelicPr[1] += normalizedPr[index]; - } else { - if ( pair.alleleIndex2 == altIndex ) - // het-alt case - biAllelicPr[1] += normalizedPr[index]; - else - // hom-non-alt - biAllelicPr[0] += normalizedPr[index]; - } - } - - final double[] GLs = new double[3]; - for ( int i = 0; i < GLs.length; i++ ) GLs[i] = Math.log10(biAllelicPr[i]); - - return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make(); - } - - protected final List applyMultiAllelicPriors(final List conditionalPNonRefResults) { - final ArrayList sorted = new ArrayList(conditionalPNonRefResults); - - // sort the results, so the most likely allele is first - Collections.sort(sorted, compareAFCalcResultsByPNonRef); - - double lastPosteriorGt0 = sorted.get(0).getLog10PosteriorOfAFGT0(); - final double log10SingleAllelePriorOfAFGt0 = 
conditionalPNonRefResults.get(0).getLog10PriorOfAFGT0(); - - for ( int i = 0; i < sorted.size(); i++ ) { - if ( sorted.get(i).getLog10PosteriorOfAFGT0() > lastPosteriorGt0 ) - throw new IllegalStateException("pNonRefResults not sorted: lastPosteriorGt0 " + lastPosteriorGt0 + " but current is " + sorted.get(i).getLog10PosteriorOfAFGT0()); - - final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0; - final double log10PriorAFEq0 = Math.log10(1 - Math.pow(10, log10PriorAFGt0)); - final double[] thetaTONPriors = new double[] { log10PriorAFEq0, log10PriorAFGt0 }; - - // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior - sorted.set(i, sorted.get(i).withNewPriors(MathUtils.normalizeFromLog10(thetaTONPriors, true))); - } - - return sorted; - } - - - /** - * Take the independent estimates of pNonRef for each alt allele and combine them into a single result - * - * Given n independent calculations for each of n alternate alleles create a single - * combined AFCalcResult with: - * - * priors for AF == 0 equal to theta^N for the nth least likely allele - * posteriors that reflect the combined chance that any alleles are segregating and corresponding - * likelihoods - * combined MLEs in the order of the alt alleles in vc - * - * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently - */ - protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc, - final List sortedResultsWithThetaNPriors) { - int nEvaluations = 0; - final int nAltAlleles = sortedResultsWithThetaNPriors.size(); - final int[] alleleCountsOfMLE = new int[nAltAlleles]; - final double[] log10PriorsOfAC = new double[2]; - final Map log10pRefByAllele = new HashMap(nAltAlleles); - - // the sum of the log10 posteriors for AF == 0 and AF > 0 to determine joint probs - double log10PosteriorOfACEq0Sum = 0.0; - double log10PosteriorOfACGt0Sum = 0.0; - - boolean anyPoly = false; - for ( final AFCalcResult 
sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) { - final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1); - final int altI = vc.getAlleles().indexOf(altAllele) - 1; - - // MLE of altI allele is simply the MLE of this allele in altAlleles - alleleCountsOfMLE[altI] = sortedResultWithThetaNPriors.getAlleleCountAtMLE(altAllele); - - // the AF > 0 case requires us to store the normalized likelihood for later summation - if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) { - anyPoly = true; - log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0(); - log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0(); - log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); - } - - log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0(); - - // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior - log10pRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0()); - - // trivial -- update the number of evaluations - nEvaluations += sortedResultWithThetaNPriors.nEvaluations; - } - - // If no alleles were polymorphic, make sure we have the proper priors (the defaults) for likelihood calculation - if ( ! anyPoly ) { - log10PriorsOfAC[0] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFEq0(); - log10PriorsOfAC[1] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFGT0(); - } - - // In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C, - // the probability of being poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y). We want to estimate confidently - // log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). 
This sum is log10PosteriorOfACEq0 - // - // note we need to handle the case where the posterior of AF == 0 is 0.0, in which case we - // use the summed log10PosteriorOfACGt0Sum directly. This happens in cases where - // AF > 0 : 0.0 and AF == 0 : -16, and if you use the inverse calculation you get 0.0 and MathUtils.LOG10_P_OF_ZERO - final double log10PosteriorOfACGt0; - if ( log10PosteriorOfACEq0Sum == 0.0 ) - log10PosteriorOfACGt0 = log10PosteriorOfACGt0Sum; - else - log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO); - - final double[] log10LikelihoodsOfAC = new double[] { - // L + prior = posterior => L = poster - prior - log10PosteriorOfACEq0Sum - log10PriorsOfAC[0], - log10PosteriorOfACGt0 - log10PriorsOfAC[1] - }; - - return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(), - // necessary to ensure all values < 0 - MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), - // priors incorporate multiple alt alleles, must be normalized - MathUtils.normalizeFromLog10(log10PriorsOfAC, true), - log10pRefByAllele, sortedResultsWithThetaNPriors); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java deleted file mode 100644 index f1db5bcd7..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java +++ /dev/null @@ -1,151 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.LinkedList; -import java.util.List; -import java.util.TreeSet; - -/** - * Trim down an active region based on a set of variants found across the haplotypes within the region - * - * User: depristo - * Date: 4/27/13 - * Time: 2:10 PM - */ -class ActiveRegionTrimmer { - private final static Logger logger = Logger.getLogger(ActiveRegionTrimmer.class); - private final boolean logTrimming; - private final int snpPadding, nonSnpPadding, maxDistanceInExtensionForGenotyping; - private final GenomeLocParser parser; - - /** - * Create a new ActiveRegionTrimmer - * - * @param logTrimming should we log our trimming events? - * @param snpPadding how much bp context should we ensure around snps? - * @param nonSnpPadding how much bp context should we ensure around anything not a snp? 
- * @param maxDistanceInExtensionForGenotyping the max extent we are will to go into the extended region of the - * origin active region in order to properly genotype events in the - * non-extended active region? - * @param parser a genome loc parser so we can create genome locs - */ - ActiveRegionTrimmer(boolean logTrimming, int snpPadding, int nonSnpPadding, int maxDistanceInExtensionForGenotyping, GenomeLocParser parser) { - if ( snpPadding < 0 ) throw new IllegalArgumentException("snpPadding must be >= 0 but got " + snpPadding); - if ( nonSnpPadding < 0 ) throw new IllegalArgumentException("nonSnpPadding must be >= 0 but got " + nonSnpPadding); - if ( maxDistanceInExtensionForGenotyping < 0 ) throw new IllegalArgumentException("maxDistanceInExtensionForGenotyping must be >= 0 but got " + maxDistanceInExtensionForGenotyping); - if ( parser == null ) throw new IllegalArgumentException("parser cannot be null"); - - logger.debug("Trimmer created with parameters " + logTrimming + " " + snpPadding + " " + nonSnpPadding + " " + maxDistanceInExtensionForGenotyping); - this.logTrimming = logTrimming; - this.snpPadding = snpPadding; - this.nonSnpPadding = nonSnpPadding; - this.maxDistanceInExtensionForGenotyping = maxDistanceInExtensionForGenotyping; - this.parser = parser; - } - - /** - * Trim down the active region to a region large enough to properly genotype the events found within the active - * region span, excluding all variants that only occur within its extended span. - * - * This function merely creates the region, but it doesn't populate the reads back into the region. - * - * @param region our full active region - * @param allVariantsWithinExtendedRegion all of the variants found in the entire region, sorted by their start position - * @param emitReferenceConfidence are we going to estimate the reference confidence with this active region? 
- * @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully - */ - public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet allVariantsWithinExtendedRegion, final boolean emitReferenceConfidence) { - - if ( allVariantsWithinExtendedRegion.isEmpty() ) // no variants, so just return the current region - return null; - - final List withinActiveRegion = new LinkedList<>(); - boolean foundNonSnp = false; - GenomeLoc trimLoc = null; - for ( final VariantContext vc : allVariantsWithinExtendedRegion ) { - final GenomeLoc vcLoc = parser.createGenomeLoc(vc); - if ( region.getLocation().overlapsP(vcLoc) ) { - if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding - foundNonSnp = true; - trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc); - withinActiveRegion.add(vc); - } - } - final int pad = ( emitReferenceConfidence || foundNonSnp ? nonSnpPadding : snpPadding ); - - // we don't actually have anything in the region after removing variants that don't overlap the region's full location - if ( trimLoc == null ) return null; - -// final GenomeLoc maxSpan = parser.createPaddedGenomeLoc(region.getLocation(), maxDistanceInExtensionForGenotyping); - // Try to have one kmer before and after any event. 
- - final GenomeLoc regionLoc = region.getLocation(); - final GenomeLoc maxSpan = parser.createPaddedGenomeLoc(region.getLocation(), maxDistanceInExtensionForGenotyping); - final GenomeLoc idealSpan = parser.createPaddedGenomeLoc(trimLoc, pad); - final GenomeLoc finalSpan = maxSpan.intersect(idealSpan); - - final ActiveRegion trimmedRegion = region.trim(finalSpan); - if ( logTrimming ) { - logger.info("events : " + withinActiveRegion); - logger.info("region : " + regionLoc); - logger.info("trimLoc : " + trimLoc); - logger.info("pad : " + pad); - logger.info("idealSpan : " + idealSpan); - logger.info("maxSpan : " + maxSpan); - logger.info("finalSpan : " + finalSpan); - logger.info("regionSpan : " + trimmedRegion.getExtendedLoc() + " size is " + trimmedRegion.getExtendedLoc().size()); - } - return trimmedRegion; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java deleted file mode 100644 index 091c09e8d..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java +++ /dev/null @@ -1,466 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.collections.CountSet; -import org.broadinstitute.sting.utils.collections.CountSet; -import org.broadinstitute.sting.utils.haplotype.Haplotype; - -import java.io.PrintWriter; -import java.io.StringWriter; -import java.util.*; - -/** - * Collection of read assembly using several kmerSizes. - * - *

- * There could be a different assembly per each kmerSize. In turn, haplotypes are result of one of those - * assemblies. - *

- * - *

- * Where there is more than one possible kmerSize that generates a haplotype we consider the smaller one. - *

- * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.com> - */ -public class AssemblyResultSet { - - private final Map assemblyResultByKmerSize; - private final Set haplotypes; - private final Map assemblyResultByHaplotype; - private ActiveRegion regionForGenotyping; - private byte[] fullReferenceWithPadding; - private GenomeLoc paddedReferenceLoc; - private boolean variationPresent; - private Haplotype refHaplotype; - private boolean wasTrimmed = false; - private final CountSet kmerSizes; - - /** - * Constructs a new empty assembly result set. - */ - public AssemblyResultSet() { - assemblyResultByKmerSize = new LinkedHashMap<>(4); - haplotypes = new LinkedHashSet<>(10); - assemblyResultByHaplotype = new LinkedHashMap<>(10); - kmerSizes = new CountSet(4); - } - - /** - * Trims an assembly result set down based on a new set of trimmed haplotypes. - * - * @param originalByTrimmedHaplotypes map from trimmed to original haplotypes. - * @param trimmedActiveRegion the trimmed down active region. - * - * @throws NullPointerException if any argument in {@code null} or - * if there are {@code null} entries in {@code originalByTrimmedHaplotypes} for trimmed haplotype keys. - * @throws IllegalArgumentException if there is no reference haplotype amongst the trimmed ones. - * - * - * @return never {@code null}, a new trimmed assembly result set. 
- */ - public AssemblyResultSet trimTo(final ActiveRegion trimmedActiveRegion, - final Map originalByTrimmedHaplotypes) { - if (refHaplotype == null) throw new IllegalStateException(); - if (trimmedActiveRegion == null) throw new NullPointerException(); - final AssemblyResultSet result = new AssemblyResultSet(); - - for (final Haplotype trimmed : originalByTrimmedHaplotypes.keySet()) { - final Haplotype original = originalByTrimmedHaplotypes.get(trimmed); - if (original == null) - throw new NullPointerException("all trimmed haplotypes must have an original one"); - final AssemblyResult as = assemblyResultByHaplotype.get(original); - if (as == null) result.add(trimmed); else result.add(trimmed, as); - } - - result.setRegionForGenotyping(trimmedActiveRegion); - result.setFullReferenceWithPadding(this.fullReferenceWithPadding); - result.setPaddedReferenceLoc(this.paddedReferenceLoc); - if (result.refHaplotype == null) - throw new IllegalStateException("missing reference haplotype in the trimmed set"); - result.wasTrimmed = true; - return result; - } - - /** - * Query the reference haplotype in the result set. - * @return {@code null} if none wasn't yet added, otherwise a reference haplotype. - */ - public Haplotype getReferenceHaplotype() { - return refHaplotype; - } - - /** - * Checks whether there is any variation present in the assembly result set. - * - *

- * This is equivalent to whether there is more than one haplotype. - *

- * - * @return {@code true} if there is variation present, {@code false} otherwise. - */ - public boolean isVariationPresent() { - return variationPresent && haplotypes.size() > 1; - } - - /** - * Dumps debugging information into a print-writer. - * - * @param pw where to dump the information. - * - * @throws NullPointerException if {@code pw} is {@code null}. - */ - public void debugDump(final PrintWriter pw) { - if (getHaplotypeList().size() == 0) { - return; - } - pw.println("Active Region " + this.regionForGenotyping.getLocation()); - pw.println("Extended Act Region " + this.getRegionForGenotyping().getExtendedLoc()); - pw.println("Ref haplotype coords " + getHaplotypeList().get(0).getGenomeLocation()); - pw.println("Haplotype count " + haplotypes.size()); - final Map kmerSizeToCount = new HashMap<>(); - - for (final Map.Entry e : assemblyResultByHaplotype.entrySet()) { - final AssemblyResult as = e.getValue(); - final int kmerSize = as.getGraph().getKmerSize(); - if (kmerSizeToCount.containsKey(kmerSize)) { - kmerSizeToCount.put(kmerSize,kmerSizeToCount.get(kmerSize) + 1); - } else { - kmerSizeToCount.put(kmerSize,1); - } - } - pw.println("Kmer sizes count " + kmerSizeToCount.entrySet().size() ); - Integer[] kmerSizes = new Integer[kmerSizeToCount.size()]; - kmerSizes = kmerSizeToCount.keySet().toArray(kmerSizes); - Arrays.sort(kmerSizes); - pw.println("Kmer sizes values " + Arrays.toString(kmerSizes)); - for (int size : kmerSizes) { - pw.println("Kmer size " + size + " count " + kmerSizeToCount.get(size)); - } - } - - /** - * Adds a haplotype to the result set without indicating a generating assembly result. - * - *

- * It is possible to call this method with the same haplotype several times. In that the second and further - * calls won't have any effect (thus returning {@code false}). - *

- * - * @param h the haplotype to add to the assembly result set. - * - * @throws NullPointerException if {@code h} is {@code null} - * @throws IllegalArgumentException if {@code h} does not have a genome location. - * - * @return {@code true} if the assembly result set has been modified as a result of this call. - */ - public boolean add(final Haplotype h) { - if (h == null) throw new NullPointerException("input haplotype cannot be null"); - if (h.getGenomeLocation() == null) - throw new IllegalArgumentException("the haplotype provided must have a genomic location"); - if (haplotypes.contains(h)) - return false; - haplotypes.add(h); - updateReferenceHaplotype(h); - return true; - } - - /** - * Adds simultaneously a haplotype and the generating assembly-result. - * - *

- * Haplotypes and their assembly-result can be added multiple times although just the first call will have - * any effect (return value is {@code true}). - *

- * - * - * @param h haplotype to add. - * @param ar assembly-result that is assumed to have given rise to that haplotype. - * - * @throws NullPointerException if {@code h} or {@code ar} is {@code null}. - * @throws IllegalArgumentException if {@code h} has not defined genome location. - * - * @return {@code true} iff this called changes the assembly result set. - */ - public boolean add(final Haplotype h, final AssemblyResult ar) { - if (h == null) throw new NullPointerException("input haplotype cannot be null"); - if (ar == null) throw new NullPointerException("input assembly-result cannot be null"); - if (h.getGenomeLocation() == null) - throw new IllegalArgumentException("the haplotype provided must have a genomic location"); - - final boolean assemblyResultAdditionReturn = add(ar); - - if (haplotypes.contains(h)) { - final AssemblyResult previousAr = assemblyResultByHaplotype.get(h); - if (previousAr == null) { - assemblyResultByHaplotype.put(h, ar); - return true; - } else if (!previousAr.equals(ar)) - throw new IllegalStateException("there is already a different assembly result for the input haplotype"); - else - return assemblyResultAdditionReturn; - } else { - haplotypes.add(h); - assemblyResultByHaplotype.put(h,ar); - updateReferenceHaplotype(h); - if (h.isNonReference()) variationPresent = true; - return true; - } - } - - /** - * Add a assembly-result object. - * - * @param ar the assembly result to add. - * - * @throws NullPointerException if {@code ar} is {@code null}. - * @throws IllegalStateException if there is an assembly result with the same kmerSize. - * @return {@code true} iff this addition changed the assembly result set. 
- */ - public boolean add(final AssemblyResult ar) { - if (ar == null) - throw new NullPointerException(); - final int kmerSize = ar.getKmerSize(); - if (assemblyResultByKmerSize.containsKey(kmerSize)) { - if (!assemblyResultByKmerSize.get(kmerSize).equals(ar)) - throw new IllegalStateException("a different assembly result with the same kmerSize was already added"); - return false; - } else { - assemblyResultByKmerSize.put(kmerSize, ar); - kmerSizes.add(kmerSize); - return true; - } - } - - /** - * Returns the current region for genotyping. - * - * @return might be {@code null}. - */ - public ActiveRegion getRegionForGenotyping() { - return regionForGenotyping; - } - - /** - * Sets the region for genotyping. - * - * @param regionForGenotyping the new value. - */ - public void setRegionForGenotyping(final ActiveRegion regionForGenotyping) { - this.regionForGenotyping = regionForGenotyping; - } - - /** - * Returns the current full reference with padding. - * - * @return might be {@code null}. - */ - public byte[] getFullReferenceWithPadding() { - return fullReferenceWithPadding; - } - - /** - * Sets the full reference with padding base sequence. - * - * @param fullReferenceWithPadding the new value. - */ - public void setFullReferenceWithPadding(final byte[] fullReferenceWithPadding) { - this.fullReferenceWithPadding = fullReferenceWithPadding; - } - - /** - * Returns the padded reference location. - * - * @return might be {@code null} - */ - public GenomeLoc getPaddedReferenceLoc() { - return paddedReferenceLoc; - } - - /** - * Changes the padded reference location. - * @param paddedReferenceLoc the new value. - */ - public void setPaddedReferenceLoc(final GenomeLoc paddedReferenceLoc) { - this.paddedReferenceLoc = paddedReferenceLoc; - } - - /** - * Returns the number of haplotypes in the assembly result set. - * @return {@code 0} or greater. - */ - public int getHaplotypeCount() { - return haplotypes.size(); - } - - /** - * Returns the haplotypes as a list. - * - *

- * The result is unmodifiable. - *

- * - * @return never {@code null}, but perhaps a empty list if no haplotype was generated during assembly. - */ - public List getHaplotypeList() { - return Arrays.asList(haplotypes.toArray(new Haplotype[haplotypes.size()])); - } - - /** - * Returns the maximum kmerSize available. - * - * @throws IllegalStateException if no assembly-result was added to the set, thus there is no kmerSize. - * - * @return greater than 0. - */ - public int getMaximumKmerSize() { - if (kmerSizes.size() == 0) - throw new IllegalStateException("there is yet no kmerSize in this assembly result set"); - return kmerSizes.max(); - } - - /** - * Indicates whether there are more than one kmerSize in the set. - * - * @return {@code true} iff there is more than one kmerSize assembly in the set. - */ - public boolean hasMultipleKmerSizes() { - return kmerSizes.size() > 1; - } - - /** - * Returns the minimum kmerSize available. - * - * @throws IllegalStateException if no assembly-result was added to the set, thus there is no kmerSize. - * - * @return greater than 0. - */ - public int getMinimumKmerSize() { - if (kmerSizes.size() == 0) - throw new IllegalStateException("there is yet no kmerSize in this assembly result set"); - return kmerSizes.min(); - } - - /** - * Returns a read-threading graph in the assembly set that has a particular kmerSize. - * - * @param kmerSize the requested kmerSize. - * - * @return {@code null} if there is no read-threading-graph amongst assembly results with that kmerSize. - */ - public ReadThreadingGraph getUniqueReadThreadingGraph(final int kmerSize) { - final AssemblyResult assemblyResult = assemblyResultByKmerSize.get(kmerSize); - if (assemblyResult == null) return null; - return assemblyResult.getThreadingGraph(); - } - - /** - * Checks whether this assembly result set was trimmed. - * - * @return {@code true} iff this assembly result set was trimmed. 
- */ - public boolean wasTrimmed() { - return wasTrimmed; - } - - /** - * Marks the assembly as not having variation even if it has more than one haplotype. - */ - public void resetVariationPresent() { - variationPresent = false; - } - - /** - * Dumps debugging information into a logger. - * - * @param logger where to dump the information. - * - * @throws NullPointerException if {@code logger} is {@code null}. - */ - public void debugDump(final Logger logger) { - final StringWriter sw = new StringWriter(); - final PrintWriter pw = new PrintWriter(sw); - debugDump(pw); - final String str = sw.toString(); - final String[] lines = str.split("\n"); - for (final String line : lines) { - if (line.isEmpty()) { - continue; - } - logger.debug(line); - } - } - - /** - * Given whether a new haplotype that has been already added to {@link #haplotypes} collection is the - * reference haplotype and updates {@link #refHaplotype} accordingly. - * - *

- * This method assumes that the colling code has verified that the haplotype was not already in {@link #haplotypes} - * I.e. that it is really a new one. Otherwise it will result in an exception if it happen to be a reference - * haplotype and this has already be set. This is the case even if the new haplotypes and the current reference - * are equal. - *

- * - * @param newHaplotype the new haplotype. - * @throws NullPointerException if {@code newHaplotype} is {@code null}. - * @throws IllegalStateException if there is already a reference haplotype. - */ - private void updateReferenceHaplotype(final Haplotype newHaplotype) { - if (!newHaplotype.isReference()) return; - if (refHaplotype == null) - refHaplotype = newHaplotype; - else // assumes that we have checked wether the haplotype is already in the collection and so is no need to check equality. - throw new IllegalStateException("the assembly-result-set already have a reference haplotype that is different"); - } - -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java deleted file mode 100644 index 697d162fd..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ /dev/null @@ -1,521 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.collections.DefaultHashMap; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.haplotype.EventMap; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.haplotype.MergeVariantsAcrossHaplotypes; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.*; - -import java.util.*; - -public class GenotypingEngine { - private final static Logger logger = Logger.getLogger(GenotypingEngine.class); - - private final boolean DEBUG; - private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; - private final static List noCall = new ArrayList<>(); // used to noCall all genotypes until the exact model is applied - private final VariantAnnotatorEngine annotationEngine; - private final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger; - - public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine, - final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, - final 
MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger) { - this.DEBUG = DEBUG; - this.annotationEngine = annotationEngine; - this.USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; - noCall.add(Allele.NO_CALL); - this.crossHaplotypeEventMerger = crossHaplotypeEventMerger; - } - - /** - * Carries the result of a call to #assignGenotypeLikelihoods - */ - public static class CalledHaplotypes { - private final List calls; - private final Set calledHaplotypes; - - protected CalledHaplotypes(final List calls, final Set calledHaplotypes) { - if ( calls == null ) throw new IllegalArgumentException("calls cannot be null"); - if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); - if ( Utils.xor(calls.isEmpty(), calledHaplotypes.isEmpty()) ) - throw new IllegalArgumentException("Calls and calledHaplotypes should both be empty or both not but got calls=" + calls + " calledHaplotypes=" + calledHaplotypes); - this.calls = calls; - this.calledHaplotypes = calledHaplotypes; - } - - /** - * Get the list of calls made at this location - * @return a non-null (but potentially empty) list of calls - */ - public List getCalls() { - return calls; - } - - /** - * Get the set of haplotypes that we actually called (i.e., underlying one of the VCs in getCalls(). 
- * @return a non-null set of haplotypes - */ - public Set getCalledHaplotypes() { - return calledHaplotypes; - } - } - - /** - * Main entry point of class - given a particular set of haplotypes, samples and reference context, compute - * genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling - * - * The list of samples we're working with is obtained from the haplotypeReadMap - * - * @param UG_engine UG Engine with basic input parameters - * @param haplotypes Haplotypes to assign likelihoods to - * @param haplotypeReadMap Map from reads->(haplotypes,likelihoods) - * @param perSampleFilteredReadList - * @param ref Reference bytes at active region - * @param refLoc Corresponding active region genome location - * @param activeRegionWindow Active window - * @param genomeLocParser GenomeLocParser - * @param activeAllelesToGenotype Alleles to genotype - * @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes - */ - @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - @Ensures("result != null") - // TODO - can this be refactored? this is hard to follow! 
- public CalledHaplotypes assignGenotypeLikelihoods( final UnifiedGenotyperEngine UG_engine, - final List haplotypes, - final Map haplotypeReadMap, - final Map> perSampleFilteredReadList, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser, - final RefMetaDataTracker tracker, - final List activeAllelesToGenotype ) { - // sanity check input arguments - if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine); - if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); - if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap); - if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); - if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); - if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); - if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); - if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser); - - // update the haplotypes so we're ready to call, getting the ordered list of positions on the reference - // that carry events among the haplotypes - final TreeSet startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype); - - // Walk along each position in the key set and create each event to be outputted - final Set calledHaplotypes = new 
HashSet<>(); - final List returnCalls = new ArrayList<>(); - final Map emptyDownSamplingMap = new DefaultHashMap<>(0.0); - - for( final int loc : startPosKeySet ) { - if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region - final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); - - if( eventsAtThisLoc.isEmpty() ) { continue; } - - // Create the event mapping object which maps the original haplotype events to the events present at just this locus - final Map> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes); - - // Sanity check the priority list for mistakes - final List priorityList = makePriorityList(eventsAtThisLoc); - - // Merge the event to find a common reference representation - final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false, false); - if( mergedVC == null ) { continue; } - - if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) { - // this is possible in GGA mode when the same event is represented in multiple input records - throw new UserException("The same event (although possibly represented differently) is present in multiple input records at location " + loc + " and this is not something we can handle at this time. 
You will need to remove one of the records in order to proceed with your input file(s)."); - } - final Map mergeMap = new LinkedHashMap<>(); - mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele - for(int iii = 0; iii < mergedVC.getAlternateAlleles().size(); iii++) { - mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function - } - - final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); - - if( DEBUG ) { - logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); - } - - final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().getSampleContamination() ); - - final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC ); - final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL); - if( call != null ) { - final Map alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap : - convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, emptyDownSamplingMap ) ); - final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); - - VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(tracker, stratifiedReadMap, call); - - if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! 
- annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); - } - - // maintain the set of all called haplotypes - for ( final Allele calledAllele : call.getAlleles() ) - calledHaplotypes.addAll(alleleMapper.get(calledAllele)); - - returnCalls.add( annotatedCall ); - } - } - } - return new CalledHaplotypes(returnCalls, calledHaplotypes); - } - - /** - * Go through the haplotypes we assembled, and decompose them into their constituent variant contexts - * - * @param haplotypes the list of haplotypes we're working with - * @param haplotypeReadMap map from samples -> the per read allele likelihoods - * @param ref the reference bases (over the same interval as the haplotypes) - * @param refLoc the span of the reference bases - * @param activeAllelesToGenotype alleles we want to ensure are scheduled for genotyping (GGA mode) - * @return - */ - private TreeSet decomposeHaplotypesIntoVariantContexts(final List haplotypes, - final Map haplotypeReadMap, - final byte[] ref, - final GenomeLoc refLoc, - final List activeAllelesToGenotype) { - final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); - - // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file - final TreeSet startPosKeySet = EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, DEBUG); - - if ( in_GGA_mode ) startPosKeySet.clear(); - - //cleanUpSymbolicUnassembledEvents( haplotypes ); // We don't make symbolic alleles so this isn't needed currently - if ( !in_GGA_mode ) { - // run the event merger if we're not in GGA mode - final boolean mergedAnything = crossHaplotypeEventMerger.merge(haplotypes, haplotypeReadMap, startPosKeySet, ref, refLoc); - if ( mergedAnything ) - cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events - } - - if ( in_GGA_mode ) { - for( final VariantContext compVC : activeAllelesToGenotype ) { - startPosKeySet.add( compVC.getStart() ); - 
} - } - - return startPosKeySet; - } - - /** - * Get the priority list (just the list of sources for these variant context) used to merge overlapping events into common reference view - * @param vcs a list of variant contexts - * @return the list of the sources of vcs in the same order - */ - private List makePriorityList(final List vcs) { - final List priorityList = new LinkedList<>(); - for ( final VariantContext vc : vcs ) priorityList.add(vc.getSource()); - return priorityList; - } - - private List getVCsAtThisLocation(final List haplotypes, - final int loc, - final List activeAllelesToGenotype) { - // the overlapping events to merge into a common reference view - final List eventsAtThisLoc = new ArrayList<>(); - - if( activeAllelesToGenotype.isEmpty() ) { - for( final Haplotype h : haplotypes ) { - final EventMap eventMap = h.getEventMap(); - final VariantContext vc = eventMap.get(loc); - if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { - eventsAtThisLoc.add(vc); - } - } - } else { // we are in GGA mode! 
- int compCount = 0; - for( final VariantContext compVC : activeAllelesToGenotype ) { - if( compVC.getStart() == loc ) { - int alleleCount = 0; - for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - List alleleSet = new ArrayList<>(2); - alleleSet.add(compVC.getReference()); - alleleSet.add(compAltAllele); - final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount; - // check if this event is already in the list of events due to a repeat in the input alleles track - final VariantContext candidateEventToAdd = new VariantContextBuilder(compVC).alleles(alleleSet).source(vcSourceName).make(); - boolean alreadyExists = false; - for( final VariantContext eventToTest : eventsAtThisLoc ) { - if( eventToTest.hasSameAllelesAs(candidateEventToAdd) ) { - alreadyExists = true; - } - } - if( !alreadyExists ) { - eventsAtThisLoc.add(candidateEventToAdd); - } - alleleCount++; - } - } - compCount++; - } - } - - return eventsAtThisLoc; - } - - /** - * For a particular event described in inputVC, form PL vector for each sample by looking into allele read map and filling likelihood matrix for each allele - * @param alleleReadMap Allele map describing mapping from reads to alleles and corresponding likelihoods - * @param mergedVC Input VC with event to genotype - * @return GenotypesContext object wrapping genotype objects with PLs - */ - @Requires({"alleleReadMap!= null", "mergedVC != null"}) - @Ensures("result != null") - private GenotypesContext calculateGLsForThisEvent( final Map alleleReadMap, final VariantContext mergedVC ) { - final GenotypesContext genotypes = GenotypesContext.create(alleleReadMap.size()); - // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - for( final String sample : alleleReadMap.keySet() ) { - final int numHaplotypes = mergedVC.getAlleles().size(); - final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 
2]; - final double[][] haplotypeLikelihoodMatrix = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles(), true); - int glIndex = 0; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC - } - } - genotypes.add(new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make()); - } - return genotypes; - } - - private static Map filterToOnlyOverlappingReads( final GenomeLocParser parser, - final Map perSampleReadMap, - final Map> perSampleFilteredReadList, - final VariantContext call ) { - - final Map returnMap = new LinkedHashMap<>(); - final GenomeLoc callLoc = parser.createGenomeLoc(call); - for( final Map.Entry sample : perSampleReadMap.entrySet() ) { - final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); - - for( final Map.Entry> mapEntry : sample.getValue().getLikelihoodReadMap().entrySet() ) { - // only count the read if it overlaps the event, otherwise it is not added to the output read list at all - if( callLoc.overlapsP(parser.createGenomeLoc(mapEntry.getKey())) ) { // BUGBUG: This uses alignment start and stop, NOT soft start and soft end... 
- for( final Map.Entry alleleDoubleEntry : mapEntry.getValue().entrySet() ) { - likelihoodMap.add(mapEntry.getKey(), alleleDoubleEntry.getKey(), alleleDoubleEntry.getValue()); - } - } - } - - // add all filtered reads to the NO_CALL list because they weren't given any likelihoods - for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { - // only count the read if it overlaps the event, otherwise it is not added to the output read list at all - if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { - for( final Allele allele : call.getAlleles() ) { - likelihoodMap.add(read, allele, 0.0); - } - } - } - - returnMap.put(sample.getKey(), likelihoodMap); - } - return returnMap; - } - - /** - * Removes symbolic events from list of haplotypes - * @param haplotypes Input/output list of haplotypes, before/after removal - */ - // TODO - split into input haplotypes and output haplotypes as not to share I/O arguments - @Requires("haplotypes != null") - protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) { - final List haplotypesToRemove = new ArrayList<>(); - for( final Haplotype h : haplotypes ) { - for( final VariantContext vc : h.getEventMap().getVariantContexts() ) { - if( vc.isSymbolic() ) { - for( final Haplotype h2 : haplotypes ) { - for( final VariantContext vc2 : h2.getEventMap().getVariantContexts() ) { - if( vc.getStart() == vc2.getStart() && (vc2.isIndel() || vc2.isMNP()) ) { // unfortunately symbolic alleles can't currently be combined with non-point events - haplotypesToRemove.add(h); - break; - } - } - } - } - } - } - haplotypes.removeAll(haplotypesToRemove); - } - - // BUGBUG: ugh, too complicated - protected Map convertHaplotypeReadMapToAlleleReadMap( final Map haplotypeReadMap, - final Map> alleleMapper, - final Map perSampleDownsamplingFraction ) { - - final Map alleleReadMap = new LinkedHashMap<>(); - for( final Map.Entry haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample - 
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); - for( final Map.Entry> alleleMapperEntry : alleleMapper.entrySet() ) { // for each output allele - final List mappedHaplotypes = alleleMapperEntry.getValue(); - for( final Map.Entry> readEntry : haplotypeReadMapEntry.getValue().getLikelihoodReadMap().entrySet() ) { // for each read - double maxLikelihood = Double.NEGATIVE_INFINITY; - for( final Map.Entry alleleDoubleEntry : readEntry.getValue().entrySet() ) { // for each input allele - if( mappedHaplotypes.contains( new Haplotype(alleleDoubleEntry.getKey())) ) { // exact match of haplotype base string - maxLikelihood = Math.max( maxLikelihood, alleleDoubleEntry.getValue() ); - } - } - perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood); - } - } - perReadAlleleLikelihoodMap.performPerAlleleDownsampling(perSampleDownsamplingFraction.get(haplotypeReadMapEntry.getKey())); // perform contamination downsampling - alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap); - } - - return alleleReadMap; - } - - protected static Map> createAlleleMapper( final Map mergeMap, final Map> eventMap ) { - final Map> alleleMapper = new LinkedHashMap<>(); - for( final Map.Entry entry : mergeMap.entrySet() ) { - alleleMapper.put(entry.getValue(), eventMap.get(new Event(entry.getKey()))); - } - return alleleMapper; - } - - @Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"}) - @Ensures({"result.size() == eventsAtThisLoc.size() + 1"}) - protected static Map> createEventMapper( final int loc, final List eventsAtThisLoc, final List haplotypes ) { - - final Map> eventMapper = new LinkedHashMap<>(eventsAtThisLoc.size()+1); - final Event refEvent = new Event(null); - eventMapper.put(refEvent, new ArrayList()); - for( final VariantContext vc : eventsAtThisLoc ) { - eventMapper.put(new Event(vc), new ArrayList()); - } - - for( final Haplotype h : haplotypes ) { - if( 
h.getEventMap().get(loc) == null ) { - eventMapper.get(refEvent).add(h); - } else { - for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { - if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { - eventMapper.get(new Event(vcAtThisLoc)).add(h); - break; - } - } - } - } - - return eventMapper; - } - - @Ensures({"result.size() == haplotypeAllelesForSample.size()"}) - protected static List findEventAllelesInSample( final List eventAlleles, final List haplotypeAlleles, final List haplotypeAllelesForSample, final List> alleleMapper, final List haplotypes ) { - if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; } - final List eventAllelesForSample = new ArrayList<>(); - for( final Allele a : haplotypeAllelesForSample ) { - final Haplotype haplotype = haplotypes.get(haplotypeAlleles.indexOf(a)); - for( int iii = 0; iii < alleleMapper.size(); iii++ ) { - final List mappedHaplotypes = alleleMapper.get(iii); - if( mappedHaplotypes.contains(haplotype) ) { - eventAllelesForSample.add(eventAlleles.get(iii)); - break; - } - } - } - return eventAllelesForSample; - } - - @Deprecated - protected static Map generateVCsFromAlignment( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) { - return new EventMap(haplotype, ref, refLoc, sourceNameToAdd); - } - - protected static boolean containsVCWithMatchingAlleles( final List list, final VariantContext vcToTest ) { - for( final VariantContext vc : list ) { - if( vc.hasSameAllelesAs(vcToTest) ) { - return true; - } - } - return false; - } - - protected static class Event { - public VariantContext vc; - - public Event( final VariantContext vc ) { - this.vc = vc; - } - - @Override - public boolean equals( final Object obj ) { - return obj instanceof Event && ((((Event) obj).vc == null && vc == null) || (((Event) obj).vc != null && vc != null && ((Event) obj).vc.hasSameAllelesAs(vc))) ; - } - - @Override - public int hashCode() { - return (vc == null ? 
-1 : vc.getAlleles().hashCode()); - } - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java deleted file mode 100644 index 9d861a445..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java +++ /dev/null @@ -1,911 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.collections.CountSet; -import org.broadinstitute.sting.utils.collections.CountSet; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.pairhmm.FlexibleHMM; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.variantcontext.Allele; - -import java.util.*; - -/** - * Fast pseudo-likelihood calculation engine based on the assembly haplotype graph. - * - *

- * An instance is good for active region. {@link GraphBasedLikelihoodCalculationEngine} instance them on demand - * as requested by the {@code HaplotypeCaller} code. - *

- */ -public class GraphBasedLikelihoodCalculationEngineInstance { - - private final static Logger logger = Logger.getLogger(GraphBasedLikelihoodCalculationEngineInstance.class); - - - /** - * Unified kmer size used for the Haplotype graph. - */ - protected final int kmerSize; - - /** - * Reference to the haplotype graph. - */ - protected final HaplotypeGraph haplotypeGraph; - - /** - * Haplotypes included in the haplotype graph. - */ - private final List haplotypes; - - /** - * Whether there is some variation present in the haplotype assembly. - */ - private final boolean hasVariation; - - - /** - * Counts of reads that anchoread somewhere. - * - *

Used for debugging purposes

- */ - private int anchoredReads = 0; - - /** - * Count of reads that didn't anchor anywere. - * - *

Used for debugging purposes

- */ - private int nonAnchoredReads = 0; - - /** - * Pair-hmm implementation to use to calculate read likelihoods. - */ - private final FlexibleHMM hmm; - - /** - * Maximum likelihood difference between the reference haplotype and the best alternative haplotype. - * - *

If the difference is greater for a read, the reference haplotype likelihood is increase in order to not go - * beyond this limit

- */ - protected final double log10globalReadMismappingRate; - - protected final EventBlockFinder eventBlockSearchEngine; - - - /** - * Constructs a new engine based on the results of the assembly. - * - * @param assemblyResultSet assembly-result set - * @param hmm fast-hmm implementation to use. - * @param log10globalReadMismappingRate maximum cost for the reference haplotype vs the best alternative available. - * @param heterogeneousKmerSizeResolution multi-kmersize dataset resolution. - * @throws NullPointerException if any argument is null. - * @throws IllegalArgumentException if log10globalReadMismappingRate >= 0. - */ - public GraphBasedLikelihoodCalculationEngineInstance(final AssemblyResultSet assemblyResultSet, final FlexibleHMM hmm, final double log10globalReadMismappingRate, final HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution) { - if (heterogeneousKmerSizeResolution == null) throw new NullPointerException("the kmerSize resolution cannot be null"); - if (assemblyResultSet == null) throw new NullPointerException("the assembly result set cannot be null"); - if (hmm == null) throw new NullPointerException("the fast-hmm component cannot be null"); - if (log10globalReadMismappingRate >= 0) - throw new IllegalArgumentException("the global reading mismapping rate cannot be positive or zero"); - - this.hmm = hmm; - this.log10globalReadMismappingRate = log10globalReadMismappingRate; - - haplotypes = new ArrayList<>(assemblyResultSet.getHaplotypeList()); - Collections.sort(haplotypes, Haplotype.ALPHANUMERICAL_COMPARATOR); - - // make sure that kmerSize is not bigger than the smallest haplotype. It can well happen when there are cycles and kmerSize inflates. 
- final Haplotype referenceHaplotype = assemblyResultSet.getReferenceHaplotype(); - int minHaplotypeLength = referenceHaplotype.length(); - for (final Haplotype h : haplotypes) - if (minHaplotypeLength > h.length()) - minHaplotypeLength = h.length(); - - // Determine the kmerSize to use for the unified haplotype assembly graph - - kmerSize = Math.min(minHaplotypeLength, - heterogeneousKmerSizeResolution.useMaximum() ? assemblyResultSet.getMaximumKmerSize() : assemblyResultSet.getMinimumKmerSize()); - - haplotypeGraph = new HaplotypeGraph(kmerSize,haplotypes); - - - if (haplotypeGraph.hasCycles()) - Utils.warnUser(logger, "cycle caused at merging haplotypes with different kmerSizes: active region " + assemblyResultSet.getRegionForGenotyping() + " will be skipped"); - - //TODO haplpotypeGraph.getReferenceSourceVertex() == null - //TODO Is a quick patch to ignore cases where the trimming has rendered kmerSize so big that is bigger than the haplotype - //TODO and reduction to the minimum haplotype size result in no unique kmers. - //TODO the actual solution: we need to impose a maximum trimming at least for Graph-based HC runs as we are loosing - //TODO a bit of sensitivity as trimming results in lack of unique kmers. - if (haplotypeGraph.hasCycles() || haplotypeGraph.getReferenceHaplotype() == null) { - hasVariation = false; - eventBlockSearchEngine = null; - return; - } - - haplotypeGraph.mergeCommonChains(); - //TODO recover dangling ends. Did not work the last time I tried but may be worth to retry. - //haplotypeGraph.recoverDanglingTails(-1); - logger.debug("using haplotype graph with kmerSize " + haplotypeGraph.getKmerSize()); - - hasVariation = !haplotypeGraph.hasCycles() && haplotypeGraph.getHaplotypes().size() > 1; - - eventBlockSearchEngine = new EventBlockFinder(haplotypeGraph); - } - - /** - * Determines whether based on result from assembly and the relevant user options we can reuse th existing - * - * @param assemblyResultSet assembly result set. 
- * @param kmerSize intended kmerSize for the haplotype graph. - * @param heterogeneousKmerSizeResolution user instruction as to how to resolve situation where we have haplotypes comming from different kmer sizes. - * @return {@code true} iff we can reuse an existing read-threading graph with that kmerSize in the assembly result set. - */ - @SuppressWarnings("unused") - private static boolean canReuseReadThreadingGraphAsHaplotypeGraph(final AssemblyResultSet assemblyResultSet, final int kmerSize, final HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution) { - return !assemblyResultSet.wasTrimmed() && (!assemblyResultSet.hasMultipleKmerSizes() || heterogeneousKmerSizeResolution.combinesKmerSizes()) && - assemblyResultSet.getUniqueReadThreadingGraph(kmerSize) != null; - } - - /** - * Checks whether the underlying haplotype graph assembly contains any variation worth analyzing. - * - * @return {@code true} iff so. - */ - public boolean hasVariation() { - return hasVariation; - } - - /** - * Calculates the likelihood of reads across many samples evaluated against haplotypes resulting from the - * active region assembly process. - * - * @param haplotypes to evaluate. - * @param perSampleReadList the input read sets stratified per sample. - * - * @throws NullPointerException if either parameter is {@code null}. - * - * @return never {@code null}, and with at least one entry for input sample (keys in {@code perSampleReadList}. - * The value maps can be potentially empty though. 
- */ - public Map computeReadLikelihoods( - final List haplotypes, - final Map> perSampleReadList) { - // General preparation on the input haplotypes: - Collections.sort(haplotypes, Haplotype.ALPHANUMERICAL_COMPARATOR); - final Map alleleVersions = new LinkedHashMap<>(haplotypes.size()); - for (final Haplotype haplotype : haplotypes) - alleleVersions.put(haplotype, Allele.create(haplotype,haplotype.isReference())); - - // The actual work: - final HashMap result = new HashMap<>(perSampleReadList.size()); - for (final Map.Entry> e : perSampleReadList.entrySet()) { - final String sample = e.getKey(); - final List reads = e.getValue(); - final Set mayNeedAdjustment = new HashSet<>(reads.size()); - // Get the cost/likelihood of each read at relevant subpaths on the tree: - final Map> costsByEndingVertex = calculatePathCostsByRead(reads, mayNeedAdjustment); - // Create the resulting per-read maps: - final PerReadAlleleLikelihoodMap prallm = calculatePerReadAlleleLikelihoodMap(haplotypes, costsByEndingVertex, alleleVersions); - result.put(sample, prallm); - } - logger.debug("Likelihood analysis summary: reads anchored " + anchoredReads + "/" + (anchoredReads + nonAnchoredReads) + ""); - return result; - } - - - /** - * Prints a graph into a dot file. - * - * @param fileName name of the output file. - */ - public void printGraph(final String fileName) { - if (haplotypeGraph != null) - haplotypeGraph.printGraph(fileName); - } - - /** - * Returns the kmerSize the engine is using to match read vs graph kmers thus reducing computation. - * - * @return greater than 0. - */ - public int getKmerSize() { - return kmerSize; - } - - /** - * Tells whether the underlying haplotype graph contained cycles. - * - * @return {@code true} iff so. - */ - public boolean hasCycles() { - // It is set to null if it contained cycles. - return haplotypeGraph == null; - } - - - /** - * Builds the result per-read allele likelihood map. - * - * @param haplotypes haplotypes to process. 
- * @param costsEndingByVertex Read vs haplotype graph subpaths cost indexed by ending vertex. - * @param alleleVersions map between haplotypes and the corresponding allele. - * @return never {@code null} although perhaps empty. - */ - protected PerReadAlleleLikelihoodMap calculatePerReadAlleleLikelihoodMap( - final Collection haplotypes, - final Map> costsEndingByVertex, final Map alleleVersions) { - - final PerReadAlleleLikelihoodMap result = new PerReadAlleleLikelihoodMap(); - if (haplotypeGraph == null) - return result; - final Map maxAlleleLogLk = new HashMap<>(anchoredReads + nonAnchoredReads + 10); - final Set supportedHaplotypes = new LinkedHashSet<>(haplotypeGraph.getHaplotypes()); - supportedHaplotypes.retainAll(haplotypes); - for (final Haplotype haplotype : supportedHaplotypes) - calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(haplotype, alleleVersions, result, maxAlleleLogLk, costsEndingByVertex); - - //TODO Does not seem to be needed in practice: - //TODO furhter testing/evaluation required before removing it completely. - //makeLikelihoodAdjustment(alleleVersions, result, maxAlternativeAlleleLogLk.keySet(), maxAlternativeAlleleLogLk); - applyGlobalReadMismappingRate(alleleVersions, result, maxAlleleLogLk); - return result; - } - - /** - * Work done per haplotype to build the result per-read allele likelihood map. - *

- *

- * Basically for each haplotype we go through its path in the graph collecting all the read cost that we find - * on the way. For each read present we add up all its cost resulting in a single value per read, i.e. its - * "likelihood". - *

- * - * @param haplotype the target haplotype - * @param alleleVersions allele version of the haplotypes. These are the ones to be used in the final output. - * @param result target where to add the read-vs-haplotype likelihoods. - * @param maxAlleleLogLk where to place the maximum likelihood achieve on any haplotype for each read. - * @param costsEndingByVertex read costs assorted by their end vertex. - */ - private void calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(final Haplotype haplotype, - final Map alleleVersions, - final PerReadAlleleLikelihoodMap result, - final Map maxAlleleLogLk, - final Map> costsEndingByVertex) { - final HaplotypeRoute haplotypeRoute = haplotypeGraph.getHaplotypeRoute(haplotype); - final Set haplotypeVertices = haplotypeRoute.vertexSet(); - final Map readCostByRead = new HashMap<>(); - final Set visitedVertices = new HashSet<>(haplotypeVertices.size()); - final List edgeList = haplotypeRoute.getEdges(); - MultiDeBruijnVertex currentVertex = haplotypeRoute.getFirstVertex(); - Route pathSoFar = new Route<>(currentVertex, haplotypeGraph); - final Iterator edgeIterator = edgeList.iterator(); - while (true) { - visitedVertices.add(currentVertex); - final Set finishingAtElementCostSet = costsEndingByVertex.get(currentVertex); - updateReadCosts(readCostByRead, visitedVertices, pathSoFar, finishingAtElementCostSet); - if (!edgeIterator.hasNext()) break; - final MultiSampleEdge nextEdge = edgeIterator.next(); - pathSoFar = new Route<>(pathSoFar, nextEdge); - currentVertex = pathSoFar.getLastVertex(); - } - - final List readCosts = new ArrayList<>(readCostByRead.values()); - Collections.sort(readCosts, ReadCost.COMPARATOR); - for (final ReadCost rc : readCosts) - result.add(rc.read, alleleVersions.get(haplotype), rc.cost); - - for (final ReadCost rc : readCosts) { - final Double currentMax = maxAlleleLogLk.get(rc.read); - if (currentMax == null || currentMax < rc.cost) - maxAlleleLogLk.put(rc.read, rc.cost); - } - } - - /** - * Update 
the read cost based on the path cost found at a vertex. - * - * @param readCosts collection of read costs so far - * @param visitedVertices visited vertices collection. - * @param pathSoFar the haplotype path visited so far. - * @param finishingAtElementCostSet collection of path cost to process - */ - private void updateReadCosts(final Map readCosts, - final Set visitedVertices, - final Route pathSoFar, - final Set finishingAtElementCostSet) { - if (finishingAtElementCostSet != null) { - for (final ReadSegmentCost pc : finishingAtElementCostSet) { - if (!visitedVertices.contains(pc.path.getFirstVertex())) - continue; - if (!pathSoFar.isSuffix(pc.path)) - continue; - ReadCost rc = readCosts.get(pc.read); - if (rc == null) - readCosts.put(pc.read, rc = new ReadCost(pc.read)); - rc.cost += pc.cost; - } - } - } - - /** - * Likelihood penalty for unreported haplotype vs read likelihood with respect to the worst reported one. - */ - private static final int UNREPORTED_HAPLOTYPE_LIKELIHOOD_PENALTY = -3; - - /** - * Re-scales all haplotype vs read likelihoods so that for read, the best haplotype, hash likelihood 0. - * - * @param alleleVersions map between input haplotypes and output alleles. - * @param result where to change the likelihoods. - * @param mayNeedAdjustment set of read that might need adjustment. Others might be ignored. - * @param maxAlternative map from each read and the maximum alternative haplotype likelihood. 
- */ - @SuppressWarnings("unused") - private void makeLikelihoodAdjustment(final Map alleleVersions, - final PerReadAlleleLikelihoodMap result, - final Set mayNeedAdjustment, - final Map maxAlternative) { - final Map> map = result.getLikelihoodReadMap(); - - for (final GATKSAMRecord read : mayNeedAdjustment) { - final Map existingLikelihoods = map.get(read); - if (existingLikelihoods != null) { - Allele bestAllele = null; - double worstRelativeLikelihood = 0; - double bestRelativeLikelihood = Double.NEGATIVE_INFINITY; - for (final Map.Entry entry : map.get(read).entrySet()) { - final double candidateRelativeLikelihood = entry.getValue(); - if (candidateRelativeLikelihood > bestRelativeLikelihood) { - bestAllele = entry.getKey(); - bestRelativeLikelihood = candidateRelativeLikelihood; - } - if (!Double.isInfinite(candidateRelativeLikelihood) && worstRelativeLikelihood > candidateRelativeLikelihood) - worstRelativeLikelihood = candidateRelativeLikelihood; - } - - worstRelativeLikelihood += UNREPORTED_HAPLOTYPE_LIKELIHOOD_PENALTY; - if (bestAllele == null) - throw new IllegalStateException("No best allele for read " + read.getReadName()); - final double bestLikelihood = 0.0; // the best becomes zero. - maxAlternative.put(read, bestLikelihood); - for (final Map.Entry entry : alleleVersions.entrySet()) { - final Allele a = entry.getValue(); - final Double relativeLikelihoodO = existingLikelihoods.get(a); - final double relativeLikelihood = relativeLikelihoodO == null ? worstRelativeLikelihood : relativeLikelihoodO; - final double likelihood = relativeLikelihood - bestRelativeLikelihood + bestLikelihood; - if (likelihood > 0) - throw new IllegalStateException("Likelihood larger than 1 with read " + read.getReadName()); - existingLikelihoods.put(a, likelihood); - } - } - } - } - - /** - * Makes sure that the reference allele likelihood is not too much smaller that the best alternative allele. 
- * The justification of this constraint is explained in - * {@link PairHMMLikelihoodCalculationEngine#computeDiploidHaplotypeLikelihoods}. - * - * @param alleleVersions correspondence between input haplotypes and output alleles. - * @param result the target result map. - * @param maxAlleleLogLk for each read indicates the likelihood of the best alternative allele. - */ - private void applyGlobalReadMismappingRate(final Map alleleVersions, - final PerReadAlleleLikelihoodMap result, - final Map maxAlleleLogLk) { - if (!Double.isNaN(log10globalReadMismappingRate) && !Double.isInfinite(log10globalReadMismappingRate)) { - final Allele referenceAllele = alleleVersions.get(haplotypeGraph.getReferenceHaplotype()); - for (final Map.Entry> entry : result.getLikelihoodReadMap().entrySet()) { - final GATKSAMRecord read = entry.getKey(); - final Map likelihoods = entry.getValue(); - final Double maxLogLk = maxAlleleLogLk.get(read); - if (maxAlleleLogLk == null) continue; - final Double referenceLogLk = likelihoods.get(referenceAllele); - final Double minReferenceLogLk = maxLogLk + log10globalReadMismappingRate; - if (referenceLogLk == null || referenceLogLk < minReferenceLogLk) - likelihoods.put(referenceAllele, minReferenceLogLk); - } - } - } - - /** - * Calculates path costs for a set of reads. - *

- *

- * The resulting map has one entry per read, where the read is the key and the value list of path-cost sets. - * Each element in that list corresponds to an event block. Each path cost in one of those sets indicate the - * likelihood (cost) of traversing a possible path across the event block using that read. - *

- * - * @param reads reads to analyze. - * @param mayNeedAdjustment set where to add reads whose likelihood might need adjustment. - * @return never {@code null}. - */ - protected Map> calculatePathCostsByRead( - final List reads, final Set mayNeedAdjustment) { - final Map> result = new HashMap<>(reads.size()); - if (!hasVariation) - return Collections.emptyMap(); - for (final GATKSAMRecord r : reads) { - calculatePathCostsByRead(r, mayNeedAdjustment, result); - } - return result; - } - - /** - * Calculates path cost for a single read. - * - * @param read target read. - * @param mayNeedAdjustment set where to add read whose likelihood might need adjustment. - * @param result map where to add the result. - */ - private void calculatePathCostsByRead(final GATKSAMRecord read, final Set mayNeedAdjustment, - final Map> result) { - - final ReadAnchoring anchoring = new ReadAnchoring(read,haplotypeGraph); - // cannot anchor so go the tradition pair-hmm way. - hmm.loadRead(read); - if (!anchoring.isAnchoredSomewhere()) { - defaultToRegularPairHMM(anchoring, result); - nonAnchoredReads++; - return; - } - - calculateReadSegmentCosts(anchoring, hmm, result); - - if (!anchoring.isPerfectAnchoring()) danglingEndPathCosts(anchoring, hmm, result); - mayNeedAdjustment.add(read); - anchoredReads++; - } - - /** - * Calculates read vs haplotype likelihoods using the classic PairHMM approach. - *

- *

- * It basically compares the read with each haplotype full path without short cuts. - *

- * - * @param anchoring anchoring information of the read. - * @param destination where to leave the results indexed by ending veretex. - */ - private void defaultToRegularPairHMM(final ReadAnchoring anchoring, final Map> destination) { - - for (final Map.Entry entry : haplotypeGraph.getHaplotypeRouteMap().entrySet()) { - if (entry.getValue() == null) continue; - final byte[] haplotypeBases = entry.getKey().getBases(); - hmm.loadHaplotypeBases(haplotypeBases); - final double cost = hmm.calculateLocalLikelihood(0, anchoring.read.getReadLength(), 0, haplotypeBases.length, false); - final ReadSegmentCost readSegmentCost = new ReadSegmentCost(anchoring.read, entry.getValue(), cost); - addReadSegmentCost(destination, readSegmentCost); - } - } - - /** - * Add a new read-segment-cost to an ending vertex indexed map. - * @param destination where to add the read-segment-cost. - * @param cost the read-segment-cost to add. - */ - private void addReadSegmentCost(final Map> destination, final ReadSegmentCost cost) { - final MultiDeBruijnVertex endVertex = cost.path.getLastVertex(); - Set vpcSet = destination.get(endVertex); - if (vpcSet == null) - destination.put(endVertex, vpcSet = new HashSet<>(10)); - vpcSet.add(cost); - } - - /** - * Calculate the likelihood cost of path section of a read across the graph. - *

- *

- * Given a read, its anchors and other unique kmer mapable to the reference path we can divide the graph - * into event blocks: a set of one or more variations and the possible path across that block. - *

- *

- *

- * The result value will have one element fo reach block. Each element is the set of all path costs (likelihoods) - * to traverse the block using all possible paths (different haplotypes). - *

- *

- *

- * The current implementation has some added complexity in order to avoid a situation in where the last part - * of the anchored section of the read is thrown out. We first determine the last event block boundaries and we - * make sure that we won't run over its left limit when covering for earlier event blocks. - *

- * - * @param anchoring target read graph anchoring information. - * @param hmm the pair-hmm calculation engine. It must have been loaded with the same {@code read} already. - * @param destination where to add the costs. - */ - private void calculateReadSegmentCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm, final Map> destination) { - - final EventBlockFinder.Traversal traversal = eventBlockSearchEngine.traversal(anchoring); - - for (final EventBlock eventBlock : traversal) { - - // final Set> acrossBlockPaths = - // calculateAllPathsBetweenVertices(anchoring, - // eventBlock.getSource(), eventBlock.getSink());//eventBlock.getRoutesAcross(); - - final Set> acrossBlockPaths = eventBlock.getRoutesAcross(); - - int leftBlockBoundaryIndex = anchoring.uniqueKmerOffsets.get(eventBlock.getSource()); - int rightBlockBoundaryIndex = anchoring.uniqueKmerOffsets.get(eventBlock.getSink()); - calculateCostForPathSet(anchoring.read, acrossBlockPaths, hmm, leftBlockBoundaryIndex, rightBlockBoundaryIndex, true, false, null, null, destination); - } - } - - /** - * Calculate path cost for a set of paths across a event block. - * - * @param read the target read. - * @param acrossBlockPaths event block paths to evaluate. - * @param hmm pair-hmm engine to use to calculate likelihoods. - * @param beforeBlockReadOffset kmer offset on the read for the vertex kmer before the block. - * @param afterBlockReadOffset kmer offset on the read for the vertex kmer after the block. - * @param doClipping whether to perform any clipping in order to save cpu time. - * @param prependVertex if not null, the end cost path with be prepended with this vertex. - * @param appendVertex if not null, the end cost path will be appended with this vertex. - * @param includePathEnds whether to include or exclude the vertices at the very end or beginning of the paths. 
- */ - private void calculateCostForPathSet( - final GATKSAMRecord read, final Set> acrossBlockPaths, - final FlexibleHMM hmm, final int beforeBlockReadOffset, final int afterBlockReadOffset, - final boolean doClipping, final boolean includePathEnds, - final MultiDeBruijnVertex prependVertex, - final MultiDeBruijnVertex appendVertex, - final Map> destination) { - - - final Set readSegmentCosts = new TreeSet<>(ReadSegmentComparator.INSTANCE); - - final int readStart = beforeBlockReadOffset + kmerSize; - final int readEnd = Math.max(readStart, afterBlockReadOffset + kmerSize - 1); - final byte[][] pathBases = new byte[acrossBlockPaths.size()][]; - final CountSet pathSizes = new CountSet(acrossBlockPaths.size()); - int nextPath = 0; - - // Complete the read segment cost with the corresponding path bases - for (final Route p : acrossBlockPaths) { - final ReadSegmentCost readSegmentCost = new ReadSegmentCost(read, p, Double.NaN); - pathBases[nextPath++] = readSegmentCost.bases = eventBlockPathBases(p, includePathEnds); - pathSizes.add(readSegmentCost.bases.length); - readSegmentCosts.add(readSegmentCost); - } - - // Add the read 'path size'. - pathSizes.add(readEnd - readStart); - - final byte[] readBases = hmm.getReadBases(); - - // Perform right clipping of bases that are common to all paths and read. - int rightClipping = !doClipping ? 0 : calculateRightClipping(readEnd, pathBases, readBases,pathSizes); - - // Calculate the costs. 
- for (final ReadSegmentCost readSegmentCost : readSegmentCosts) { - hmm.loadHaplotypeBases(readSegmentCost.bases); - readSegmentCost.cost = hmm.calculateLocalLikelihood(Math.max(0, readStart), readEnd - rightClipping, 0, readSegmentCost.bases.length - rightClipping, false); - if (prependVertex != null) - readSegmentCost.path = new Route<>(prependVertex,readSegmentCost.path); - if (appendVertex != null) - readSegmentCost.path = new Route<>(readSegmentCost.path,appendVertex); - addReadSegmentCost(destination,readSegmentCost); - } - - - } - - /** - * Determines how much we can clip away from the right side of a set of path without loosing accuracy when comparing - * likelihood vs the read. - * - * @param readEnd exclusive position right after the last one of the region considered. - * @param pathBases bases of possible path in the same event block. - * @param readBases full length read bases. - * @param pathSizes path size set. - * - * @return 0 or greater. - */ - private int calculateRightClipping(final int readEnd, final byte[][] pathBases, - final byte[] readBases, final CountSet pathSizes) { - final int maxClipping = pathSizes.size() > 1 ? 0 : Math.min(pathSizes.min(), kmerSize - 1); - int rightClipping = 0; - while (rightClipping < maxClipping) { - final byte readBase = readBases[readEnd - rightClipping - 1]; - boolean dontGoFurther = false; - for (int i = 0; !dontGoFurther && i < pathBases.length; i++) - if (pathBases[i][pathBases[i].length - rightClipping - 1] != readBase) - dontGoFurther = true; - if (dontGoFurther) - break; - rightClipping++; - } - return rightClipping; - } - - /** - * Calculates a graph path bases. - *

- *

- * When the path starts on a source vertex, all its sequence is considered as part of the path bases. For regular - * vertices start only the suffix (last) base is considered. - *

- * - * @param path the targeted path. - * @param includePathEnds whether the bases included in the first and last vertex of the path should be included or excluded. - * @return never {@code null} but perhaps a zero-length base array if the final requested path length is zero. - */ - //TODO this method could be moved to the Path class, but require consider how to make the API more concise. - private byte[] eventBlockPathBases(final Path path, - final boolean includePathEnds) { - // We first calculate the size of the return. - final List vertices = path.getVertices(); - final boolean pathStartsAtSource = haplotypeGraph.isSource(path.getFirstVertex()); - final int resultLength = includePathEnds - ? vertices.size() + (pathStartsAtSource ? path.getFirstVertex().getSequence().length - 1 : 0) - : vertices.size() - 2; - // Trivial empty return cases: - if (resultLength <= 0) - return new byte[0]; - final byte[] result = new byte[resultLength]; - if (result.length == 0) { - return result; - } - // General return cases: - final ListIterator it = vertices.listIterator(includePathEnds ? 0 : 1); // skip the vertex (exclusive) - for (int i = 0; i < resultLength; i++) { // i < resultLength implicitly skips the last vertex (exclusive). - final MultiDeBruijnVertex vertex = it.next(); - if (i == 0 && includePathEnds && pathStartsAtSource) { - System.arraycopy(vertex.getSequence(), 0, result, 0, kmerSize); - i = kmerSize - 1; - } else - result[i] = vertex.getSuffix(); - } - return result; - } - - /** - * Calculate the path cost of dangling ends. - *

- *

- * A dangling end is the section of the read that falls before the left anchor or after the right anchor. - *

- * - * @param anchoring anchoring information of the read vs the haplotype assembly graph. - * @param hmm the PairHMM engine to use to calculate likelihoods. - * @param destination cost destination. - */ - private void danglingEndPathCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm, final Map> destination) { - if (anchoring.leftAnchorIndex > 0 || anchoring.leftAnchorIndex == 0 - && anchoring.leftAnchorVertex.hasAmbiguousSequence()) - leftDanglingEndPathCosts(anchoring, hmm,destination); - - if (anchoring.rightAnchorIndex < anchoring.read.getReadLength() - kmerSize) - rightDanglingEndPathCosts(anchoring, hmm, destination); - } - - /** - * Generates all relevant right dangling end path costs. - * - * @param anchoring the anchoring information for the read under analysis. - * @param hmm pair-hmm implementation to use to calculate likelihoods. It is assumed to be loaded with - * the same read as {@code anchoring} refers to. - * @param destination where the place the resulting read-segment-costs. - */ - private void rightDanglingEndPathCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm, - final Map> destination) { - final int readStart = anchoring.rightAnchorIndex; - final int readEnd = anchoring.read.getReadLength() - kmerSize + 1; - final Set> haplotypeRoutes = - extendsHaplotypeRoutesForwards(anchoring.rightAnchorVertex); - if (haplotypeRoutes.size() >= 2) - calculateCostForPathSet(anchoring.read, - haplotypeRoutes, hmm, readStart, readEnd, false, true,anchoring.rightAnchorVertex,null,destination); - - } - - /** - * Generates all relevant left dangling end path costs. - * - * @param anchoring the anchoring information for the read under analysis. - * @param hmm pair-hmm implementation to use to calculate likelihoods. It is assumed to be loaded with - * the same read as {@code anchoring} refers to. - * @param destination where the place the resulting read-segment-costs. 
- */ - private void leftDanglingEndPathCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm, - final Map> destination) { - final int readStart = -kmerSize; - final int readEnd = anchoring.leftAnchorIndex; - final Set> haplotypeRoutes = - extendsHaplotypeRoutesBackwards(anchoring.leftAnchorVertex); - if (haplotypeRoutes.size() >= 2) // if there is just one haplotype route there is no relevant variation in the dangling end. - calculateCostForPathSet(anchoring.read, haplotypeRoutes, hmm, - readStart, readEnd, false, true, null, anchoring.leftAnchorVertex, destination); - } - - /** - * Construct haplotype routes prefixes to an anchor vertex. - *

- *

- * The output should contain a route for each haplotype that includes the input anchor vertex. - * This route would be the prefix of the haplotype that finishes at that vertex. - *

- * - * @param anchorVertex the target anchor vertex. - * @return never {@code null}. - */ - private Set> extendsHaplotypeRoutesBackwards( - final MultiDeBruijnVertex anchorVertex) { - final Set> result = new HashSet<>(haplotypes.size()); - for (final MultiDeBruijnVertex parent : haplotypeGraph.incomingVerticesOf(anchorVertex)) - extendsHaplotypeRoutesFrom(parent, result, false); - return result; - } - - /** - * Construct haplotype routes suffix from an anchor vertex. - *

- *

- * The output should contain a route for each haplotype that includes the input anchor vertex. - * This route would be the suffix of the haplotype that starts at that vertex. - *

- * - * @param anchorVertex the target anchor vertex. - * @return never {@code null}. - */ - private Set> extendsHaplotypeRoutesForwards( - final MultiDeBruijnVertex anchorVertex) { - final Set> result = new HashSet<>(haplotypes.size()); - for (final MultiDeBruijnVertex parent : haplotypeGraph.outgoingVerticesOf(anchorVertex)) - extendsHaplotypeRoutesFrom(parent, result, true); - return result; - } - - /** - * Extends from a vertex considering path furcations that are part of some valid haplotype - *

- *

- * In other words, it will ignore subpaths that are not valid part of an assembled haplotype. - *

- * - * @param start start seed vertex. - * @param result destination for found extensions. - * @param forward whether to traverse edges forward or backwards. - */ - private void extendsHaplotypeRoutesFrom(final MultiDeBruijnVertex start, final Set> result, final boolean forward) { - final Set validHaplotypeRoutes = haplotypeGraph.getEnclosingHaplotypeRoutes(start); - if (validHaplotypeRoutes.size() == 0) return; - final Deque, Set>> queue = new LinkedList<>(); - queue.add(new Pair<>(new Route<>(start, haplotypeGraph), validHaplotypeRoutes)); - while (!queue.isEmpty()) { - final Pair, Set> current = queue.remove(); - final Route path = current.getFirst(); - final MultiDeBruijnVertex vertex = forward ? path.getLastVertex() : path.getFirstVertex(); - final Set validRoutes = current.getSecond(); - for (final HaplotypeRoute hr : validRoutes) { - final MultiDeBruijnVertex routeEndVertex = forward ? hr.getLastVertex() : hr.getFirstVertex(); - if (vertex.equals(routeEndVertex)) { - result.add(path); - break; - } - } - final Set nextVertices = forward ? haplotypeGraph.outgoingVerticesOf(vertex) : - haplotypeGraph.incomingVerticesOf(vertex); - for (final MultiDeBruijnVertex candidate : nextVertices) { - extendsHaplotypeRoutesFrom$ProcessCandidateExtendingVertex(forward, queue, path, validRoutes, candidate); - } - } - } - - /** - * Check on an candidate vertice to exted a path. - * - *

- * This method updates the traversal queue accordingly. - *

- * - * @param forward whether the extension is forward, or backwards. - * @param queue queue with open paths yet to be explored. - * @param path path extension to evaluate. - * @param validRoutes collection of valid haplotype routes used to discard non-informative extensions. - * @param candidate the candidate extending vertex. - */ - private void extendsHaplotypeRoutesFrom$ProcessCandidateExtendingVertex( - final boolean forward, - final Deque, Set>> queue, - final Route path, - final Set validRoutes, final MultiDeBruijnVertex candidate) { - final Set parentValidHaplotypes = haplotypeGraph.getEnclosingHaplotypeRoutes(candidate); - switch (parentValidHaplotypes.size()) { - case 0: - return; - case 1: - if (validRoutes.containsAll(parentValidHaplotypes)) - queue.add(new Pair<>(forward ? new Route<>(path, candidate) : new Route<>(candidate, path), parentValidHaplotypes)); - else - return; - break; - default: - if (parentValidHaplotypes.size() == validRoutes.size() && parentValidHaplotypes.containsAll(validRoutes)) { - queue.add(new Pair<>(forward ? new Route<>(path, candidate) : new Route<>(candidate, path), parentValidHaplotypes)); - } else { - final Set newValidHaplotypeRoutes = new HashSet<>(validRoutes.size()); - for (final HaplotypeRoute hr : validRoutes) - if (parentValidHaplotypes.contains(hr)) - newValidHaplotypeRoutes.add(hr); - if (newValidHaplotypeRoutes.size() == 0) - return; - queue.add(new Pair<>(forward ? new Route<>(path, candidate) : new Route<>(candidate, path), newValidHaplotypeRoutes)); - } - } - } - - public List getHaplotypeList() { - return new ArrayList<>(haplotypeGraph.getHaplotypes()); - } - - /** - * Returns the haplotype graph associated with this instance. 
- * @return never {@code null} - */ - public HaplotypeGraph getHaplotypeGraph() { - return haplotypeGraph; - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java deleted file mode 100644 index 82015d153..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ /dev/null @@ -1,1206 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import com.google.java.contract.Ensures; -import net.sf.samtools.SAMFileWriter; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; -import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils; -import org.broadinstitute.sting.gatk.filters.BadMateFilter; -import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.iterators.ReadTransformer; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; -import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; -import 
org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; -import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.fragments.FragmentCollection; -import org.broadinstitute.sting.utils.fragments.FragmentUtils; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.gvcf.GVCFWriter; -import org.broadinstitute.sting.utils.haplotype.*; -import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.pairhmm.PairHMM; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.vcf.*; - -import java.io.FileNotFoundException; -import java.io.PrintStream; -import java.util.*; - -/** - * Call SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region. Haplotypes are evaluated using an affine gap penalty Pair HMM. - * - *

Input

- *

- * Input bam file(s) from which to make calls - *

- * - *

Output

- *

- * VCF file with raw, unrecalibrated SNP and indel calls. - *

- * - *

Examples

- *
- *   java
- *     -jar GenomeAnalysisTK.jar
- *     -T HaplotypeCaller
- *     -R reference/human_g1k_v37.fasta
- *     -I sample1.bam [-I sample2.bam ...] \
- *     --dbsnp dbSNP.vcf \
- *     -stand_call_conf [50.0] \
- *     -stand_emit_conf 10.0 \
- *     [-L targets.interval_list]
- *     -o output.raw.snps.indels.vcf
- * 
- * - *

Caveats

- *
    - *
  • The system is under active and continuous development. All outputs, the underlying likelihood model, and command line arguments are likely to change often.
  • - *
- * - * @author rpoplin - * @since 8/22/11 - */ - -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) -@PartitionBy(PartitionType.LOCUS) -@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) -@ActiveRegionTraversalParameters(extension=100, maxRegion=300) -@ReadFilters({HCMappingQualityFilter.class}) -@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250) -public class HaplotypeCaller extends ActiveRegionWalker, Integer> implements AnnotatorCompatible, NanoSchedulable { - // ----------------------------------------------------------------------------------------------- - // general haplotype caller arguments - // ----------------------------------------------------------------------------------------------- - - /** - * A raw, unfiltered, highly sensitive callset in VCF format. - */ - @Output(doc="File to which variants should be written") - protected VariantContextWriter vcfWriter = null; - - @Hidden - @Advanced - @Argument(fullName="likelihoodCalculationEngine",shortName="likelihoodEngine", - doc="what likelihood calculation engine to use to calculate the relative likelihood of reads vs haplotypes",required=false) - protected LikelihoodCalculationEngine.Implementation likelihoodEngineImplementation = LikelihoodCalculationEngine.Implementation.PairHMM; - - @Hidden - @Advanced - @Argument(fullName="heterogeneousKmerSizeResolution",shortName="hksr",doc="how to solve heterogeneous kmer situations using the fast method",required=false) - protected HeterogeneousKmerSizeResolution heterogeneousKmerSizeResultion = HeterogeneousKmerSizeResolution.COMBO_MIN; - - @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false, defaultToStdout = false) - protected PrintStream graphWriter = null; - - /** - * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. 
- * Note that the output here does not include uninformative reads so that not every input read is emitted to the bam. - * - * Turning on this mode may result in serious performance cost for the HC. It's really only appropriate to - * use in specific areas where you want to better understand why the HC is making specific calls. - * - * The reads are written out containing a HC tag (integer) that encodes which haplotype each read best matches - * according to the haplotype caller's likelihood calculation. The use of this tag is primarily intended - * to allow good coloring of reads in IGV. Simply go to Color Alignments By > Tag and enter HC to more - * easily see which reads go with these haplotype. - * - * Note that the haplotypes (called or all, depending on mode) are emitted as single reads covering the entire - * active region, coming from read HC and a special read group. - * - * Note that only reads that are actually informative about the haplotypes are emitted. By informative we mean - * that there's a meaningful difference in the likelihood of the read coming from one haplotype compared to - * its next best haplotype. - * - * The best way to visualize the output of this mode is with IGV. Tell IGV to color the alignments by tag, - * and give it the HC tag, so you can see which reads support each haplotype. Finally, you can tell IGV - * to group by sample, which will separate the potential haplotypes from the reads. All of this can be seen - * in the following screenshot: https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png - * - */ - @Advanced - @Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false, defaultToStdout = false) - protected StingSAMFileWriter bamWriter = null; - private HaplotypeBAMWriter haplotypeBAMWriter; - - /** - * The type of BAM output we want to see. 
- */ - @Advanced - @Argument(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false) - public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES; - - /** - * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. - * dbSNP is not used in any way for the calculations themselves. - */ - @ArgumentCollection - protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); - private double log10GlobalReadMismappingRate; - - public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } - - /** - * If a call overlaps with a record from the provided comp track, the INFO field will be annotated - * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). - * Records that are filtered in the comp track will be ignored. - * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). - */ - @Advanced - @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) - public List> comps = Collections.emptyList(); - public List> getCompRodBindings() { return comps; } - - // The following are not used by the Unified Genotyper - public RodBinding getSnpEffRodBinding() { return null; } - public List> getResourceRodBindings() { return Collections.emptyList(); } - public boolean alwaysAppendDbsnpId() { return false; } - - /** - * Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations. - */ - @Advanced - @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) - protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"ClippingRankSumTest", "DepthPerSampleHC"})); - - /** - * Which annotations to exclude from output in the VCF file. 
Note that this argument has higher priority than the -A or -G arguments, - * so annotations will be excluded even if they are explicitly included with the other options. - */ - @Advanced - @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) - protected List annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); - - /** - * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. - */ - @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) - protected String[] annotationClassesToUse = { "Standard" }; - - @ArgumentCollection - private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection(); - - // ----------------------------------------------------------------------------------------------- - // arguments to control internal behavior of the read threading assembler - // ----------------------------------------------------------------------------------------------- - - @Advanced - @Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false) - protected List kmerSizes = Arrays.asList(10, 25); - - @Advanced - @Argument(fullName="dontIncreaseKmerSizesForCycles", shortName="dontIncreaseKmerSizesForCycles", doc="Should we disable the iterating over kmer sizes when graph cycles are detected?", required = false) - protected boolean dontIncreaseKmerSizesForCycles = false; - - @Advanced - @Argument(fullName="numPruningSamples", shortName="numPruningSamples", doc="The number of samples that must pass the minPuning factor in order for the path to be kept", required = false) - protected int numPruningSamples = 1; - - @Hidden - @Argument(fullName="dontRecoverDanglingTails", shortName="dontRecoverDanglingTails", 
doc="Should we disable dangling tail recovery in the read threading assembler?", required = false) - protected boolean dontRecoverDanglingTails = false; - - // ----------------------------------------------------------------------------------------------- - // general advanced arguments to control haplotype caller behavior - // ----------------------------------------------------------------------------------------------- - - @Advanced - @Argument(fullName="emitRefConfidence", shortName="ERC", doc="Emit experimental reference confidence scores", required = false) - protected ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE; - - public enum ReferenceConfidenceMode { - NONE, - BP_RESOLUTION, - GVCF - } - - /** - * The GQ partition intervals - * - * Should be a non-empty list of boundaries. For example, suppose this variable is - * - * [A, B, C] - * - * We would partition our hom-ref sites into the following bands: - * - * X < A - * A <= X < B - * B <= X < C - * X >= C - * - * The default bands with (1, 10, 20, 30, 40, 50) give the following GQ blocks: - * - * [0, 0] - * (0, 10] - * (10, 20] - * (20, 30] - * (30, 40] - * (40, 50] - * (50, 99] - * - * Note that in the GATK GQ values are capped at 99. - */ - @Advanced - @Argument(fullName="GVCFGQBands", shortName="GQB", doc="Emit experimental reference confidence scores", required = false) - protected List GVCFGQBands = Arrays.asList(5, 20, 60); - - /** - * This parameter determines the maximum size of an indel considered as potentially segregating in the - * reference model. It is used to eliminate reads from being indel informative at a site, and determines - * by that mechanism the certainty in the reference base. Conceptually, setting this parameter to - * X means that each informative read is consistent with any indel of size < X being present at a specific - * position in the genome, given its alignment to the reference. 
- */ - @Advanced - @Argument(fullName="indelSizeToEliminateInRefModel", shortName="ERCIS", doc="The size of an indel to check for in the reference model", required = false) - protected int indelSizeToEliminateInRefModel = 10; - - // ----------------------------------------------------------------------------------------------- - // general advanced arguments to control haplotype caller behavior - // ----------------------------------------------------------------------------------------------- - - /** - * Users should be aware that this argument can really affect the results of the variant calling and should exercise caution. - * Using a prune factor of 1 (or below) will prevent any pruning from the graph which is generally not ideal; it can make the - * calling much slower and even less accurate (because it can prevent effective merging of "tails" in the graph). Higher values - * tend to make the calling much faster, but also lowers the sensitivity of the results (because it ultimately requires higher - * depth to produce calls). - */ - @Advanced - @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with < X supporting kmers are pruned from the graph", required = false) - protected int MIN_PRUNE_FACTOR = 2; - - @Advanced - @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) - protected int gcpHMM = 10; - - /** - * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling - * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the - * read can map, but if its pair is too divergent then it may be unmapped and placed next to its mate, taking - * the mates contig and alignment start. If this flag is provided the haplotype caller will see such reads, - * and may make use of them in assembly and calling, where possible. 
- */ - @Hidden - @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false) - protected boolean includeUnmappedReads = false; - - @Advanced - @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false) - protected boolean USE_ALLELES_TRIGGER = false; - - @Advanced - @Argument(fullName="useFilteredReadsForAnnotations", shortName="useFilteredReadsForAnnotations", doc = "If specified, use the contamination-filtered read maps for the purposes of annotating variants", required=false) - protected boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = false; - - /** - * The phredScaledGlobalReadMismappingRate reflects the average global mismapping rate of all reads, regardless of their - * mapping quality. This term effects the probability that a read originated from the reference haplotype, regardless of - * its edit distance from the reference, in that the read could have originated from the reference haplotype but - * from another location in the genome. Suppose a read has many mismatches from the reference, say like 5, but - * has a very high mapping quality of 60. Without this parameter, the read would contribute 5 * Q30 evidence - * in favor of its 5 mismatch haplotype compared to reference, potentially enough to make a call off that single - * read for all of these events. With this parameter set to Q30, though, the maximum evidence against the reference - * that this (and any) read could contribute against reference is Q30. 
- * - * Set this term to any negative number to turn off the global mapping rate - */ - @Advanced - @Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false) - protected int phredScaledGlobalReadMismappingRate = 45; - - /** - * Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype - * considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the - * run of the haplotype caller we only take maxNumHaplotypesInPopulation paths from the graph, in order of their - * weights, no matter how many paths are possible to generate from the graph. Putting this number too low - * will result in dropping true variation because paths that include the real variant are not even considered. - */ - @Advanced - @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false) - protected int maxNumHaplotypesInPopulation = 128; - - @Advanced - @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false) - protected boolean mergeVariantsViaLD = false; - - // ----------------------------------------------------------------------------------------------- - // arguments for debugging / developing the haplotype caller - // ----------------------------------------------------------------------------------------------- - /** - * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. 
- */ - @Hidden - @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) - public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; - - @Hidden - @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) - protected String keepRG = null; - - @Hidden - @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false) - protected boolean justDetermineActiveRegions = false; - - @Hidden - @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false) - protected boolean dontGenotype = false; - - @Hidden - @Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. 
May cause fundamental problems with the assembly graph itself", required=false) - protected boolean errorCorrectKmers = false; - - @Advanced - @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) - protected boolean DEBUG; - - @Hidden - @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) - protected boolean debugGraphTransformations = false; - - @Hidden // TODO -- not currently useful - @Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false) - protected boolean useLowQualityBasesForAssembly = false; - - @Hidden - @Argument(fullName="dontTrimActiveRegions", shortName="dontTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false) - protected boolean dontTrimActiveRegions = false; - - @Hidden - @Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false) - protected boolean dontUseSoftClippedBases = false; - - @Hidden - @Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false) - protected boolean captureAssemblyFailureBAM = false; - - @Hidden - @Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate 
paths with multiple copies of the path sequenece rather than just the shortest paths", required = false) - protected boolean allowCyclesInKmerGraphToGeneratePaths = false; - - @Hidden - @Argument(fullName="noFpga", shortName="noFpga", doc="If provided, disables the use of the FPGA HMM implementation", required = false) - protected boolean noFpga = false; - - // Parameters to control read error correction - @Hidden - @Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false) - protected boolean errorCorrectReads = false; - - @Hidden - @Argument(fullName="kmerLengthForReadErrorCorrection", shortName="kmerLengthForReadErrorCorrection", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false) - protected int kmerLengthForReadErrorCorrection = 25; - - @Hidden - @Argument(fullName="minObservationsForKmerToBeSolid", shortName="minObservationsForKmerToBeSolid", doc = "A k-mer must be seen at least these times for it considered to be solid", required=false) - protected int minObservationsForKmerToBeSolid = 20; - - /** - * the maximum extent into the full active region extension that we're willing to go in genotyping our events - */ - @Hidden - @Argument(fullName="maxDiscARExtension", shortName="maxDiscARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for discovery", required=false) - protected int MAX_DISCOVERY_ACTIVE_REGION_EXTENSION = 25; - - @Hidden - @Argument(fullName="maxGGAARExtension", shortName="maxGGAARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for GGA mode", required=false) - protected int MAX_GGA_ACTIVE_REGION_EXTENSION = 
300; - - /** - * Include at least this many bases around an event for calling it - */ - @Hidden - @Argument(fullName="paddingAroundIndels", shortName="paddingAroundIndels", doc = "Include at least this many bases around an event for calling indels", required=false) - protected int PADDING_AROUND_OTHERS_FOR_CALLING = 150; - - @Hidden - @Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false) - protected int PADDING_AROUND_SNPS_FOR_CALLING = 20; - - /** - * Which PCR indel error model should we use when calculating likelihoods? If NONE is selected, then the default base - * insertion/deletion qualities will be used (or taken from the read if generated through the BaseRecalibrator). - * VERY IMPORTANT: when using PCR-free sequencing data we definitely recommend setting this argument to NONE. - */ - @Advanced - @Argument(fullName = "pcr_indel_model", shortName = "pcrModel", doc = "The PCR indel model to use", required = false) - public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE; - - // ----------------------------------------------------------------------------------------------- - // done with Haplotype caller parameters - // ----------------------------------------------------------------------------------------------- - - // the UG engines - private UnifiedGenotyperEngine UG_engine = null; - private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; - - // the assembly engine - private LocalAssemblyEngine assemblyEngine = null; - - // the likelihoods engine - private LikelihoodCalculationEngine likelihoodCalculationEngine = null; - - // the genotyping engine - private GenotypingEngine genotypingEngine = null; - - // fasta reference reader to supplement the edges of the reference sequence - protected CachingIndexedFastaSequenceFile referenceReader; - - // reference base 
padding size - private static final int REFERENCE_PADDING = 500; - - private ActiveRegionTrimmer trimmer = null; - - private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument - private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument - - // bases with quality less than or equal to this value are trimmed off the tails of the reads - private static final byte MIN_TAIL_QUALITY = 20; - - private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; - // the minimum length of a read we'd consider using for genotyping - private final static int MIN_READ_LENGTH = 10; - - private List samplesList = new ArrayList<>(); - - private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file - private final static Allele FAKE_ALT_ALLELE = Allele.create("", false); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file - - ReferenceConfidenceModel referenceConfidenceModel = null; - - // as determined experimentally Nov-Dec 2013 - protected final static GATKVCFIndexType OPTIMAL_GVCF_INDEX_TYPE = GATKVCFIndexType.LINEAR; - protected final static int OPTIMAL_GVCF_INDEX_PARAMETER = 128000; - - //--------------------------------------------------------------------------------------------------------------- - // - // initialize - // - //--------------------------------------------------------------------------------------------------------------- - - public void initialize() { - super.initialize(); - - if ( SCAC.AFmodel == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) - throw new UserException.BadArgumentValue("pnrm", "HaplotypeCaller doesn't currently support " + SCAC.AFmodel); - - // get all of the unique sample names - Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - samplesList.addAll( samples ); - // initialize the UnifiedGenotyper 
Engine which is used to call into the exact model - final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user - // HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine - UAC.OutputMode = SCAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES - ? UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES : UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); - - // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested - UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); - simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; - simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; - simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.CONTAMINATION_FRACTION = 0.0; - simpleUAC.CONTAMINATION_FRACTION_FILE = null; - simpleUAC.exactCallsLog = null; - UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); - - if( UAC.CONTAMINATION_FRACTION_FILE != null ) { - UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, samples, logger)); - } - - // initialize the output VCF header - final 
VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); - - Set headerInfo = new HashSet<>(); - - // all annotation fields from VariantAnnotatorEngine - headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); - // all callers need to add these standard annotation header lines - VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true, - VCFConstants.DOWNSAMPLED_KEY, - VCFConstants.MLE_ALLELE_COUNT_KEY, - VCFConstants.MLE_ALLELE_FREQUENCY_KEY); - // all callers need to add these standard FORMAT field header lines - VCFStandardHeaderLines.addStandardFormatLines(headerInfo, true, - VCFConstants.GENOTYPE_KEY, - VCFConstants.GENOTYPE_QUALITY_KEY, - VCFConstants.DEPTH_KEY, - VCFConstants.GENOTYPE_PL_KEY); - - // FILTER fields are added unconditionally as it's not always 100% certain the circumstances - // where the filters are used. For example, in emitting all sites the lowQual field is used - headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality")); - - referenceConfidenceModel = new ReferenceConfidenceModel(getToolkit().getGenomeLocParser(), samples, getToolkit().getSAMFileHeader(), indelSizeToEliminateInRefModel); - if ( emitReferenceConfidence() ) { - if ( samples.size() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently"); - headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines()); - if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) { - // a kluge to enforce the use of this indexing strategy - if (getToolkit().getArguments().variant_index_type != OPTIMAL_GVCF_INDEX_TYPE || - getToolkit().getArguments().variant_index_parameter != OPTIMAL_GVCF_INDEX_PARAMETER) { - throw new UserException.GVCFIndexException(OPTIMAL_GVCF_INDEX_TYPE, OPTIMAL_GVCF_INDEX_PARAMETER); - } - - try { - vcfWriter = new 
GVCFWriter(vcfWriter, GVCFGQBands); - } catch ( IllegalArgumentException e ) { - throw new UserException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage()); - } - } - } - - vcfWriter.writeHeader(new VCFHeader(headerInfo, samples)); - - try { - // fasta reference reader to supplement the edges of the reference sequence - referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); - } catch( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); - } - - // create and setup the assembler - assemblyEngine = new ReadThreadingAssembler(maxNumHaplotypesInPopulation, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples); - - assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); - assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR); - assemblyEngine.setDebug(DEBUG); - assemblyEngine.setDebugGraphTransformations(debugGraphTransformations); - assemblyEngine.setAllowCyclesInKmerGraphToGeneratePaths(allowCyclesInKmerGraphToGeneratePaths); - assemblyEngine.setRecoverDanglingTails(!dontRecoverDanglingTails); - - if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter); - if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1); - - // setup the likelihood calculation engine - if ( phredScaledGlobalReadMismappingRate < 0 ) phredScaledGlobalReadMismappingRate = -1; - - // configure the global mismapping rate - if ( phredScaledGlobalReadMismappingRate < 0 ) { - log10GlobalReadMismappingRate = - Double.MAX_VALUE; - } else { - log10GlobalReadMismappingRate = QualityUtils.qualToErrorProbLog10(phredScaledGlobalReadMismappingRate); - logger.info("Using global mismapping rate of " + phredScaledGlobalReadMismappingRate + " => " + log10GlobalReadMismappingRate + " in log10 likelihood units"); - } - - // create our likelihood calculation engine - likelihoodCalculationEngine = createLikelihoodCalculationEngine(); - - 
final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes(); - - genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, variantMerger ); - - if ( bamWriter != null ) { - // we currently do not support multi-threaded BAM writing, so exception out - if ( getToolkit().getTotalNumberOfThreads() > 1 ) - throw new UserException.BadArgumentValue("bamout", "Currently cannot emit a BAM file from the HaplotypeCaller in multi-threaded mode."); - haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader()); - } - - trimmer = new ActiveRegionTrimmer(DEBUG, PADDING_AROUND_SNPS_FOR_CALLING, PADDING_AROUND_OTHERS_FOR_CALLING, - UAC.GenotypingMode.equals(GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ? MAX_GGA_ACTIVE_REGION_EXTENSION : MAX_DISCOVERY_ACTIVE_REGION_EXTENSION, - getToolkit().getGenomeLocParser()); - } - - /** - * Instantiates the appropriate likelihood calculation engine. - * - * @return never {@code null}. - */ - private LikelihoodCalculationEngine createLikelihoodCalculationEngine() { - switch (likelihoodEngineImplementation) { - case PairHMM: - return new PairHMMLikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel ); - case GraphBased: - return new GraphBasedLikelihoodCalculationEngine( (byte)gcpHMM,log10GlobalReadMismappingRate,heterogeneousKmerSizeResultion,DEBUG,debugGraphTransformations); - case Random: - return new RandomLikelihoodCalculationEngine(); - default: - //Note: we do not include in the error message list as it is of no grand public interest. - throw new UserException("Unsupported likelihood calculation engine '" + likelihoodCalculationEngine + - "'. 
Please use one of the following instead: 'PairHMM' and 'GraphBased'."); - } - } - - //--------------------------------------------------------------------------------------------------------------- - // - // isActive - // - //--------------------------------------------------------------------------------------------------------------- - - // enable deletions in the pileup - @Override - public boolean includeReadsWithDeletionAtLoci() { return true; } - - // enable non primary and extended reads in the active region - @Override - public EnumSet desiredReadStates() { - if ( includeUnmappedReads ) { - throw new UserException.BadArgumentValue("includeUnmappedReads", "is not yet functional"); -// return EnumSet.of( -// ActiveRegionReadState.PRIMARY, -// ActiveRegionReadState.NONPRIMARY, -// ActiveRegionReadState.EXTENDED, -// ActiveRegionReadState.UNMAPPED -// ); - } else - return EnumSet.of( - ActiveRegionReadState.PRIMARY, - ActiveRegionReadState.NONPRIMARY, - ActiveRegionReadState.EXTENDED - ); - } - - @Override - @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"}) - public ActivityProfileState isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { - - if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - final VariantContext vcFromAllelesRod = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), false, logger, UG_engine.getUAC().alleles); - if( vcFromAllelesRod != null ) { - return new ActivityProfileState(ref.getLocus(), 1.0); - } - } - - if( USE_ALLELES_TRIGGER ) { - return new ActivityProfileState( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 
1.0 : 0.0 ); - } - - if( context == null || context.getBasePileup().isEmpty() ) - // if we don't have any data, just abort early - return new ActivityProfileState(ref.getLocus(), 0.0); - - final List noCall = Collections.singletonList(Allele.NO_CALL); // used to noCall all genotypes until the exact model is applied - final Map splitContexts = AlignmentContextUtils.splitContextBySampleName(context); - final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size()); - final MathUtils.RunningAverage averageHQSoftClips = new MathUtils.RunningAverage(); - for( final Map.Entry sample : splitContexts.entrySet() ) { - final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sample.getValue().getBasePileup(), ref.getBase(), (byte) 18, averageHQSoftClips).genotypeLikelihoods; - genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() ); - } - - final List alleles = Arrays.asList(FAKE_REF_ALLELE , FAKE_ALT_ALLELE); - final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL); - final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ); - - return new ActivityProfileState( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? 
ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileState.Type.NONE, averageHQSoftClips.mean() ); - } - - //--------------------------------------------------------------------------------------------------------------- - // - // map - // - //--------------------------------------------------------------------------------------------------------------- - - private final static List NO_CALLS = Collections.emptyList(); - @Override - public List map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) { - if ( justDetermineActiveRegions ) - // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work - return NO_CALLS; - - if( !originalActiveRegion.isActive() ) { - // Not active so nothing to do! - return referenceModelForNoVariation(originalActiveRegion, true); - } - - final List activeAllelesToGenotype = new ArrayList<>(); - if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - for ( final VariantContext vc : metaDataTracker.getValues(UG_engine.getUAC().alleles) ) { - if ( vc.isNotFiltered() ) { - activeAllelesToGenotype.add(vc); // do something with these VCs during GGA mode - } - } - // No alleles found in this region so nothing to do! - if ( activeAllelesToGenotype.isEmpty() ) { return referenceModelForNoVariation(originalActiveRegion, true); } - } else { - // No reads here so nothing to do! - if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); } - } - - // run the local assembler, getting back a collection of information on how we should proceed - final AssemblyResultSet assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype); - final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping(); - - // abort early if something is out of the acceptable range - if( ! 
assemblyResult.isVariationPresent() ) { - return referenceModelForNoVariation(originalActiveRegion, false); - } // only the reference haplotype remains so nothing else to do! - - if (dontGenotype) return NO_CALLS; // user requested we not proceed - - // filter out reads from genotyping which fail mapping quality based criteria - final Collection filteredReads = filterNonPassingReads( regionForGenotyping ); - final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); - - if( regionForGenotyping.size() == 0 ) { - // no reads remain after filtering so nothing else to do! - return referenceModelForNoVariation(originalActiveRegion, false); - } - - // evaluate each sample's reads against all haplotypes - //logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads"); - final List haplotypes = assemblyResult.getHaplotypeList(); - final Map> reads = splitReadsBySample( regionForGenotyping.getReads() ); - - // Calculate the likelihoods: CPU intesive part. - final Map stratifiedReadMap = - likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,reads); - - - - - - // Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there - // was a bad interaction between that selection and the marginalization that happens over each event when computing - // GLs. In particular, for samples that are heterozygous non-reference (B/C) the marginalization for B treats the - // haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included - // in the genotyping, but we lose information if we select down to a few haplotypes. 
[EB] - - final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine, - haplotypes, - stratifiedReadMap, - perSampleFilteredReadList, - assemblyResult.getFullReferenceWithPadding(), - assemblyResult.getPaddedReferenceLoc(), - regionForGenotyping.getLocation(), - getToolkit().getGenomeLocParser(), - metaDataTracker, - activeAllelesToGenotype ); - - // TODO -- must disable if we are doing NCT, or set the output type of ! presorted - if ( bamWriter != null ) { - haplotypeBAMWriter.writeReadsAlignedToHaplotypes( - haplotypes, - assemblyResult.getPaddedReferenceLoc(), - haplotypes, - calledHaplotypes.getCalledHaplotypes(), - stratifiedReadMap); - } - - if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } - - if ( emitReferenceConfidence() ) { - if ( calledHaplotypes.getCalls().isEmpty() ) { - // no called all of the potential haplotypes - return referenceModelForNoVariation(originalActiveRegion, false); - } else - return referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(), - calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping, - stratifiedReadMap, calledHaplotypes.getCalls()); - } else { - return calledHaplotypes.getCalls(); - } - } - - /** - * High-level function that runs the assembler on the active region reads, - * returning a data structure with the resulting information needed - * for further HC steps - * - * @param activeRegion the region we should assemble - * @param activeAllelesToGenotype additional alleles we might need to genotype (can be empty) - * @return the AssemblyResult describing how to proceed with genotyping - */ - protected AssemblyResultSet assembleReads(final ActiveRegion activeRegion, final List activeAllelesToGenotype) { - // Create the reference haplotype which is the bases from the reference that make up the active region - 
finalizeActiveRegion(activeRegion); // handle overlapping fragments, clip adapter and low qual tails - - final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING); - final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); - final Haplotype referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc); - - // Create ReadErrorCorrector object if requested - will be used within assembly engine. - ReadErrorCorrector readErrorCorrector = null; - if (errorCorrectReads) - readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, DEBUG, fullReferenceWithPadding); - - try { - final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector ); - assemblyResultSet.debugDump(logger); - - if ( ! 
dontTrimActiveRegions ) { - final ActiveRegion trimmedActiveRegion = trimActiveRegion(assemblyResultSet,activeAllelesToGenotype); - if (trimmedActiveRegion != null) - return trimAssemblyResultSet(assemblyResultSet, trimmedActiveRegion); - else { - assemblyResultSet.resetVariationPresent(); - return assemblyResultSet; - } - } else - return assemblyResultSet; - } catch ( final Exception e ) { - // Capture any exception that might be thrown, and write out the assembly failure BAM if requested - if ( captureAssemblyFailureBAM ) { - final SAMFileWriter writer = ReadUtils.createSAMFileWriterWithCompression(getToolkit().getSAMFileHeader(), true, "assemblyFailure.bam", 5); - for ( final GATKSAMRecord read : activeRegion.getReads() ) { - writer.addAlignment(read); - } - writer.close(); - } - throw e; - } - } - - /** - * Helper function to create the reference haplotype out of the active region and a padded loc - * @param activeRegion the active region from which to generate the reference haplotype - * @param paddedReferenceLoc the GenomeLoc which includes padding and shows how big the reference haplotype should be - * @return a non-null haplotype - */ - private Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final GenomeLoc paddedReferenceLoc) { - return ReferenceConfidenceModel.createReferenceHaplotype(activeRegion, activeRegion.getActiveRegionReference(referenceReader), paddedReferenceLoc); - } - - /** - * Create an ref model result (ref model or no calls depending on mode) for an active region without any variation - * (not is active, or assembled to just ref) - * - * @param region the region to return a no-variation result - * @param needsToBeFinalized should the region be finalized before computing the ref model (should be false if already done) - * @return a list of variant contexts (can be empty) to emit for this ref region - */ - private List referenceModelForNoVariation(final ActiveRegion region, final boolean needsToBeFinalized) { - if ( 
emitReferenceConfidence() ) { - if ( needsToBeFinalized ) finalizeActiveRegion(region); - filterNonPassingReads(region); // TODO -- remove when filtering is done in finalizeActiveRegion - final GenomeLoc paddedLoc = region.getExtendedLoc(); - final Haplotype refHaplotype = createReferenceHaplotype(region, paddedLoc); - final List haplotypes = Collections.singletonList(refHaplotype); - return referenceConfidenceModel.calculateRefConfidence(refHaplotype, haplotypes, - paddedLoc, region, createDummyStratifiedReadMap(refHaplotype, samplesList, region), - Collections.emptyList()); - } else { - return NO_CALLS; - } - } - - /** - * Create a context that maps each read to the reference haplotype with log10 L of 0 - * @param refHaplotype a non-null reference haplotype - * @param samples a list of all samples - * @param region the active region containing reads - * @return a map from sample -> PerReadAlleleLikelihoodMap that maps each read to ref - */ - public static Map createDummyStratifiedReadMap(final Haplotype refHaplotype, - final List samples, - final ActiveRegion region) { - final Allele refAllele = Allele.create(refHaplotype, true); - - final Map map = new LinkedHashMap<>(1); - for ( final Map.Entry> entry : splitReadsBySample(samples, region.getReads()).entrySet() ) { - final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); - for ( final GATKSAMRecord read : entry.getValue() ) { - likelihoodMap.add(read, refAllele, 0.0); - } - map.put(entry.getKey(), likelihoodMap); - } - - return map; - } - - private ActiveRegion trimActiveRegion(final AssemblyResultSet resultSet, final Collection activeAllelesToGenotype) { - if ( DEBUG ) logger.info("Trimming active region " + resultSet.getRegionForGenotyping() + " with " + resultSet.getHaplotypeCount() + " haplotypes"); - final List haplotypeList = resultSet.getHaplotypeList(); - final ActiveRegion originalGenotypingRegion = resultSet.getRegionForGenotyping(); - 
EventMap.buildEventMapsForHaplotypes(haplotypeList, resultSet.getFullReferenceWithPadding(), resultSet.getPaddedReferenceLoc(), DEBUG); - final TreeSet allVariantsWithinFullActiveRegion = EventMap.getAllVariantContexts(haplotypeList); - allVariantsWithinFullActiveRegion.addAll(activeAllelesToGenotype); - - final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalGenotypingRegion, allVariantsWithinFullActiveRegion,false); - if ( trimmedActiveRegion == null ) { - // there were no variants found within the active region itself, so just return null - if ( DEBUG ) logger.info("No variation found within the active region, skipping the region :-)"); - return null; - } - - // trim down the reads and add them to the trimmed active region - final List trimmedReads = new ArrayList<>(originalGenotypingRegion.getReads().size()); - for( final GATKSAMRecord read : originalGenotypingRegion.getReads() ) { - final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion( read, - trimmedActiveRegion.getExtendedLoc().getStart(), trimmedActiveRegion.getExtendedLoc().getStop() ); - if( trimmedActiveRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) - trimmedReads.add(clippedRead); - } - trimmedActiveRegion.clearReads(); - trimmedActiveRegion.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads)); - - return trimmedActiveRegion; - } - - - /** - * Trims a assembly result set according to the active-region trimming. - * - * @param resultSet the original assembly result set. - * @param trimmedActiveRegion the trimmed active region to trim to. - * @return the assembly result set trimmed. 
- */ - private AssemblyResultSet trimAssemblyResultSet(final AssemblyResultSet resultSet, final ActiveRegion trimmedActiveRegion) { - if ( DEBUG ) logger.info("Trimming active region " + resultSet.getRegionForGenotyping() + " with " + resultSet.getHaplotypeCount() + " haplotypes"); - - final List haplotypeList = resultSet.getHaplotypeList(); - - // trim down the haplotypes - final Map originalByTrimmedHaplotypes = new HashMap<>(); - - for ( final Haplotype h : haplotypeList ) { - final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc()); - - if ( trimmed != null ) { - if (originalByTrimmedHaplotypes.containsKey(trimmed)) { - if (trimmed.isReference()) { - originalByTrimmedHaplotypes.remove(trimmed); - originalByTrimmedHaplotypes.put(trimmed, h); - } - } else - originalByTrimmedHaplotypes.put(trimmed,h); - } else if (h.isReference()) - throw new IllegalStateException("trimming eliminates the reference haplotype"); - else if ( DEBUG ) { - logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() + - " because it starts with or ends with an insertion or deletion when trimmed to " + - trimmedActiveRegion.getExtendedLoc()); - } - } - - // create the final list of trimmed haplotypes - final List trimmedHaplotypes = new ArrayList<>(originalByTrimmedHaplotypes.keySet()); - - // resort the trimmed haplotypes. 
- Collections.sort(trimmedHaplotypes,new HaplotypeSizeAndBaseComparator()); - final Map sortedOriginalByTrimmedHaplotypes = new LinkedHashMap<>(trimmedHaplotypes.size()); - for (final Haplotype trimmed : trimmedHaplotypes) - sortedOriginalByTrimmedHaplotypes.put(trimmed,originalByTrimmedHaplotypes.get(trimmed)); - - - if ( DEBUG ) logger.info("Trimmed region to " + trimmedActiveRegion.getLocation() + " size " + - trimmedActiveRegion.getLocation().size() + " reduced number of haplotypes from " + - haplotypeList.size() + " to only " + trimmedHaplotypes.size()); - if ( DEBUG ) - for ( final Haplotype remaining: trimmedHaplotypes ) - logger.info("Remains: " + remaining + " cigar " + remaining.getCigar()); - - return resultSet.trimTo(trimmedActiveRegion,sortedOriginalByTrimmedHaplotypes); - } - - //--------------------------------------------------------------------------------------------------------------- - // - // reduce - // - //--------------------------------------------------------------------------------------------------------------- - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(List callsInRegion, Integer numCalledRegions) { - for( final VariantContext call : callsInRegion ) { - vcfWriter.add( call ); - } - return (callsInRegion.isEmpty() ? 0 : 1) + numCalledRegions; - } - - @Override - public void onTraversalDone(Integer result) { - if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it - referenceConfidenceModel.close(); - //TODO remove the need to call close here for debugging, the likelihood output stream should be managed - //TODO (open & close) at the walker, not the engine. 
- //likelihoodCalculationEngine.close(); - logger.info("Ran local assembly on " + result + " active regions"); - } - - //--------------------------------------------------------------------------------------------------------------- - // - // private helper functions - // - //--------------------------------------------------------------------------------------------------------------- - - private void finalizeActiveRegion( final ActiveRegion activeRegion ) { - if (activeRegion.isFinalized()) return; - - if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } - - // Loop through the reads hard clipping the adaptor and low quality tails - final List readsToUse = new ArrayList<>(activeRegion.getReads().size()); - for( final GATKSAMRecord myRead : activeRegion.getReads() ) { - GATKSAMRecord clippedRead; - if (errorCorrectReads) - clippedRead = ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION ); - else if (useLowQualityBasesForAssembly) - clippedRead = myRead; - else // default case: clip low qual ends of reads - clippedRead= ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY ); - - if ( dontUseSoftClippedBases || ! ReadUtils.hasWellDefinedFragmentSize(clippedRead) ) { - // remove soft clips if we cannot reliably clip off adapter sequence or if the user doesn't want to use soft clips at all - clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead); - } else { - // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches - // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't - // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion - // TODO -- w.r.t. the reference. 
What really needs to happen is that kmers that occur before the - // TODO -- reference haplotype start must be removed - clippedRead = ReadClipper.revertSoftClippedBases(clippedRead); - } - - clippedRead = ( clippedRead.getReadUnmappedFlag() ? clippedRead : ReadClipper.hardClipAdaptorSequence( clippedRead ) ); - if( !clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0 ) { - clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() ); - if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { - //logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd()); - readsToUse.add(clippedRead); - } - } - } - - // TODO -- Performance optimization: we partition the reads by sample 4 times right now; let's unify that code. - - final List downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart); - - // handle overlapping read pairs from the same fragment - cleanOverlappingReadPairs(downsampledReads); - - activeRegion.clearReads(); - activeRegion.addAll(downsampledReads); - activeRegion.setFinalized(true); - } - - private Set filterNonPassingReads( final ActiveRegion activeRegion ) { - final Set readsToRemove = new LinkedHashSet<>(); - for( final GATKSAMRecord rec : activeRegion.getReads() ) { - if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { - readsToRemove.add(rec); - } - } - activeRegion.removeAll( readsToRemove ); - return readsToRemove; - } - - private GenomeLoc getPaddedLoc( final ActiveRegion activeRegion ) { - final int padLeft = Math.max(activeRegion.getExtendedLoc().getStart()-REFERENCE_PADDING, 1); - final int padRight = 
Math.min(activeRegion.getExtendedLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getExtendedLoc().getContig()).getSequenceLength()); - return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getExtendedLoc().getContig(), padLeft, padRight); - } - - private Map> splitReadsBySample( final Collection reads ) { - return splitReadsBySample(samplesList, reads); - } - - public static Map> splitReadsBySample( final List samplesList, final Collection reads ) { - final Map> returnMap = new HashMap<>(); - for( final String sample : samplesList) { - List readList = returnMap.get( sample ); - if( readList == null ) { - readList = new ArrayList<>(); - returnMap.put(sample, readList); - } - } - for( final GATKSAMRecord read : reads ) { - returnMap.get(read.getReadGroup().getSample()).add(read); - } - - return returnMap; - } - - /** - * Are we emitting a reference confidence in some form, or not? - * @return true if we are - */ - private boolean emitReferenceConfidence(){ - return emitReferenceConfidence != ReferenceConfidenceMode.NONE; - } - - /** - * Clean up reads/bases that overlap within read pairs - * - * @param reads the list of reads to consider - */ - private void cleanOverlappingReadPairs(final List reads) { - for ( final List perSampleReadList : splitReadsBySample(reads).values() ) { - final FragmentCollection fragmentCollection = FragmentUtils.create(perSampleReadList); - for ( final List overlappingPair : fragmentCollection.getOverlappingPairs() ) - FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair); - } - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java deleted file mode 100644 index 01ab421b3..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java +++ /dev/null @@ 
-1,467 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.variant.vcf.VCFHeaderLine; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; 
-import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; - -import java.util.*; - -/** - * Haplotype-based resolution of variants in 2 different eval files. - * - *

- * HaplotypeResolver is a tool that takes 2 VCF files and constructs haplotypes based on the variants inside them. - * From that, it can resolve potential differences in variant calls that are inherently the same (or similar) variants. - * Records are annotated with the set and status attributes. - * - *

Input

- *

- * 2 variant files to resolve. - *

- * - *

Output

- *

- * A single consensus VCF. - *

- * - *

Examples

- *
- * java -Xmx1g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T HaplotypeResolver \
- *   -V:v1 input1.vcf \
- *   -V:v2 input2.vcf \
- *   -o output.vcf
- * 
- * - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) -@Reference(window=@Window(start=-HaplotypeResolver.ACTIVE_WINDOW,stop= HaplotypeResolver.ACTIVE_WINDOW)) -public class HaplotypeResolver extends RodWalker { - - protected static final String INTERSECTION_SET = "intersection"; - protected static final String SAME_STATUS = "same"; - protected static final String SOME_ALLELES_MATCH_STATUS = "someAllelesMatch"; - protected static final String SAME_START_DIFFERENT_ALLELES_STATUS = "sameStartDifferentAlleles"; - protected static final String SAME_BY_HAPLOTYPE_STATUS = "sameByHaplotype"; - protected static final String ONE_ALLELE_SUBSET_OF_OTHER_STATUS = "OneAlleleSubsetOfOther"; - protected static final String OVERLAPPING_EVENTS_STATUS = "overlappingEvents"; - - protected final static int MAX_DISTANCE_BETWEEN_MERGED_RECORDS = 50; - protected final static int MAX_HAPLOTYPE_TO_CONSIDER = 1000; - protected final static int MAX_VARIANT_SIZE_TO_CONSIDER = 100; - protected final static int ACTIVE_WINDOW = MAX_HAPLOTYPE_TO_CONSIDER + MAX_VARIANT_SIZE_TO_CONSIDER; - - @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) - public List> variants; - - @Output(doc="File to which variants should be written") - protected VariantContextWriter baseWriter = null; - private VariantContextWriter writer; - - /** - * Set to 'null' if you don't want the set field emitted. - */ - @Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false) - protected String SET_KEY = "set"; - - /** - * Set to 'null' if you don't want the status field emitted. 
- */ - @Argument(fullName="statusKey", shortName="statusKey", doc="Key used in the INFO key=value tag emitted describing the extent to which records match", required=false) - protected String STATUS_KEY = "status"; - - private final LinkedList queue = new LinkedList(); - private String source1, source2; - private final List sourceVCs1 = new ArrayList(); - private final List sourceVCs2 = new ArrayList(); - - - private class VCcontext { - public final Collection vcs; - public final GenomeLoc loc; - public final ReferenceContext ref; - - public VCcontext(final Collection vcs, final ReferenceContext ref) { - this.vcs = vcs; - this.loc = getToolkit().getGenomeLocParser().createGenomeLoc(vcs.iterator().next()); - this.ref = ref; - } - } - - public void initialize() { - - if ( variants.size() != 2 ) { - throw new UserException.BadArgumentValue("variant", "this tool requires exactly 2 input variant files"); - } - source1 = variants.get(0).getName(); - source2 = variants.get(1).getName(); - - if ( SET_KEY.toLowerCase().equals("null") ) - SET_KEY = null; - if ( STATUS_KEY.toLowerCase().equals("null") ) - STATUS_KEY = null; - - // for now, INFO and FORMAT fields are not propagated to the output VCF (so they aren't put into the header) - Set headerLines = new HashSet(); - if ( SET_KEY != null ) - headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record")); - if ( STATUS_KEY != null ) - headerLines.add(new VCFInfoHeaderLine(STATUS_KEY, 1, VCFHeaderLineType.String, "Extent to which records match")); - final VCFHeader vcfHeader = new VCFHeader(headerLines, Collections.emptySet()); - baseWriter.writeHeader(vcfHeader); - writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, ACTIVE_WINDOW); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - final Collection VCs = tracker.getValues(variants, context.getLocation()); - if ( 
VCs.size() == 0 ) - return 0; - - final VCcontext vc = new VCcontext(VariantContextUtils.sitesOnlyVariantContexts(VCs), ref); - - // TODO -- what should we do about filtered records? - - if ( !queue.isEmpty() ) { - - final VCcontext previous = queue.getLast(); - if ( !previous.loc.onSameContig(vc.loc) || - previous.loc.distance(vc.loc) > MAX_DISTANCE_BETWEEN_MERGED_RECORDS || - queue.getFirst().loc.distance(vc.loc) > MAX_HAPLOTYPE_TO_CONSIDER ) { - purgeQueue(); - } - } - - queue.addLast(vc); - return 0; - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - public void onTraversalDone(Integer result) { - if ( !queue.isEmpty() ) - purgeQueue(); - writer.close(); - } - - private void purgeQueue() { - - final ReferenceContext refContext = queue.getFirst().ref; - - // divide them up by source - while ( !queue.isEmpty() ) { - VCcontext context = queue.removeFirst(); - for ( final VariantContext vc: context.vcs ) { - if ( vc.getSource().equals(source1) ) - sourceVCs1.add(vc); - else - sourceVCs2.add(vc); - } - } - - writeAndPurgeAllEqualVariants(sourceVCs1, sourceVCs2, SAME_STATUS); - - if ( sourceVCs1.isEmpty() ) { - writeAll(sourceVCs2, source2, null); - } else if ( sourceVCs2.isEmpty() ) { - writeAll(sourceVCs1, source1, null); - } else { - resolveByHaplotype(refContext); - } - - // allow for GC of the data - sourceVCs1.clear(); - sourceVCs2.clear(); - } - - private void writeAll(final List sourceVCs, final String set, final String status) { - for ( final VariantContext vc : sourceVCs ) { - writeOne(vc, set, status); - } - } - - private void writeOne(final VariantContext vc, final String set, final String status) { - final Map attrs = new HashMap(vc.getAttributes()); - if ( SET_KEY != null && set != null ) - attrs.put(SET_KEY, set); - if ( STATUS_KEY != null && status != null ) - attrs.put(STATUS_KEY, status); - writer.add(new VariantContextBuilder(vc).attributes(attrs).make()); - } - - 
private void writeAndPurgeAllEqualVariants(final List sourceVCs1, final List sourceVCs2, final String status) { - - int currentIndex1 = 0, currentIndex2 = 0; - int size1 = sourceVCs1.size(), size2 = sourceVCs2.size(); - VariantContext current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null); - VariantContext current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null); - - while ( current1 != null && current2 != null ) { - - final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1); - final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2); - - if ( loc1.equals(loc2) || - (loc1.getStart() == loc2.getStart() && (current1.getAlternateAlleles().size() > 1 || current2.getAlternateAlleles().size() > 1)) ) { - // test the alleles - if ( determineAndWriteOverlap(current1, current2, status) ) { - sourceVCs1.remove(currentIndex1); - sourceVCs2.remove(currentIndex2); - size1--; - size2--; - } else { - currentIndex1++; - currentIndex2++; - } - current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null); - current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null); - } else if ( loc1.isBefore(loc2) ) { - currentIndex1++; - current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null); - } else { - currentIndex2++; - current2 = (currentIndex2 < size2 ? 
sourceVCs2.get(currentIndex2): null); - } - } - } - - private boolean determineAndWriteOverlap(final VariantContext vc1, final VariantContext vc2, final String status) { - final int allelesFrom1In2 = findOverlap(vc1, vc2); - final int allelesFrom2In1 = findOverlap(vc2, vc1); - final int totalAllelesIn1 = vc1.getAlternateAlleles().size(); - final int totalAllelesIn2 = vc2.getAlternateAlleles().size(); - - final boolean allAllelesFrom1Overlap = allelesFrom1In2 == totalAllelesIn1; - final boolean allAllelesFrom2Overlap = allelesFrom2In1 == totalAllelesIn2; - - boolean thereIsOverlap = true; - - if ( allAllelesFrom1Overlap && allAllelesFrom2Overlap ) { - writeOne(vc1, INTERSECTION_SET, status); - } else if ( allAllelesFrom1Overlap ) { - writeOne(vc2, INTERSECTION_SET, source1 + "IsSubsetOf" + source2); - } else if ( allAllelesFrom2Overlap ) { - writeOne(vc1, INTERSECTION_SET, source2 + "IsSubsetOf" + source1); - } else if ( allelesFrom1In2 > 0 ) { - writeOne(vc1, INTERSECTION_SET, SOME_ALLELES_MATCH_STATUS); - } else if ( totalAllelesIn1 > 1 || totalAllelesIn2 > 1 ) { // we don't handle multi-allelics in the haplotype-based reconstruction - writeOne(vc1, INTERSECTION_SET, SAME_START_DIFFERENT_ALLELES_STATUS); - } else { - thereIsOverlap = false; - } - - return thereIsOverlap; - } - - private static int findOverlap(final VariantContext target, final VariantContext comparison) { - int overlap = 0; - for ( final Allele allele : target.getAlternateAlleles() ) { - if ( comparison.hasAlternateAllele(allele) ) - overlap++; - } - return overlap; - } - - private static final double SW_MATCH = 4.0; - private static final double SW_MISMATCH = -10.0; - private static final double SW_GAP = -25.0; - private static final double SW_GAP_EXTEND = -1.3; - private void resolveByHaplotype(final ReferenceContext refContext) { - - final byte[] source1Haplotype = generateHaplotype(sourceVCs1, refContext); - final byte[] source2Haplotype = generateHaplotype(sourceVCs2, refContext); - - final 
SWPairwiseAlignment swConsensus1 = new SWPairwiseAlignment( refContext.getBases(), source1Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); - final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( refContext.getBases(), source2Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); - - // protect against SW failures - if( swConsensus1.getCigar().toString().contains("S") || swConsensus1.getCigar().getReferenceLength() < 20 || - swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() < 20 ) { - // TODO -- handle errors appropriately - logger.debug("Bad SW alignment; aborting at " + refContext.getLocus()); - return; - } - - // order results by start position - final TreeMap source1Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype, false, 0, swConsensus1.getCigar()), refContext.getBases(), refContext.getWindow(), source1)); - final TreeMap source2Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype, false, 0, swConsensus2.getCigar()), refContext.getBases(), refContext.getWindow(), source2)); - if ( source1Map.size() == 0 || source2Map.size() == 0 ) { - // TODO -- handle errors appropriately - logger.debug("No source alleles; aborting at " + refContext.getLocus()); - return; - } - - // create lists and test for equality - final List source1Alleles = new ArrayList(source1Map.values()); - final List source2Alleles = new ArrayList(source2Map.values()); - - writeAndPurgeAllEqualVariants(source1Alleles, source2Alleles, SAME_BY_HAPLOTYPE_STATUS); - if ( source1Alleles.isEmpty() ) { - writeAll(source2Alleles, source2, null); - } else if ( source2Alleles.isEmpty() ) { - writeAll(source1Alleles, source1, null); - } else { - writeDifferences(source1Alleles, source2Alleles); - } - } - - private byte[] generateHaplotype(final List sourceVCs, final ReferenceContext refContext) { - - final StringBuilder sb = new StringBuilder(); - - final int 
startPos = refContext.getWindow().getStart(); - int currentPos = startPos; - final byte[] reference = refContext.getBases(); - - for ( final VariantContext vc : sourceVCs ) { - // add any missing reference context - int vcStart = vc.getStart(); - final int refAlleleLength = vc.getReference().length(); - if ( refAlleleLength == vc.getEnd() - vc.getStart() ) // this is a deletion (whereas for other events the padding base isn't part of the position) - vcStart++; - - while ( currentPos < vcStart ) - sb.append((char)reference[currentPos++ - startPos]); - - // add the alt allele - sb.append(vc.getAlternateAllele(0).getBaseString()); - - // skip the reference allele - currentPos += refAlleleLength; - } - // add any missing reference context - final int stopPos = refContext.getWindow().getStop(); - while ( currentPos < stopPos ) - sb.append((char)reference[currentPos++ - startPos]); - - return sb.toString().getBytes(); - } - - private void writeDifferences(final List source1Alleles, final List source2Alleles) { - int currentIndex1 = 0, currentIndex2 = 0; - final int size1 = source1Alleles.size(), size2 = source2Alleles.size(); - VariantContext current1 = source1Alleles.get(0); - VariantContext current2 = source2Alleles.get(0); - - while ( currentIndex1 < size1 || currentIndex2 < size2 ) { - if ( current1 == null ) { - writeOne(current2, source2, null); - currentIndex2++; - current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null); - } else if ( current2 == null ) { - writeOne(current1, source1, null); - currentIndex1++; - current1 = (currentIndex1 < size1 ? 
source1Alleles.get(currentIndex1): null); - } else { - - final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1); - final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2); - - if ( loc1.getStart() == loc2.getStart() || loc1.overlapsP(loc2) ) { - String status; - if ( loc1.getStart() == loc2.getStart() ) { - final String allele1 = current1.getAlternateAllele(0).getBaseString(); - final String allele2 = current2.getAlternateAllele(0).getBaseString(); - if ( allele1.indexOf(allele2) != -1 || allele2.indexOf(allele1) != -1 ) - status = ONE_ALLELE_SUBSET_OF_OTHER_STATUS; - else - status = SAME_START_DIFFERENT_ALLELES_STATUS; - } else { - status = OVERLAPPING_EVENTS_STATUS; - } - - writeOne(current1, INTERSECTION_SET, status); - currentIndex1++; - currentIndex2++; - current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null); - current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null); - } else if ( loc1.isBefore(loc2) ) { - writeOne(current1, source1, null); - currentIndex1++; - current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null); - } else { - writeOne(current2, source2, null); - currentIndex2++; - current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null); - } - } - } - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequence.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequence.java deleted file mode 100644 index a6c35bce0..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequence.java +++ /dev/null @@ -1,461 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - - -import com.sun.istack.internal.NotNull; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.haplotype.Haplotype; - -import java.lang.reflect.Array; -import java.util.*; - -/** - * Represent a sequence of kmers where any two consecutive kmers overlap in kmer length - 1 elements. - * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.com> - */ -public class KmerSequence implements List { - private final byte[] sequence; - private final int start; - private final int size; - private final int kmerSize; - private final int rawLength; - - /** - * Creates a kmer sequence from a read's sequence. - * - * @param read the read to represent as a sequence of kmers. - * @param kmerSize the kmer size. - */ - public KmerSequence(final SAMRecord read, final int kmerSize) { - this(read.getReadBases(), kmerSize); - } - - /** - * Creates a kmer sequence from a haplotype's sequence. - * - * @param hap the haplotype to represent as a sequence of kmers. - * @param kmerSize the kmer size. - */ - public KmerSequence(final Haplotype hap, final int kmerSize) { - this(hap.getBases(), kmerSize); - } - - /** - * Creates a kmer sequence out of a byte sequence. 
- * - * @param sequence the byte array to represent as a kmer sequence. - * @param kmerSize the kmer size. - */ - public KmerSequence(final byte[] sequence, final int kmerSize) { - this(sequence,0,Math.max(0,sequence.length - kmerSize + 1),kmerSize, sequence.length); - } - - - /** - * Creates a kmer sequence out of a range of a byte array - * - * @param sequence the input array. - * @param start inclusive first position of the array that maps to the first position in the first kmer. - * @param size number kmers in the output. - * @param kmerSize kmer length in bases. - * @param rawLength the of the range in bases. - */ - protected KmerSequence(final byte[] sequence, final int start, final int size, final int kmerSize, final int rawLength) { - if (sequence == null) { - throw new IllegalArgumentException("start must be 0 or greater"); - } - if (rawLength > sequence.length - start) { - throw new IllegalArgumentException("the raw sequence length goes beyond the array capacity"); - } - if (size < 0) { - throw new IllegalArgumentException("the length cannot be negative"); - } - if (start < 0) { - throw new IllegalArgumentException("start must be 0 or greater"); - } - if (size > 0 && size + kmerSize - 1 > rawLength) { - throw new IllegalArgumentException( - String.format("the kmerSize (%d) + size (%d) - 1 cannot be larger than rawLength (%d)",kmerSize,size,rawLength) ); - } - this.sequence = sequence; - this.start = start; - this.size = size; - this.kmerSize = kmerSize; - this.rawLength = rawLength; - } - - public int kmerSize() { - return kmerSize; - } - - public KmerSequence subsequence(final int from, final int to) { - if (from < 0 || from > to) { - throw new IllegalArgumentException(); - } - if (to > size) { - throw new IllegalArgumentException(); - } - return new KmerSequence(sequence,this.start + from,to - from,kmerSize,rawLength - from - (size - to)); - } - - - @Override - public int size() { - return size; - } - - @Override - public boolean isEmpty() { - return 
size == 0; - } - - @Override - public boolean contains(final Object o) { - if (o instanceof Kmer) { - if (o instanceof MyKmer) { - final MyKmer k = (MyKmer) o; - if (k.bases == sequence && k.start >= start && k.length == kmerSize && k.start < start + size) { - return true; - } - } - final Kmer k = (Kmer) o; - if (k.length != kmerSize) { - return false; - } - for (int i = 0; i < size; i++) { - int j; - for (j = 0; j < kmerSize; j++) { - if (sequence[start + i + j] != k.bases[k.start + j]) { - break; - } - } - if (j == kmerSize) { - return true; - } - } - return false; - } else { - return false; - } - } - - @Override - @NotNull - public Iterator iterator() { - return new Iterator() { - - private int offset = 0; - - @Override - public boolean hasNext() { - return offset < size; - } - - @Override - public Kmer next() { - return new Kmer(sequence,start + offset,kmerSize); - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - }; - } - - @NotNull - @Override - public Object[] toArray() { - return toArray(new Kmer[size()]); - } - - @Override - @NotNull - @SuppressWarnings("unchecked") - public T[] toArray(@NotNull final T[] a) { - if (a == null) { - throw new IllegalArgumentException(); - } else if (!a.getClass().getComponentType().isAssignableFrom(Kmer.class)) { - throw new IllegalArgumentException(); - } else { - T[] result; - if (a.length < size) { - result = (T[]) Array.newInstance(a.getClass().getComponentType(), size); - } else { - result = a; - } - for (int i = 0; i < size; i++) { - result[i] = (T) new Kmer(sequence,start + i,kmerSize); - } - return result; - } - } - - @Override - public boolean add(final Kmer kmer) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean remove(final Object o) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean containsAll(final Collection c) { - for (final Object o : c) - if (!contains(o)) - return false; - return true; - } - - @Override 
- public boolean addAll(final Collection c) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean addAll(final int index, @NotNull final Collection c) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean removeAll(@NotNull final Collection c) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean retainAll(@NotNull final Collection c) { - throw new UnsupportedOperationException(); - } - - @Override - public void clear() { - throw new UnsupportedOperationException(); - } - - @Override - public Kmer get(final int index) { - if (index < 0 || index >= size) { - throw new IllegalArgumentException(); - } - return new Kmer(sequence,start + index,kmerSize); - } - - @Override - public Kmer set(final int index, final Kmer element) { - throw new UnsupportedOperationException(); - } - - @Override - public void add(final int index, final Kmer element) { - throw new UnsupportedOperationException(); - } - - @Override - public Kmer remove(final int index) { - throw new UnsupportedOperationException(); - } - - @Override - public int indexOf(final Object o) { - if (o instanceof Kmer) { - final Kmer k = (Kmer) o; - if (k.length != kmerSize) { - return -1; - } - for (int i = 0; i < size; i++) { - int j; - for (j = 0; j < kmerSize; j++) { - if (sequence[start + i + j] != k.bases[k.start + j]) { - break; - } - } - if (j == kmerSize) { - return i; - } - } - return -1; - } else { - return -1; - } - } - - @Override - public int lastIndexOf(final Object o) { - if (o instanceof Kmer) { - final Kmer k = (Kmer) o; - if (k.length != kmerSize) { - return -1; - } - for (int i = size - 1; i >= 0; i--) { - int j; - for (j = kmerSize - 1; j >= 0; j--) { - if (sequence[start + i + j] != k.bases[k.start + j]) { - break; - } - } - if (j == 0) { - return i; - } - } - return -1; - } else { - return -1; - } - } - - @Override - @NotNull - public ListIterator listIterator() { - return new MyListIterator(0); - } - - @Override 
- @NotNull - public ListIterator listIterator(final int index) { - return new MyListIterator(index); - } - - @Override - @NotNull - public List subList(final int fromIndex, final int toIndex) { - return subsequence(fromIndex,toIndex); - } - - /** - * Returns the byte array representation of the kmer sequence. - * @return never {@code null}. - */ - @NotNull - public byte[] getBytes() { - if (start == 0 && rawLength == sequence.length) - return sequence; - else - return Arrays.copyOfRange(sequence, start, rawLength + start); - } - - /** - * Internal class that implements the {@link Kmer} more efficiently - * making reference to the sequence's own byte array. - */ - protected class MyKmer extends Kmer { - - /** - * Create a new instance give the offset in the byte array. - * @param start the start base offset for the kmer. - */ - public MyKmer(final int start) { - super(sequence,start,kmerSize); - } - } - - /** - * Iterator implementation of Kmer elements. - */ - private class MyListIterator implements ListIterator { - - private int i = 0; - - /** - * Creates a iterator at certain offset in the sequence. - * @param idx the start position or kmer offset. 
- */ - private MyListIterator(final int idx) { - i = idx; - } - - @Override - public boolean hasNext() { - return i < size; - } - - @Override - public Kmer next() { - return new Kmer(sequence,start + i++,kmerSize); - } - - @Override - public boolean hasPrevious() { - return i > 0; - } - - @Override - public Kmer previous() { - return new Kmer(sequence,start + --i,kmerSize); - } - - @Override - public int nextIndex() { - return i; - } - - @Override - public int previousIndex() { - return i - 1; - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - - @Override - public void set(final Kmer kmer) { - throw new UnsupportedOperationException(); - } - - @Override - public void add(final Kmer kmer) { - throw new UnsupportedOperationException(); - } - - } - -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java deleted file mode 100644 index d0e28d878..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ /dev/null @@ -1,459 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.io.File; -import java.io.PrintStream; -import java.util.*; - -/** - * Abstract base class for all HaplotypeCaller assemblers - * - * User: ebanks - * Date: Mar 14, 2011 - */ -public abstract class LocalAssemblyEngine { - private final static Logger logger = Logger.getLogger(LocalAssemblyEngine.class); - - /** - * If false, we will only write out a region around the reference source - */ - private final static boolean PRINT_FULL_GRAPH_FOR_DEBUGGING = true; - public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 10; - private static final int 
MIN_HAPLOTYPE_REFERENCE_LENGTH = 30; - - protected final int numBestHaplotypesPerGraph; - - protected boolean debug = false; - protected boolean allowCyclesInKmerGraphToGeneratePaths = false; - protected boolean debugGraphTransformations = false; - protected boolean recoverDanglingTails = true; - - protected byte minBaseQualityToUseInAssembly = DEFAULT_MIN_BASE_QUALITY_TO_USE; - protected int pruneFactor = 2; - protected boolean errorCorrectKmers = false; - - private PrintStream graphWriter = null; - - /** - * Create a new LocalAssemblyEngine with all default parameters, ready for use - * @param numBestHaplotypesPerGraph the number of haplotypes to generate for each assembled graph - */ - protected LocalAssemblyEngine(final int numBestHaplotypesPerGraph) { - if ( numBestHaplotypesPerGraph < 1 ) throw new IllegalArgumentException("numBestHaplotypesPerGraph should be >= 1 but got " + numBestHaplotypesPerGraph); - this.numBestHaplotypesPerGraph = numBestHaplotypesPerGraph; - } - - /** - * Main subclass function: given reads and a reference haplotype give us graphs to use for constructing - * non-reference haplotypes. - * - * @param reads the reads we're going to assemble - * @param refHaplotype the reference haplotype - * @return a non-null list of reads - */ - protected abstract List assemble(List reads, Haplotype refHaplotype, List activeAlleleHaplotypes); - - protected List assemble(List reads, Haplotype refHaplotype) { - return assemble(reads, refHaplotype, Collections.emptyList()); - } - - /** - * Main entry point into the assembly engine. 
Build a set of deBruijn graphs out of the provided reference sequence and list of reads - * @param activeRegion ActiveRegion object holding the reads which are to be used during assembly - * @param refHaplotype reference haplotype object - * @param fullReferenceWithPadding byte array holding the reference sequence with padding - * @param refLoc GenomeLoc object corresponding to the reference sequence with padding - * @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode - * @param readErrorCorrector a ReadErrorCorrector object, if read are to be corrected before assembly. Can be null if no error corrector is to be used. - * @return the resulting assembly-result-set - */ - public AssemblyResultSet runLocalAssembly(final ActiveRegion activeRegion, - final Haplotype refHaplotype, - final byte[] fullReferenceWithPadding, - final GenomeLoc refLoc, - final List activeAllelesToGenotype, - final ReadErrorCorrector readErrorCorrector) { - if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); } - if( activeRegion.getExtendedLoc() == null ) { throw new IllegalArgumentException("Active region must have an extended location."); } - if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); } - if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); } - if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); } - - // create the list of artificial haplotypes that should be added to the graph for GGA mode - final List activeAlleleHaplotypes = createActiveAlleleHaplotypes(refHaplotype, activeAllelesToGenotype, activeRegion.getExtendedLoc()); - - // error-correct reads before clipping low-quality tails: some low quality bases might be good and we want to recover them - final List correctedReads; - 
if (readErrorCorrector != null) { - // now correct all reads in active region after filtering/downsampling - // Note that original reads in active region are NOT modified by default, since they will be used later for GL computation, - // and we only want the read-error corrected reads for graph building. - readErrorCorrector.addReadsToKmers(activeRegion.getReads()); - correctedReads = new ArrayList<>(readErrorCorrector.correctReads(activeRegion.getReads())); - } else { - correctedReads = activeRegion.getReads(); - } - - final List nonRefGraphs = new LinkedList<>(); - final AssemblyResultSet resultSet = new AssemblyResultSet(); - resultSet.setRegionForGenotyping(activeRegion); - resultSet.setFullReferenceWithPadding(fullReferenceWithPadding); - resultSet.setPaddedReferenceLoc(refLoc); - final GenomeLoc activeRegionExtendedLocation = activeRegion.getExtendedLoc(); - refHaplotype.setGenomeLocation(activeRegionExtendedLocation); - resultSet.add(refHaplotype); - final Map assemblyResultByGraph = new HashMap<>(); - // create the graphs by calling our subclass assemble method - for ( final AssemblyResult result : assemble(correctedReads, refHaplotype, activeAlleleHaplotypes) ) { - if ( result.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION ) { - // do some QC on the graph - sanityCheckGraph(result.getGraph(), refHaplotype); - // add it to graphs with meaningful non-reference features - assemblyResultByGraph.put(result.getGraph(),result); - nonRefGraphs.add(result.getGraph()); - } - - } - - findBestPaths (nonRefGraphs, refHaplotype, refLoc, activeRegionExtendedLocation, assemblyResultByGraph, resultSet); - - // print the graphs if the appropriate debug option has been turned on - if ( graphWriter != null ) { printGraphs(nonRefGraphs); } - - return resultSet; - } - - /** - * Create the list of artificial GGA-mode haplotypes by injecting each of the provided alternate alleles into the reference haplotype - * @param refHaplotype the reference haplotype - * 
@param activeAllelesToGenotype the list of alternate alleles in VariantContexts - * @param activeRegionWindow the window containing the reference haplotype - * @return a non-null list of haplotypes - */ - private List createActiveAlleleHaplotypes(final Haplotype refHaplotype, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow) { - final Set returnHaplotypes = new LinkedHashSet<>(); - final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); - - for( final VariantContext compVC : activeAllelesToGenotype ) { - for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()); - if( insertedRefHaplotype != null ) { // can be null if the requested allele can't be inserted into the haplotype - returnHaplotypes.add(insertedRefHaplotype); - } - } - } - - return new ArrayList<>(returnHaplotypes); - } - - - @Ensures({"result.contains(refHaplotype)"}) - protected List findBestPaths(final List graphs, final Haplotype refHaplotype, final GenomeLoc refLoc, final GenomeLoc activeRegionWindow, - final Map assemblyResultByGraph, final AssemblyResultSet assemblyResultSet) { - // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes - final Set returnHaplotypes = new LinkedHashSet<>(); - returnHaplotypes.add( refHaplotype ); - - final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); - - for( final SeqGraph graph : graphs ) { - final SeqVertex source = graph.getReferenceSourceVertex(); - final SeqVertex sink = graph.getReferenceSinkVertex(); - if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph); - - final KBestPaths pathFinder = new 
KBestPaths<>(allowCyclesInKmerGraphToGeneratePaths); - for ( final Path path : pathFinder.getKBestPaths(graph, numBestHaplotypesPerGraph, source, sink) ) { - Haplotype h = new Haplotype( path.getBases() ); - if( !returnHaplotypes.contains(h) ) { - final Cigar cigar = path.calculateCigar(refHaplotype.getBases()); - - if ( cigar == null ) { - // couldn't produce a meaningful alignment of haplotype to reference, fail quietly - continue; - } else if( cigar.isEmpty() ) { - throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() + - " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength()); - } else if ( pathIsTooDivergentFromReference(cigar) || cigar.getReferenceLength() < MIN_HAPLOTYPE_REFERENCE_LENGTH ) { - // N cigar elements means that a bubble was too divergent from the reference so skip over this path - continue; - } else if( cigar.getReferenceLength() != refHaplotype.getCigar().getReferenceLength() ) { // SW failure - throw new IllegalStateException("Smith-Waterman alignment failure. 
Cigar = " + cigar + " with reference length " - + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength() - + " ref = " + refHaplotype + " path " + new String(path.getBases())); - } - - h.setCigar(cigar); - h.setAlignmentStartHapwrtRef(activeRegionStart); - h.setScore(path.getScore()); - h.setGenomeLocation(activeRegionWindow); - returnHaplotypes.add(h); - assemblyResultSet.add(h, assemblyResultByGraph.get(graph)); - - if ( debug ) - logger.info("Adding haplotype " + h.getCigar() + " from graph with kmer " + graph.getKmerSize()); - } - } - } - - - if ( returnHaplotypes.size() < returnHaplotypes.size() ) - logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc); - - if( debug ) { - if( returnHaplotypes.size() > 1 ) { - logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); - } else { - logger.info("Found only the reference haplotype in the assembly graph."); - } - for( final Haplotype h : returnHaplotypes ) { - logger.info( h.toString() ); - logger.info( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() + " ref " + h.isReference()); - } - } - - return new ArrayList<>(returnHaplotypes); - - } - /** - * We use CigarOperator.N as the signal that an incomplete or too divergent bubble was found during bubble traversal - * @param c the cigar to test - * @return true if we should skip over this path - */ - @Requires("c != null") - private boolean pathIsTooDivergentFromReference( final Cigar c ) { - for( final CigarElement ce : c.getCigarElements() ) { - if( ce.getOperator().equals(CigarOperator.N) ) { - return true; - } - } - return false; - } - - /** - * Print graph to file if debugGraphTransformations is enabled - * @param graph the graph to 
print - * @param file the destination file - */ - protected void printDebugGraphTransform(final BaseGraph graph, final File file) { - if ( debugGraphTransformations ) { - if ( PRINT_FULL_GRAPH_FOR_DEBUGGING ) - graph.printGraph(file, pruneFactor); - else - graph.subsetToRefSource().printGraph(file, pruneFactor); - } - } - - protected AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph) { - printDebugGraphTransform(seqGraph, new File("sequenceGraph.1.dot")); - - // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive - seqGraph.zipLinearChains(); - printDebugGraphTransform(seqGraph, new File("sequenceGraph.2.zipped.dot")); - - // now go through and prune the graph, removing vertices no longer connected to the reference chain - seqGraph.removeSingletonOrphanVertices(); - seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); - - printDebugGraphTransform(seqGraph, new File("sequenceGraph.3.pruned.dot")); - seqGraph.simplifyGraph(); - printDebugGraphTransform(seqGraph, new File("sequenceGraph.4.merged.dot")); - - // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can - // happen in cases where for example the reference somehow manages to acquire a cycle, or - // where the entire assembly collapses back into the reference sequence. - if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null ) - return new AssemblyResult(AssemblyResult.Status.JUST_ASSEMBLED_REFERENCE, seqGraph); - - seqGraph.removePathsNotConnectedToRef(); - seqGraph.simplifyGraph(); - if ( seqGraph.vertexSet().size() == 1 ) { - // we've perfectly assembled into a single reference haplotype, add a empty seq vertex to stop - // the code from blowing up. 
- // TODO -- ref properties should really be on the vertices, not the graph itself - final SeqVertex complete = seqGraph.vertexSet().iterator().next(); - final SeqVertex dummy = new SeqVertex(""); - seqGraph.addVertex(dummy); - seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0)); - } - printDebugGraphTransform(seqGraph, new File("sequenceGraph.5.final.dot")); - return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, seqGraph); - } - - /** - * Perform general QC on the graph to make sure something hasn't gone wrong during assembly - * @param graph the graph to check - * @param refHaplotype the reference haplotype - */ - private void sanityCheckGraph(final BaseGraph graph, final Haplotype refHaplotype) { - sanityCheckReferenceGraph(graph, refHaplotype); - } - - /** - * Make sure the reference sequence is properly represented in the provided graph - * - * @param graph the graph to check - * @param refHaplotype the reference haplotype - */ - private void sanityCheckReferenceGraph(final BaseGraph graph, final Haplotype refHaplotype) { - if( graph.getReferenceSourceVertex() == null ) { - throw new IllegalStateException("All reference graphs must have a reference source vertex."); - } - if( graph.getReferenceSinkVertex() == null ) { - throw new IllegalStateException("All reference graphs must have a reference sink vertex."); - } - if( !Arrays.equals(graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true), refHaplotype.getBases()) ) { - throw new IllegalStateException("Mismatch between the reference haplotype and the reference assembly graph path. 
for graph " + graph + - " graph = " + new String(graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true)) + - " haplotype = " + new String(refHaplotype.getBases()) - ); - } - } - - /** - * Print the generated graphs to the graphWriter - * @param graphs a non-null list of graphs to print out - */ - private void printGraphs(final List graphs) { - final int writeFirstGraphWithSizeSmallerThan = 50; - - graphWriter.println("digraph assemblyGraphs {"); - for( final SeqGraph graph : graphs ) { - if ( debugGraphTransformations && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) { - logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize()); - continue; - } - - graph.printGraph(graphWriter, false, pruneFactor); - - if ( debugGraphTransformations ) - break; - } - - graphWriter.println("}"); - } - - // ----------------------------------------------------------------------------------------------- - // - // getter / setter routines for generic assembler properties - // - // ----------------------------------------------------------------------------------------------- - - public int getPruneFactor() { - return pruneFactor; - } - - public void setPruneFactor(int pruneFactor) { - this.pruneFactor = pruneFactor; - } - - public boolean shouldErrorCorrectKmers() { - return errorCorrectKmers; - } - - public void setErrorCorrectKmers(boolean errorCorrectKmers) { - this.errorCorrectKmers = errorCorrectKmers; - } - - public void setGraphWriter(PrintStream graphWriter) { - this.graphWriter = graphWriter; - } - - public byte getMinBaseQualityToUseInAssembly() { - return minBaseQualityToUseInAssembly; - } - - public void setMinBaseQualityToUseInAssembly(byte minBaseQualityToUseInAssembly) { - this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly; - } - - public boolean isDebug() { - return debug; - } - - public void setDebug(boolean debug) { - this.debug = debug; - } - - public boolean 
isAllowCyclesInKmerGraphToGeneratePaths() { - return allowCyclesInKmerGraphToGeneratePaths; - } - - public void setAllowCyclesInKmerGraphToGeneratePaths(boolean allowCyclesInKmerGraphToGeneratePaths) { - this.allowCyclesInKmerGraphToGeneratePaths = allowCyclesInKmerGraphToGeneratePaths; - } - - public boolean isDebugGraphTransformations() { - return debugGraphTransformations; - } - - public void setDebugGraphTransformations(boolean debugGraphTransformations) { - this.debugGraphTransformations = debugGraphTransformations; - } - - public boolean isRecoverDanglingTails() { - return recoverDanglingTails; - } - - public void setRecoverDanglingTails(boolean recoverDanglingTails) { - this.recoverDanglingTails = recoverDanglingTails; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java deleted file mode 100644 index 9e54f7947..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java +++ /dev/null @@ -1,834 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.SAMUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.pairhmm.*; -import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate; -import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFConstants; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.PrintStream; -import java.util.*; - -public class PairHMMLikelihoodCalculationEngine implements 
LikelihoodCalculationEngine { - private final static Logger logger = Logger.getLogger(PairHMMLikelihoodCalculationEngine.class); - - private static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual - - private final byte constantGCP; - private final double log10globalReadMismappingRate; - private final boolean DEBUG; - - private final PairHMM.HMM_IMPLEMENTATION hmmType; - private final boolean noFpga; - - private final ThreadLocal pairHMMThreadLocal = new ThreadLocal() { - @Override - protected PairHMM initialValue() { - switch (hmmType) { - case EXACT: return new Log10PairHMM(true); - case ORIGINAL: return new Log10PairHMM(false); - case LOGLESS_CACHING: - if (noFpga || !CnyPairHMM.isAvailable()) - return new LoglessPairHMM(); - else - return new CnyPairHMM(); - case ARRAY_LOGLESS: - if (noFpga || !CnyPairHMM.isAvailable()) - return new ArrayLoglessPairHMM(); - else - return new CnyPairHMM(); - default: - throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. 
Acceptable options are ORIGINAL, EXACT, CACHING, LOGLESS_CACHING, and ARRAY_LOGLESS."); - } - } - }; -// Attempted to do as below, to avoid calling pairHMMThreadLocal.get() later on, but it resulted in a NullPointerException -// private final PairHMM pairHMM = pairHMMThreadLocal.get(); - - private final static boolean WRITE_LIKELIHOODS_TO_FILE = false; - private final static String LIKELIHOODS_FILENAME = "likelihoods.txt"; - private final PrintStream likelihoodsStream; - - public enum PCR_ERROR_MODEL { - /** no specialized PCR error model will be applied; if base insertion/deletion qualities are present they will be used */ - NONE, - /** a more aggressive model will be applied that sacrifices true positives in order to remove more false positives */ - AGGRESSIVE, - /** a less aggressive model will be applied that tries to maintain a high true positive rate at the expense of allowing more false positives */ - CONSERVATIVE - } - - private final PCR_ERROR_MODEL pcrErrorModel; - - /** - * The expected rate of random sequencing errors for a read originating from its true haplotype. - * - * For example, if this is 0.01, then we'd expect 1 error per 100 bp. - */ - private final static double EXPECTED_ERROR_RATE_PER_BASE = 0.02; - - /** - * Create a new PairHMMLikelihoodCalculationEngine using provided parameters and hmm to do its calculations - * - * @param constantGCP the gap continuation penalty to use with the PairHMM - * @param debug should we emit debugging information during the calculation? - * @param hmmType the type of the HMM to use - * @param log10globalReadMismappingRate the global mismapping probability, in log10(prob) units. A value of - * -3 means that the chance that a read doesn't actually belong at this - * location in the genome is 1 in 1000. The effect of this parameter is - * to cap the maximum likelihood difference between the reference haplotype - * and the best alternative haplotype by -3 log units. 
So if the best - * haplotype is at -10 and this parameter has a value of -3 then even if the - * reference haplotype gets a score of -100 from the pairhmm it will be - * assigned a likelihood of -13. - * @param noFpga disable FPGA acceleration - */ - public PairHMMLikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate, final boolean noFpga, final PCR_ERROR_MODEL pcrErrorModel ) { - this.hmmType = hmmType; - this.constantGCP = constantGCP; - this.DEBUG = debug; - this.log10globalReadMismappingRate = log10globalReadMismappingRate; - this.noFpga = noFpga; - this.pcrErrorModel = pcrErrorModel; - - initializePCRErrorModel(); - - if ( WRITE_LIKELIHOODS_TO_FILE ) { - try { - likelihoodsStream = new PrintStream(new FileOutputStream(new File(LIKELIHOODS_FILENAME))); - } catch ( FileNotFoundException e ) { - throw new RuntimeException(e); - } - } else { - likelihoodsStream = null; - } - } - - public void close() { - if ( likelihoodsStream != null ) likelihoodsStream.close(); - } - - private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){ - if ( WRITE_LIKELIHOODS_TO_FILE ) { - likelihoodsStream.printf("%s %s %s %s %s %s %f%n", - haplotype.getBaseString(), - new String(processedRead.getReadBases() ), - SAMUtils.phredToFastq(processedRead.getBaseQualities() ), - SAMUtils.phredToFastq(processedRead.getBaseInsertionQualities() ), - SAMUtils.phredToFastq(processedRead.getBaseDeletionQualities() ), - SAMUtils.phredToFastq(constantGCP), - log10l); - } - } - - private Map createAlleleMap(List haplotypes){ - final int numHaplotypes = haplotypes.size(); - final Map alleleMap = new LinkedHashMap<>(numHaplotypes); - for ( final Haplotype haplotype : haplotypes ) { - final Allele allele = Allele.create(haplotype, true); - alleleMap.put(allele, haplotype); - } - return alleleMap; - } - - private Map fillGCPArrays(List reads){ 
- final Map GCPArrayMap = new LinkedHashMap<>(); - for (GATKSAMRecord read: reads){ - byte [] GCPArray = new byte[read.getReadBases().length]; - Arrays.fill( GCPArray, constantGCP ); // Is there a way to derive empirical estimates for this from the data? - GCPArrayMap.put(read, GCPArray); - } - return GCPArrayMap; - } - - private void capMinimumReadQualities(GATKSAMRecord read, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals) { - for( int kkk = 0; kkk < readQuals.length; kkk++ ) { - readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG - readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); - readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] ); - readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] ); - } - } - - /** - * Pre-processing of the reads to be evaluated at the current location from the current sample. - * We apply the PCR Error Model, and cap the minimum base, insertion, and deletion qualities of each read. - * Modified copies of reads are packed into a new list, while original reads are retained for downstream use - * - * @param reads The original list of unmodified reads - * @return processedReads. 
A new list of reads, in the same order, whose qualities have been altered by PCR error model and minimal quality thresholding - */ - private List modifyReadQualities(final List reads) { - List processedReads = new LinkedList<>(); - for ( GATKSAMRecord read : reads ) { - - final byte[] readBases = read.getReadBases(); - - // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read - final byte[] readQuals = read.getBaseQualities().clone(); - final byte[] readInsQuals = read.getBaseInsertionQualities().clone(); - final byte[] readDelQuals = read.getBaseDeletionQualities().clone(); - - applyPCRErrorModel(readBases, readInsQuals, readDelQuals); - capMinimumReadQualities(read, readQuals, readInsQuals, readDelQuals); - - // Create a new copy of the read and sets its base qualities to the modified versions. - // Pack this into a new list for return - final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, readInsQuals, readDelQuals); - processedReads.add(processedRead); - } - return processedReads; - } - - /** - * Post-processing of the read/allele likelihoods. - * - * We send quality-capped reads to the pairHMM for evaluation, and it returns a map containing these capped reads. - * We wish to return a map containing the original, unmodified reads. - * - * At the same time, we want to effectively set a lower cap on the reference score, based on the global mis-mapping rate. - * This protects us from the case where the assembly has produced haplotypes - * that are very divergent from reference, but are supported by only one read. In effect - * we capping how badly scoring the reference can be for any read by the chance that the read - * itself just doesn't belong here - * - * @param perReadAlleleLikelihoodMap the original map returned by the PairHMM. 
Contains the processed reads, the haplotype Alleles, and their log10ls - * @param reads Our original, unmodified reads - * @param processedReads Reads whose minimum base,insertion,deletion qualities have been capped; these were actually used to derive log10ls - * @param alleleHaplotypeMap The map associating the Allele and Haplotype versions of each haplotype - * - * @return processedReadAlleleLikelihoodMap; a new PRALM containing the original reads, and their haplotype log10ls including capped reference log10ls - */ - private PerReadAlleleLikelihoodMap capReferenceHaplotypeLikelihoods(PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, List reads, List processedReads, Map alleleHaplotypeMap){ - - // a new read/allele map, to contain the uncapped reads, haplotypes, and potentially the capped reference log10ls - final PerReadAlleleLikelihoodMap processedReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); - - Allele refAllele = null; - final int numReads = reads.size(); - for (int readIndex = 0; readIndex < numReads; readIndex++) { - - // Get the original and quality-modified read from their respective lists - // Note that this requires both lists to have reads in the same order - final GATKSAMRecord originalRead = reads.get(readIndex); - final GATKSAMRecord processedRead = processedReads.get(readIndex); - - // keep track of the reference likelihood and the best non-ref likelihood - double refLog10l = Double.NEGATIVE_INFINITY; - double bestNonReflog10L = Double.NEGATIVE_INFINITY; - - for ( Allele allele : alleleHaplotypeMap.keySet() ) { - final double log10l = perReadAlleleLikelihoodMap.getLikelihoodAssociatedWithReadAndAllele(processedRead, allele); - final Haplotype haplotype = alleleHaplotypeMap.get(allele); - if ( haplotype.isNonReference() ) - bestNonReflog10L = Math.max(bestNonReflog10L, log10l); - else { - refAllele = allele; - refLog10l = log10l; - } - writeDebugLikelihoods(processedRead, haplotype, log10l); - - // add the ORIGINAL (non-capped) 
read to the final map, along with the current haplotype and associated log10l - processedReadAlleleLikelihoodMap.add(originalRead, allele, log10l); - } - - // ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global - // mismapping rate. This protects us from the case where the assembly has produced haplotypes - // that are very divergent from reference, but are supported by only one read. In effect - // we capping how badly scoring the reference can be for any read by the chance that the read - // itself just doesn't belong here - final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate; - if ( refLog10l < (worstRefLog10Allowed) ) { - processedReadAlleleLikelihoodMap.add(originalRead, refAllele, worstRefLog10Allowed); - } - } - return processedReadAlleleLikelihoodMap; - } - - /** - * Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate - * - * After calling this routine the PairHMM will be configured to best evaluate all reads in the samples - * against the set of haplotypes - * - * @param haplotypes a non-null list of haplotypes - * @param perSampleReadList a mapping from sample -> reads - */ - private void initializePairHMM(final List haplotypes, final Map> perSampleReadList) { - int X_METRIC_LENGTH = 0; - for( final Map.Entry> sample : perSampleReadList.entrySet() ) { - for( final GATKSAMRecord read : sample.getValue() ) { - final int readLength = read.getReadLength(); - if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; } - } - } - int Y_METRIC_LENGTH = 0; - for( final Haplotype h : haplotypes ) { - final int haplotypeLength = h.getBases().length; - if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; } - } - - // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases - pairHMMThreadLocal.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); - } - - - @Override - 
public Map computeReadLikelihoods( final AssemblyResultSet assemblyResultSet, final Map> perSampleReadList ) { - - final List haplotypes = assemblyResultSet.getHaplotypeList(); - // configure the HMM - initializePairHMM(haplotypes, perSampleReadList); - - // Add likelihoods for each sample's reads to our stratifiedReadMap - final Map stratifiedReadMap = new LinkedHashMap<>(); - for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { - // evaluate the likelihood of the reads given those haplotypes - final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); - - map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); - stratifiedReadMap.put(sampleEntry.getKey(), map); - } - - return stratifiedReadMap; - } - - - public Map computeReadLikelihoods( final List haplotypes, final Map> perSampleReadList ) { - - // Add likelihoods for each sample's reads to our stratifiedReadMap - final Map stratifiedReadMap = new LinkedHashMap<>(); - for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { - // evaluate the likelihood of the reads given those haplotypes - final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); - - map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); - stratifiedReadMap.put(sampleEntry.getKey(), map); - } - - return stratifiedReadMap; - } - - private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List haplotypes, final List reads) { - - // Modify the read qualities by applying the PCR error model and capping the minimum base,insertion,deletion qualities - List processedReads = modifyReadQualities(reads); - - // Get alleles corresponding to our haplotypees - Map alleleHaplotypeMap = createAlleleMap(haplotypes); - - // Get an array containing the constantGCP for each read in our modified read list - Map GCPArrayMap = fillGCPArrays(processedReads); - - // Run the PairHMM to calculate the log10 likelihood of each (processed) reads' 
arising from each haplotype - PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = pairHMMThreadLocal.get().computeLikelihoods(processedReads, alleleHaplotypeMap, GCPArrayMap); - - // Generate a new map containing the original, unmodified reads, and with minimal reference haplotype log10ls determined from the global mis-mapping rate - - return capReferenceHaplotypeLikelihoods(perReadAlleleLikelihoodMap, reads, processedReads, alleleHaplotypeMap); - } - - @Requires({"alleleOrdering.size() > 0"}) - @Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, - final Map stratifiedReadMap, - final List alleleOrdering, - final boolean normalize ) { - return computeDiploidHaplotypeLikelihoods(Collections.singleton(sample), stratifiedReadMap, alleleOrdering, normalize); - } - - @Requires({"alleleOrdering.size() > 0"}) - @Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, - final Map stratifiedReadMap, - final List alleleOrdering, - final boolean normalize) { - - final int numHaplotypes = alleleOrdering.size(); - final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); - } - - // compute the diploid haplotype likelihoods - for( int iii = 0; iii < numHaplotypes; iii++ ) { - final Allele iii_allele = alleleOrdering.get(iii); - for( int jjj = 0; jjj <= iii; jjj++ ) { - final Allele jjj_allele = alleleOrdering.get(jjj); - double haplotypeLikelihood = 0.0; - for( final String sample : samples ) { - for( final Map.Entry> entry : stratifiedReadMap.get(sample).getLikelihoodReadMap().entrySet() ) { - // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) - // First term is approximated 
by Jacobian log with table lookup. - haplotypeLikelihood += ReadUtils.getMeanRepresentativeReadCount( entry.getKey() ) * - ( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + MathUtils.LOG_ONE_HALF ); - } - } - haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood; - } - } - - // normalize the diploid likelihoods matrix - return normalize ? normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ) : haplotypeLikelihoodMatrix; - } - - @Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"}) - @Ensures({"result.length == result[0].length", "result.length == likelihoodMatrix.length"}) - protected static double[][] normalizeDiploidLikelihoodMatrixFromLog10( final double[][] likelihoodMatrix ) { - final int numHaplotypes = likelihoodMatrix.length; - double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2]; - int index = 0; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ){ - genotypeLikelihoods[index++] = likelihoodMatrix[iii][jjj]; - } - } - genotypeLikelihoods = MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true); - index = 0; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ){ - likelihoodMatrix[iii][jjj] = genotypeLikelihoods[index++]; - } - } - return likelihoodMatrix; - } - - // -------------------------------------------------------------------------------- - // - // System to compute the best N haplotypes for genotyping - // - // -------------------------------------------------------------------------------- -// -// /** -// * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele -// * @param map an annoying map object that moves us between the allele and haplotype representation -// * @param haplotypeAsAllele the allele version of the haplotype -// * @return the haplotype version, with its score incremented by 1 if 
its non-reference -// */ -// private Haplotype updateSelectHaplotype(final Map map, final Allele haplotypeAsAllele) { -// final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic -// if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value -// return h; -// } -// -// /** -// * Take the best N haplotypes and return them as a list -// * -// * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample -// * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing -// * order of score (so higher score haplotypes are preferred). The N we take is determined by -// * -// * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation) -// * -// * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is -// * bounded by maxNumHaplotypesInPopulation as that number can grow without bound -// * -// * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1 -// * @param nSamples the number of samples used to select the haplotypes -// * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples -// * @return a list of N or fewer haplotypes, with the reference haplotype first -// */ -// private List selectBestHaplotypesAccordingToScore(final Set selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) { -// final List selectedHaplotypesList = new ArrayList<>(selectedHaplotypes); -// Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator()); -// final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1; -// final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation); -// final List bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? 
selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep); -// if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list"); -// return bestHaplotypes; -// } -// -// /** -// * Select the best haplotypes for genotyping the samples in stratifiedReadMap -// * -// * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely -// * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for -// * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get -// * one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation -// * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the -// * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference. -// * -// * @param haplotypes a list of all haplotypes we're considering -// * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype -// * @param maxNumHaplotypesInPopulation the max. 
number of haplotypes we can select from haplotypes -// * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation -// */ -// public List selectBestHaplotypesFromEachSample(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { -// if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes); -// -// if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes -// -// // all of the haplotypes that at least one sample called as one of the most likely -// final Set selectedHaplotypes = new HashSet<>(); -// selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected -// -// // our annoying map from allele -> haplotype -// final Map allele2Haplotype = new HashMap<>(); -// for ( final Haplotype h : haplotypes ) { -// h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes -// allele2Haplotype.put(Allele.create(h, h.isReference()), h); -// } -// -// // for each sample, compute the most likely pair of haplotypes -// for ( final Map.Entry entry : stratifiedReadMap.entrySet() ) { -// // get the two most likely haplotypes under a diploid model for this sample -// final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles(); -// -// if ( mla != null ) { // there was something to evaluate in this sample -// // note that there must be at least 2 haplotypes -// final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele()); -// final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele()); -// -//// if ( DEBUG ) { -//// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey()); -//// } -// -// // add these two haplotypes to the set of 
haplotypes that have been selected -// selectedHaplotypes.add(best); -// selectedHaplotypes.add(second); -// -// // we've already selected all of our haplotypes, and we don't need to prune them down -// if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation ) -// break; -// } -// } -// -// // take the best N haplotypes forward, in order of the number of samples that choose them -// final int nSamples = stratifiedReadMap.size(); -// final List bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation); -// -// if ( DEBUG ) { -// logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples."); -// for ( final Haplotype h : bestHaplotypes ) { -// logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype")); -// } -// } -// return bestHaplotypes; -// } -// -// /** -// * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found -// * @param haplotypes non-null list of haplotypes -// * @return the reference haplotype -// */ -// private static Haplotype findReferenceHaplotype( final List haplotypes ) { -// for( final Haplotype h : haplotypes ) { -// if( h.isReference() ) return h; -// } -// throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" 
); -// } - - // -------------------------------------------------------------------------------- - // - // Experimental attempts at PCR error rate modeling - // - // -------------------------------------------------------------------------------- - - protected static final int MAX_STR_UNIT_LENGTH = 8; - protected static final int MAX_REPEAT_LENGTH = 20; - protected static final int MIN_ADJUSTED_QSCORE = 10; - protected static final double INITIAL_QSCORE = 40.0; - - private byte[] pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH * MAX_STR_UNIT_LENGTH + 1]; - private final RepeatCovariate repeatCovariate = new RepeatLengthCovariate(); - - private void initializePCRErrorModel() { - if ( pcrErrorModel == PCR_ERROR_MODEL.NONE ) - return; - - repeatCovariate.initialize(MAX_STR_UNIT_LENGTH, MAX_REPEAT_LENGTH); - - pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH + 1]; - - final double rateFactor = pcrErrorModel == PCR_ERROR_MODEL.AGGRESSIVE ? 2.0 : 3.0; - - for( int iii = 0; iii <= MAX_REPEAT_LENGTH; iii++ ) - pcrIndelErrorModelCache[iii] = getErrorModelAdjustedQual(iii, rateFactor); - } - - protected static byte getErrorModelAdjustedQual(final int repeatLength, final double rateFactor) { - return (byte) Math.max(MIN_ADJUSTED_QSCORE, MathUtils.fastRound( INITIAL_QSCORE - Math.exp(((double) repeatLength) / (rateFactor * Math.PI)) + 1.0 )); - } - - protected void applyPCRErrorModel( final byte[] readBases, final byte[] readInsQuals, final byte[] readDelQuals ) { - if ( pcrErrorModel == PCR_ERROR_MODEL.NONE ) - return; - - for ( int iii = 1; iii < readBases.length; iii++ ) { - final int repeatLength = repeatCovariate.findTandemRepeatUnits(readBases, iii-1).getSecond(); - readInsQuals[iii-1] = (byte) Math.min(0xff & readInsQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]); - readDelQuals[iii-1] = (byte) Math.min(0xff & readDelQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]); - } - } - - // 
-------------------------------------------------------------------------------- - // - // Posterior GL calculations - // - // -------------------------------------------------------------------------------- - - public static VariantContext calculatePosteriorGLs(final VariantContext vc1, - final Collection resources, - final int numRefSamplesFromMissingResources, - final double globalFrequencyPriorDirichlet, - final boolean useInputSamples, - final boolean useEM, - final boolean useAC) { - if ( useEM ) - throw new IllegalArgumentException("EM loop for posterior GLs not yet implemented"); - - final Map totalAlleleCounts = new HashMap<>(); - for ( final VariantContext resource : resources ) { - addAlleleCounts(totalAlleleCounts,resource,useAC); - } - - if ( useInputSamples ) { - addAlleleCounts(totalAlleleCounts,vc1,useAC); - } - - totalAlleleCounts.put(vc1.getReference(),totalAlleleCounts.get(vc1.getReference())+numRefSamplesFromMissingResources); - - // now extract the counts of the alleles present within vc1, and in order - final double[] alleleCounts = new double[vc1.getNAlleles()]; - int alleleIndex = 0; - for ( final Allele allele : vc1.getAlleles() ) { - - alleleCounts[alleleIndex++] = globalFrequencyPriorDirichlet + ( totalAlleleCounts.containsKey(allele) ? - totalAlleleCounts.get(allele) : 0 ); - } - - final List likelihoods = new ArrayList<>(vc1.getNSamples()); - for ( final Genotype genotype : vc1.getGenotypes() ) { - likelihoods.add(genotype.hasLikelihoods() ? 
genotype.getLikelihoods().getAsVector() : null ); - } - - final List posteriors = calculatePosteriorGLs(likelihoods,alleleCounts,vc1.getMaxPloidy(2)); - - final GenotypesContext newContext = GenotypesContext.create(); - for ( int genoIdx = 0; genoIdx < vc1.getNSamples(); genoIdx ++ ) { - final GenotypeBuilder builder = new GenotypeBuilder(vc1.getGenotype(genoIdx)); - if ( posteriors.get(genoIdx) != null ) { - GATKVariantContextUtils.updateGenotypeAfterSubsetting(vc1.getAlleles(), builder, - GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, posteriors.get(genoIdx), vc1.getAlleles()); - builder.attribute(VCFConstants.GENOTYPE_POSTERIORS_KEY, - Utils.listFromPrimitives(GenotypeLikelihoods.fromLog10Likelihoods(posteriors.get(genoIdx)).getAsPLs())); - - } - newContext.add(builder.make()); - } - - final List priors = Utils.listFromPrimitives( - GenotypeLikelihoods.fromLog10Likelihoods(getDirichletPrior(alleleCounts, vc1.getMaxPloidy(2))).getAsPLs()); - - return new VariantContextBuilder(vc1).genotypes(newContext).attribute("PG",priors).make(); - } - - /** - * Given genotype likelihoods and known allele counts, calculate the posterior likelihoods - * over the genotype states - * @param genotypeLikelihoods - the genotype likelihoods for the individual - * @param knownAlleleCountsByAllele - the known allele counts in the population. 
For AC=2 AN=12 site, this is {10,2} - * @param ploidy - the ploidy to assume - * @return - the posterior genotype likelihoods - */ - protected static List calculatePosteriorGLs(final List genotypeLikelihoods, - final double[] knownAlleleCountsByAllele, - final int ploidy) { - if ( ploidy != 2 ) { - throw new IllegalStateException("Genotype posteriors not yet implemented for ploidy != 2"); - } - - final double[] genotypePriorByAllele = getDirichletPrior(knownAlleleCountsByAllele,ploidy); - final List posteriors = new ArrayList<>(genotypeLikelihoods.size()); - for ( final double[] likelihoods : genotypeLikelihoods ) { - double[] posteriorLikelihoods = null; - - if ( likelihoods != null ) { - if ( likelihoods.length != genotypePriorByAllele.length ) { - throw new IllegalStateException(String.format("Likelihoods not of correct size: expected %d, observed %d", - knownAlleleCountsByAllele.length*(knownAlleleCountsByAllele.length+1)/2,likelihoods.length)); - } - - posteriorLikelihoods = new double[genotypePriorByAllele.length]; - for ( int genoIdx = 0; genoIdx < likelihoods.length; genoIdx ++ ) { - posteriorLikelihoods[genoIdx] = likelihoods[genoIdx] + genotypePriorByAllele[genoIdx]; - } - - posteriorLikelihoods = MathUtils.toLog10(MathUtils.normalizeFromLog10(posteriorLikelihoods)); - - } - - posteriors.add(posteriorLikelihoods); - } - - return posteriors; - } - - // convenience function for a single genotypelikelihoods array. Just wraps. - protected static double[] calculatePosteriorGLs(final double[] genotypeLikelihoods, - final double[] knownAlleleCountsByAllele, - final int ploidy) { - return calculatePosteriorGLs(Arrays.asList(genotypeLikelihoods),knownAlleleCountsByAllele,ploidy).get(0); - } - - - /** - * Given known allele counts (whether external, from the sample, or both), calculate the prior distribution - * over genotype states. 
This assumes - * 1) Random sampling of alleles (known counts are unbiased, and frequency estimate is Dirichlet) - * 2) Genotype states are independent (Hardy-Weinberg) - * These assumptions give rise to a Dirichlet-Multinomial distribution of genotype states as a prior - * (the "number of trials" for the multinomial is simply the ploidy) - * @param knownCountsByAllele - the known counts per allele. For an AC=2, AN=12 site this is {10,2} - * @param ploidy - the number of chromosomes in the sample. For now restricted to 2. - * @return - the Dirichlet-Multinomial distribution over genotype states - */ - protected static double[] getDirichletPrior(final double[] knownCountsByAllele, final int ploidy) { - if ( ploidy != 2 ) { - throw new IllegalStateException("Genotype priors not yet implemented for ploidy != 2"); - } - - // multi-allelic format is - // AA AB BB AC BC CC AD BD CD DD ... - final double sumOfKnownCounts = MathUtils.sum(knownCountsByAllele); - final double[] priors = new double[knownCountsByAllele.length*(knownCountsByAllele.length+1)/2]; - int priorIndex = 0; - for ( int allele2 = 0; allele2 < knownCountsByAllele.length; allele2++ ) { - for ( int allele1 = 0; allele1 <= allele2; allele1++) { - final int[] counts = new int[knownCountsByAllele.length]; - counts[allele1] += 1; - counts[allele2] += 1; - priors[priorIndex++] = MathUtils.dirichletMultinomial(knownCountsByAllele,sumOfKnownCounts,counts,ploidy); - } - } - - return priors; - } - - private static void addAlleleCounts(final Map counts, final VariantContext context, final boolean useAC) { - final int[] ac; - if ( context.hasAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY) && ! 
useAC ) { - ac = extractInts(context.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY)); - } else if ( context.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { - ac = extractInts(context.getAttribute(VCFConstants.ALLELE_COUNT_KEY)); - } else { - ac = new int[context.getAlternateAlleles().size()]; - int idx = 0; - for ( final Allele allele : context.getAlternateAlleles() ) { - ac[idx++] = context.getCalledChrCount(allele); - } - } - - for ( final Allele allele : context.getAlleles() ) { - final int count; - if ( allele.isReference() ) { - if ( context.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) { - count = context.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY,-1) - (int) MathUtils.sum(ac); - } else { - count = context.getCalledChrCount() - (int) MathUtils.sum(ac); - } - } else { - count = ac[context.getAlternateAlleles().indexOf(allele)]; - } - if ( ! counts.containsKey(allele) ) { - counts.put(allele,0); - } - counts.put(allele,count + counts.get(allele)); - } - } - - public static int[] extractInts(final Object integerListContainingVCField) { - List mleList = null; - if ( integerListContainingVCField instanceof List ) { - if ( ((List) integerListContainingVCField).get(0) instanceof String ) { - mleList = new ArrayList<>(((List) integerListContainingVCField).size()); - for ( Object s : ((List)integerListContainingVCField)) { - mleList.add(Integer.parseInt((String) s)); - } - } else { - mleList = (List) integerListContainingVCField; - } - } else if ( integerListContainingVCField instanceof Integer ) { - mleList = Arrays.asList((Integer) integerListContainingVCField); - } else if ( integerListContainingVCField instanceof String ) { - mleList = Arrays.asList(Integer.parseInt((String)integerListContainingVCField)); - } - if ( mleList == null ) - throw new IllegalArgumentException(String.format("VCF does not have properly formatted "+ - VCFConstants.MLE_ALLELE_COUNT_KEY+" or "+VCFConstants.ALLELE_COUNT_KEY)); - - final int[] mle = new int[mleList.size()]; - - if ( ! 
( mleList.get(0) instanceof Integer ) ) { - throw new IllegalStateException("BUG: The AC values should be an Integer, but was "+mleList.get(0).getClass().getCanonicalName()); - } - - for ( int idx = 0; idx < mle.length; idx++) { - mle[idx] = mleList.get(idx); - } - - return mle; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java deleted file mode 100644 index a344eea61..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java +++ /dev/null @@ -1,80 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.Comparator; - -/** - * A pair read-likelihood (cost). - */ -public class ReadCost { - public final GATKSAMRecord read; - - /** - * Holds the cost value. Public for convenience, please use with care. 
- */ - public double cost; - - public ReadCost(final GATKSAMRecord r) { - read = r; - } - - - /** - * Comparator used to sort ReadCosts - */ - public static final Comparator COMPARATOR = new Comparator() { - @Override - public int compare(final ReadCost o1, final ReadCost o2) { - final String s1 = o1.read.getReadName() + (o1.read.getReadPairedFlag() ? (o1.read.getFirstOfPairFlag() ? "/1" : "/2") : ""); - final String s2 = o2.read.getReadName() + (o2.read.getReadPairedFlag() ? (o2.read.getFirstOfPairFlag() ? "/1" : "/2") : ""); - return s1.compareTo(s2); - } - }; - -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrector.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrector.java deleted file mode 100644 index e1471ab33..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrector.java +++ /dev/null @@ -1,526 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.*; - -/** - * Utility class that error-corrects reads. - * Main idea: An error in a read will appear as a bubble in a k-mer (de Bruijn) graph and such bubble will have very low multiplicity. - * Hence, read errors will appear as "sparse" kmers with very little support. - * Historically, the most common approach to error-correct reads before assembly has been to first compute the kmer spectrum of the reads, - * defined as the kmer composition of a set of reads along with the multiplicity of each kmer. - * First-generation correctors like the Euler corrector (Pevzner 2001) mapped low frequency kmers (kmers appearing say below N times) - * into high frequency ones that lied within a certain Hamming or edit distance. - * This is doable, but has some drawbacks: - * - Kmers used for error correction become tied to kmers used for graph building. - * - Hence, large kmers (desireable for graph building because they can resolve repeats better) are a hindrance for error correction, - * because they are seen less often. - * - After error correction, there is no guarantee that a sequence of kmers corresponds to an "actual" read. - * - * An error-corrected set of reads also makes a much smoother graph without the need to resolving so many bubbles. - * - * Idea hence is to correct reads based on their kmer content, but in a context independent from graph building. - * In order to do this, the following steps are taken: - * - The k-mer spectrum of a set of reads in computed. 
However, we are at freedom to choose the most convenient k-mer size (typicially around - * read length /2). - * - We partition the set of observed k-mers into "solid" kmers which have multiplicity > M, and "insolid" ones otherwise (Pevzner 2001). - * - * - Main idea of the algorithm is to try to substitute a sequence of bases in a read by a sequence better supported by kmers. - * - For each "unsolid" kmer observed in reads, we try to find a "solid" kmer within a maximum Hamming distance. - * - If such solid kmer exists, then this unsolid kmer is "correctable", otherwise, uncorrectable. - * - For each read, then: - * -- Walk through read and visit all kmers. - * -- If kmer is solid, continue to next kmer. - * -- If not, and if it's correctable (i.e. there exists a mapping from an unsolid kmer to a solid kmer within a given Hamming distance), - * add the bases and offsets corresponding to differing positions between unsolid and solid kmer to correction list. - * -- At the end, each base in read will have a list of corrections associated with it. We can then choose to correct or not. - * If read has only consistent corrections, then we can correct base to common base in corrections. - * - * TODO: - * todo Q: WHAT QUALITY TO USE?? - * todo how do we deal with mate pairs? 
- * - * - - - */ -public class ReadErrorCorrector { - private final static Logger logger = Logger.getLogger(ReadErrorCorrector.class); - /** - * A map of for each kmer to its num occurrences in addKmers - */ - KMerCounter countsByKMer; - - Map kmerCorrectionMap = new HashMap<>(); - Map> kmerDifferingBases = new HashMap<>(); - private final int kmerLength; - private final boolean debug; - private final boolean trimLowQualityBases; - private final byte minTailQuality; - private final int maxMismatchesToCorrect; - private final byte qualityOfCorrectedBases; - private final int maxObservationsForKmerToBeCorrectable; - private final int maxHomopolymerLengthInRegion; - private final int minObservationsForKmerToBeSolid; - - // default values, for debugging - private final static boolean doInplaceErrorCorrection = false; // currently not used, since we want corrected reads to be used only for assembly - private final static int MAX_MISMATCHES_TO_CORRECT = 2; - private final static byte QUALITY_OF_CORRECTED_BASES = 30; // what's a reasonable value here? 
- private final static int MAX_OBSERVATIONS_FOR_KMER_TO_BE_CORRECTABLE = 1; - private final static boolean TRIM_LOW_QUAL_TAILS = false; - private final static boolean DONT_CORRECT_IN_LONG_HOMOPOLYMERS = false; - private final static int MAX_HOMOPOLYMER_THRESHOLD = 12; - - // debug counter structure - private final ReadErrorCorrectionStats readErrorCorrectionStats = new ReadErrorCorrectionStats(); - - /** - * Create a new kmer corrector - * - * @param kmerLength the length of kmers we'll be counting to error correct, must be >= 1 - * @param maxMismatchesToCorrect e >= 0 - * @param qualityOfCorrectedBases Bases to be corrected will be assigned this quality - */ - public ReadErrorCorrector(final int kmerLength, - final int maxMismatchesToCorrect, - final int maxObservationsForKmerToBeCorrectable, - final byte qualityOfCorrectedBases, - final int minObservationsForKmerToBeSolid, - final boolean trimLowQualityBases, - final byte minTailQuality, - final boolean debug, - final byte[] fullReferenceWithPadding) { - if ( kmerLength < 1 ) throw new IllegalArgumentException("kmerLength must be > 0 but got " + kmerLength); - if ( maxMismatchesToCorrect < 1 ) - throw new IllegalArgumentException("maxMismatchesToCorrect must be >= 1 but got " + maxMismatchesToCorrect); - if ( qualityOfCorrectedBases < 2 || qualityOfCorrectedBases > QualityUtils.MAX_REASONABLE_Q_SCORE) - throw new IllegalArgumentException("qualityOfCorrectedBases must be >= 2 and <= MAX_REASONABLE_Q_SCORE but got " + qualityOfCorrectedBases); - - countsByKMer = new KMerCounter(kmerLength); - this.kmerLength = kmerLength; - this.maxMismatchesToCorrect = maxMismatchesToCorrect; - this.qualityOfCorrectedBases = qualityOfCorrectedBases; - this.minObservationsForKmerToBeSolid = minObservationsForKmerToBeSolid; - this.trimLowQualityBases = trimLowQualityBases; - this.minTailQuality = minTailQuality; - this.debug = debug; - this.maxObservationsForKmerToBeCorrectable = maxObservationsForKmerToBeCorrectable; - - // when 
region has long homopolymers, we may want not to correct reads, since assessment is complicated, - // so we may decide to skip error correction in these regions - maxHomopolymerLengthInRegion = computeMaxHLen(fullReferenceWithPadding); - } - - /** - * Simple constructor with sensible defaults - * @param kmerLength K-mer length for error correction (not necessarily the same as for assembly graph) - * @param minTailQuality Minimum tail quality: remaining bases with Q's below this value are hard-clipped after correction - * @param debug Output debug information - */ - public ReadErrorCorrector(final int kmerLength, final byte minTailQuality, final int minObservationsForKmerToBeSolid, final boolean debug,final byte[] fullReferenceWithPadding) { - this(kmerLength, MAX_MISMATCHES_TO_CORRECT, MAX_OBSERVATIONS_FOR_KMER_TO_BE_CORRECTABLE, QUALITY_OF_CORRECTED_BASES, minObservationsForKmerToBeSolid, TRIM_LOW_QUAL_TAILS, minTailQuality, debug,fullReferenceWithPadding); - } - - /** - * Main entry routine to add all kmers in a read to the read map counter - * @param read Read to add bases - */ - @Requires("read != null") - protected void addReadKmers(final GATKSAMRecord read) { - if (DONT_CORRECT_IN_LONG_HOMOPOLYMERS && maxHomopolymerLengthInRegion > MAX_HOMOPOLYMER_THRESHOLD) - return; - - final byte[] readBases = read.getReadBases(); - for (int offset = 0; offset <= readBases.length-kmerLength; offset++ ) { - countsByKMer.addKmer(new Kmer(readBases,offset,kmerLength),1); - - } - } - - /** - * Correct a collection of reads based on stored k-mer counts - * @param reads - */ - public final List correctReads(final Collection reads) { - - final List correctedReads = new ArrayList<>(reads.size()); - if (DONT_CORRECT_IN_LONG_HOMOPOLYMERS && maxHomopolymerLengthInRegion > MAX_HOMOPOLYMER_THRESHOLD) { - // just copy reads into output and exit - correctedReads.addAll(reads); - } - else { - computeKmerCorrectionMap(); - for (final GATKSAMRecord read: reads) { - final GATKSAMRecord 
correctedRead = correctRead(read); - if (trimLowQualityBases) - correctedReads.add(ReadClipper.hardClipLowQualEnds(correctedRead, minTailQuality)); - else - correctedReads.add(correctedRead); - } - if (debug) { - logger.info("Number of corrected bases:"+readErrorCorrectionStats.numBasesCorrected); - logger.info("Number of corrected reads:"+readErrorCorrectionStats.numReadsCorrected); - logger.info("Number of skipped reads:"+readErrorCorrectionStats.numReadsUncorrected); - logger.info("Number of solid kmers:"+readErrorCorrectionStats.numSolidKmers); - logger.info("Number of corrected kmers:"+readErrorCorrectionStats.numCorrectedKmers); - logger.info("Number of uncorrectable kmers:"+readErrorCorrectionStats.numUncorrectableKmers); - } - } - return correctedReads; - } - - - /** - * Do actual read correction based on k-mer map. First, loop through stored k-mers to get a list of possible corrections - * for each position in the read. Then correct read based on all possible consistent corrections. - * @param inputRead Read to correct - * @return Corrected read (can be same reference as input if doInplaceErrorCorrection is set) - */ - @Requires("inputRead != null") - private GATKSAMRecord correctRead(final GATKSAMRecord inputRead) { - // no support for reduced reads (which shouldn't need to be error-corrected anyway!) 
- if (inputRead.isReducedRead()) - return inputRead; - - // do actual correction - boolean corrected = false; - final byte[] correctedBases = inputRead.getReadBases(); - final byte[] correctedQuals = inputRead.getBaseQualities(); - - // array to store list of possible corrections for read - final CorrectionSet correctionSet = buildCorrectionMap(correctedBases); - - for (int offset = 0; offset < correctedBases.length; offset++) { - final Byte b = correctionSet.getConsensusCorrection(offset); - if (b != null && b != correctedBases[offset]) { - correctedBases[offset] = b; - correctedQuals[offset] = qualityOfCorrectedBases; - corrected = true; - } - readErrorCorrectionStats.numBasesCorrected++; - } - - if (corrected) { - readErrorCorrectionStats.numReadsCorrected++; - if (doInplaceErrorCorrection) { - inputRead.setReadBases(correctedBases); - inputRead.setBaseQualities(correctedQuals); - return inputRead; - } - else { - GATKSAMRecord correctedRead = new GATKSAMRecord(inputRead); - - // do the actual correction - // todo - do we need to clone anything else from read? - correctedRead.setBaseQualities(inputRead.getBaseQualities()); - correctedRead.setIsStrandless(inputRead.isStrandless()); - correctedRead.setReadBases(inputRead.getReadBases()); - correctedRead.setReadString(inputRead.getReadString()); - correctedRead.setReadGroup(inputRead.getReadGroup()); - return correctedRead; - } - } - else { - readErrorCorrectionStats.numReadsUncorrected++; - return inputRead; - } - } - - /** - * Build correction map for each of the bases in read. - * For each of the constituent kmers in read: - * a) See whether the kmer has been mapped to a corrected kmer. - * b) If so, get list of differing positions and corresponding bases. - * c) Add then list of new bases to index in correction list. - * Correction list is of read size, and holds a list of bases to correct. - * @param correctedBases Bases to attempt to correct - * @return CorrectionSet object. 
- */ - @Requires("correctedBases != null") - private CorrectionSet buildCorrectionMap(final byte[] correctedBases) { - // array to store list of possible corrections for read - final CorrectionSet correctionSet = new CorrectionSet(correctedBases.length); - - for (int offset = 0; offset <= correctedBases.length-kmerLength; offset++ ) { - final Kmer kmer = new Kmer(correctedBases,offset,kmerLength); - final Kmer newKmer = kmerCorrectionMap.get(kmer); - if (newKmer != null && !newKmer.equals(kmer)){ - final Pair differingPositions = kmerDifferingBases.get(kmer); - final int[] differingIndeces = differingPositions.first; - final byte[] differingBases = differingPositions.second; - - for (int k=0; k < differingIndeces.length; k++) { - // get list of differing positions for corrected kmer - // for each of these, add correction candidate to correction set - correctionSet.add(offset + differingIndeces[k],differingBases[k]); - } - } - } - return correctionSet; - } - - - /** - * Top-level entry point that adds a collection of reads to our kmer list. - * For each read in list, its constituent kmers will be logged in our kmer table. - * @param reads - */ - @Requires("reads != null") - public void addReadsToKmers(final Collection reads) { - for (final GATKSAMRecord read: reads) - addReadKmers(read); - - if (debug) - for ( final KMerCounter.CountedKmer countedKmer: countsByKMer.getCountedKmers() ) - logger.info(String.format("%s\t%d\n", countedKmer.kmer, countedKmer.count)); - } - - - /** - * For each kmer we've seen, do the following: - * a) If kmer count > threshold1, this kmer is good, so correction map will be to itself. - * b) If kmer count <= threshold2, this kmer is bad. - * In that case, loop through all other kmers. If kmer is good, compute distance, and get minimal distance. - * If such distance is < some threshold, map to this kmer, and record differing positions and bases. 
- * - */ - private void computeKmerCorrectionMap() { - for (final KMerCounter.CountedKmer storedKmer : countsByKMer.getCountedKmers()) { - if (storedKmer.getCount() >= minObservationsForKmerToBeSolid) { - // this kmer is good: map to itself - kmerCorrectionMap.put(storedKmer.getKmer(),storedKmer.getKmer()); - kmerDifferingBases.put(storedKmer.getKmer(),new Pair<>(new int[0],new byte[0])); // dummy empty array - readErrorCorrectionStats.numSolidKmers++; - } - else if (storedKmer.getCount() <= maxObservationsForKmerToBeCorrectable) { - // loop now thru all other kmers to find nearest neighbor - final Pair> nearestNeighbor = findNearestNeighbor(storedKmer.getKmer(),countsByKMer,maxMismatchesToCorrect); - - // check if nearest neighbor lies in a close vicinity. If so, log the new bases and the correction map - if (nearestNeighbor != null) { // ok, found close neighbor - kmerCorrectionMap.put(storedKmer.getKmer(), nearestNeighbor.first); - kmerDifferingBases.put(storedKmer.getKmer(), nearestNeighbor.second); - readErrorCorrectionStats.numCorrectedKmers++; -// if (debug) -// logger.info("Original kmer:"+storedKmer + "\tCorrected kmer:"+nearestNeighbor.first+"\tDistance:"+dist); - } - else - readErrorCorrectionStats.numUncorrectableKmers++; - - } - } - } - - /** - * Finds nearest neighbor of a given k-mer, among a list of counted K-mers, up to a given distance. - * If many k-mers share same closest distance, an arbitrary k-mer is picked - * @param kmer K-mer of interest - * @param countsByKMer KMerCounter storing set of counted k-mers (may include kmer of interest) - * @param maxDistance Maximum distance to search - * @return Pair of values: closest K-mer in Hamming distance and list of differing bases. 
- * If no neighbor can be found up to given distance, returns null - */ - @Requires({"kmer != null", "countsByKMer != null","maxDistance >= 1"}) - private Pair> findNearestNeighbor(final Kmer kmer, - final KMerCounter countsByKMer, - final int maxDistance) { - int minimumDistance = Integer.MAX_VALUE; - Kmer closestKmer = null; - - final int[] differingIndeces = new int[maxDistance+1]; - final byte[] differingBases = new byte[maxDistance+1]; - - final int[] closestDifferingIndices = new int[maxDistance+1]; - final byte[] closestDifferingBases = new byte[maxDistance+1]; - - for (final KMerCounter.CountedKmer candidateKmer : countsByKMer.getCountedKmers()) { - // skip if candidate set includes test kmer - if (candidateKmer.getKmer().equals(kmer)) - continue; - - final int hammingDistance = kmer.getDifferingPositions(candidateKmer.getKmer(), maxDistance, differingIndeces, differingBases); - if (hammingDistance < 0) // can't compare kmer? skip - continue; - - if (hammingDistance < minimumDistance) { - minimumDistance = hammingDistance; - closestKmer = candidateKmer.getKmer(); - System.arraycopy(differingBases,0,closestDifferingBases,0,differingBases.length); - System.arraycopy(differingIndeces,0,closestDifferingIndices,0,differingIndeces.length); - } - } - return new Pair<>(closestKmer, new Pair<>(closestDifferingIndices,closestDifferingBases)); - } - - - /** - * experimental function to compute max homopolymer length in a given reference context - * @param fullReferenceWithPadding Reference context of interest - * @return Max homopolymer length in region - */ - @Requires("fullReferenceWithPadding != null") - private static int computeMaxHLen(final byte[] fullReferenceWithPadding) { - - int leftRun = 1; - int maxRun = 1; - for ( int i = 1; i < fullReferenceWithPadding.length; i++) { - if ( fullReferenceWithPadding[i] == fullReferenceWithPadding[i-1] ) - leftRun++; - else - leftRun = 1; - } - if (leftRun > maxRun) - maxRun = leftRun; - - - return maxRun; - } - - private 
static final class ReadErrorCorrectionStats { - public int numReadsCorrected; - public int numReadsUncorrected; - public int numBasesCorrected; - public int numSolidKmers; - public int numUncorrectableKmers; - public int numCorrectedKmers; - } - - /** - * Wrapper utility class that holds, for each position in read, a list of bytes representing candidate corrections. - * So, a read ACAGT where the middle A has found to be errorful might look like: - * 0: {} - * 1: {} - * 2: {'C','C','C'} - * 3: {} - * 4: {} - * - * It's up to the method getConsensusCorrection() to decide how to use the correction sets for each position. - * By default, only strict consensus is allowed right now. - * - */ - protected static class CorrectionSet { - private final int size; - private ArrayList> corrections; - - /** - * Main class constructor. - * @param size Size of correction set, needs to be set equal to the read being corrected - */ - public CorrectionSet(final int size) { - this.size = size; - corrections = new ArrayList<>(size); - for (int k=0; k < size; k++) - corrections.add(k,new ArrayList()); - } - - /** - * Add a base to this correction set at a particular offset, measured from the start of the read - * @param offset Offset from start of read - * @param base base to be added to list of corrections at this offset - */ - public void add(final int offset, final byte base) { - if (offset >= size || offset < 0) - throw new IllegalStateException("Bad entry into CorrectionSet: offset > size"); - if (!BaseUtils.isRegularBase(base)) - return; // no irregular base correction - - final List storedBytes = corrections.get(offset); - storedBytes.add(base); - } - - /** - * Get list of corrections for a particular offset - * @param offset Offset of interest - * @return List of bases representing possible corrections at this offset - */ - public List get(final int offset) { - if (offset >= size || offset < 0) - throw new IllegalArgumentException("Illegal call of CorrectionSet.get(): offset 
must be < size"); - return corrections.get(offset); - } - - /** - * Get consensus correction for a particular offset. In this implementation, it just boils down to seeing if - * byte list associated with offset has identical values. If so, return this base, otherwise return null. - * @param offset - * @return Consensus base, or null if no consensus possible. - */ - public Byte getConsensusCorrection(final int offset) { - if (offset >= size || offset < 0) - throw new IllegalArgumentException("Illegal call of CorrectionSet.getConsensusCorrection(): offset must be < size"); - final List storedBytes = corrections.get(offset); - if (storedBytes.isEmpty()) - return null; - - // todo - is there a cheaper/nicer way to compare if all elements in list are identical?? - final byte lastBase = storedBytes.remove(storedBytes.size()-1); - for (final Byte b: storedBytes) { - // strict correction rule: all bases must match - if (b != lastBase) - return null; - } - - // all bytes then are equal: - return lastBase; - - } - - - - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java deleted file mode 100644 index 9020e3426..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java +++ /dev/null @@ -1,112 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import com.google.java.contract.Requires; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.concurrent.atomic.AtomicLong; - -/** - * Path cost indicate the cost (alignment likelihood) of traversing a section of the graph using a segement of a read. - * - *

A path can be a whole haplotype path as well as just a smaller haplotype segment

. - * - *

We would generate many of this objects for each read. The final likelihood of a read vs each haplotype - * would be the summation of the path-cost of that read along the corresponding haplotype path.

- */ -class ReadSegmentCost { - - public Route path; - public GATKSAMRecord read; - - /** - * Holds the cost value. It public and non-final for convenience. - */ - protected double cost; - - /** - * Caches the path bases (the haplotype segment bases). - */ - protected byte[] bases; - - /** - * Construct a new path cost. - * @param read the corresponding read. - * @param path the corresponding path. - * @param cost initial cost estimate. Might be updated later. - */ - @Requires("route != null") - public ReadSegmentCost(final GATKSAMRecord read, - final Route path, double cost) { - this.read = read; - this.path = path; - this.cost = cost; - } - - /** - * Used to generate unique identifiers for path cost object. - */ - private static final AtomicLong pathCostUniqueIdGenerator = new AtomicLong(); - - /** - * Holds the path cost unique identifier. - */ - private Long uniqueId; - - /** - * Returns the this path-cost unique identifier. - * @return - */ - public long uniqueId() { - if (uniqueId == null) - uniqueId = pathCostUniqueIdGenerator.incrementAndGet(); - return uniqueId; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java deleted file mode 100644 index 4ec56f706..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java +++ /dev/null @@ -1,513 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import net.sf.samtools.*; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; -import org.broadinstitute.sting.utils.haplotypeBAMWriter.ReadDestination; -import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; -import org.broadinstitute.variant.vcf.VCFHeaderLine; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; 
-import org.broadinstitute.variant.vcf.VCFSimpleHeaderLine; - -import java.io.File; -import java.util.*; - -/** - * Code for estimating the reference confidence - * - * This code can estimate the probability that the data for a single sample is consistent with a - * well-determined REF/REF diploid genotype. - * - * User: depristo - * Date: 6/21/13 - * Time: 12:52 PM - */ -public class ReferenceConfidenceModel { - - //public final static String INDEL_INFORMATIVE_DEPTH = "CD"; // temporarily taking this extra genotype level information out for now - public final static String ALTERNATE_ALLELE_STRING = "ALT"; // arbitrary alternate allele - - private final GenomeLocParser genomeLocParser; - private final Set samples; - private final SAMFileHeader header; // TODO -- really shouldn't depend on this - private final int indelInformativeDepthIndelSize; - - private final static boolean WRITE_DEBUGGING_BAM = false; - private final SAMFileWriter debuggingWriter; - - private final static byte REF_MODEL_DELETION_QUAL = (byte) 30; - - /** - * Create a new ReferenceConfidenceModel - * - * @param genomeLocParser how we create genome locs - * @param samples the list of all samples we'll be considering with this model - * @param header the SAMFileHeader describing the read information (used for debugging) - * @param indelInformativeDepthIndelSize the max size of indels to consider when calculating indel informative depths - */ - public ReferenceConfidenceModel(final GenomeLocParser genomeLocParser, - final Set samples, - final SAMFileHeader header, - final int indelInformativeDepthIndelSize) { - if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null"); - if ( samples == null ) throw new IllegalArgumentException("samples cannot be null"); - if ( samples.isEmpty() ) throw new IllegalArgumentException("samples cannot be empty"); - if ( header == null ) throw new IllegalArgumentException("header cannot be empty"); - if ( 
indelInformativeDepthIndelSize < 0) throw new IllegalArgumentException("indelInformativeDepthIndelSize must be >= 1 but got " + indelInformativeDepthIndelSize); - - this.genomeLocParser = genomeLocParser; - this.samples = samples; - this.header = header; - this.indelInformativeDepthIndelSize = indelInformativeDepthIndelSize; - - if ( WRITE_DEBUGGING_BAM ) { - final SAMFileWriterFactory factory = new SAMFileWriterFactory(); - factory.setCreateIndex(true); - debuggingWriter = factory.makeBAMWriter(header, false, new File("refCalc.bam")); - } else { - debuggingWriter = null; - } - - initializeIndelPLCache(); - } - - /** - * Get the VCF header lines to include when emitting reference confidence values via calculateRefConfidence - * @return a non-null set of VCFHeaderLines - */ - public Set getVCFHeaderLines() { - final Set headerLines = new LinkedHashSet<>(); - // TODO - do we need a new kind of VCF Header subclass for specifying arbitrary alternate alleles? - headerLines.add(new VCFSimpleHeaderLine(ALTERNATE_ALLELE_STRING, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location")); - //headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize)); - return headerLines; - } - - /** - * Close down this reference model, closing down any debugging information opened during execution - */ - public void close() { - if ( debuggingWriter != null ) debuggingWriter.close(); - } - - - /** - * Calculate the reference confidence for a single sample given the its read data - * - * Returns a list of variant contexts, one for each position in the activeregion.getLoc(), each containing - * detailed information about the certainty that the sample is hom-ref for each base in the region. 
- * - * - * - * @param refHaplotype the reference haplotype, used to get the reference bases across activeRegion.getLoc() - * @param calledHaplotypes a list of haplotypes that segregate in this region, for realignment of the reads in the - * stratifiedReadMap, corresponding to each reads best haplotype. Must contain the refHaplotype. - * @param paddedReferenceLoc the location of refHaplotype (which might be larger than activeRegion.getLoc()) - * @param activeRegion the active region we want to get the reference confidence over - * @param stratifiedReadMap a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes - * @param variantCalls calls made in this region. The return result will contain any variant call in this list in the - * correct order by genomic position, and any variant in this list will stop us emitting a ref confidence - * under any position it covers (for snps and insertions that is 1 bp, but for deletions its the entire ref span) - * @return an ordered list of variant contexts that spans activeRegion.getLoc() and includes both reference confidence - * contexts as well as calls from variantCalls if any were provided - */ - public List calculateRefConfidence(final Haplotype refHaplotype, - final Collection calledHaplotypes, - final GenomeLoc paddedReferenceLoc, - final ActiveRegion activeRegion, - final Map stratifiedReadMap, - final List variantCalls) { - if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); - if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); - if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype"); - if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null"); - if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null"); - if ( 
stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); - if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); - if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion location size " + activeRegion.getLocation().size() + " are different"); - - final GenomeLoc refSpan = activeRegion.getLocation(); - final List refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, activeRegion, refSpan, stratifiedReadMap); - final byte[] ref = refHaplotype.getBases(); - final List results = new ArrayList<>(refSpan.size()); - final String sampleName = stratifiedReadMap.keySet().iterator().next(); - - final int globalRefOffset = refSpan.getStart() - activeRegion.getExtendedLoc().getStart(); - for ( final ReadBackedPileup pileup : refPileups ) { - final GenomeLoc curPos = pileup.getLocation(); - final int offset = curPos.getStart() - refSpan.getStart(); - - final VariantContext overlappingSite = getOverlappingVariantContext(curPos, variantCalls); - if ( overlappingSite != null ) { - // we have some overlapping site, add it to the list of positions - if ( overlappingSite.getStart() == curPos.getStart() ) - results.add(overlappingSite); - } else { - // otherwise emit a reference confidence variant context - final int refOffset = offset + globalRefOffset; - final byte refBase = ref[refOffset]; - final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, (byte)6, null); - homRefCalc.capByHomRefLikelihood(); - - final Allele refAllele = Allele.create(refBase, true); - final List refSiteAlleles = Arrays.asList(refAllele, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final VariantContextBuilder vcb = new VariantContextBuilder("HC", curPos.getContig(), 
curPos.getStart(), curPos.getStart(), refSiteAlleles); - final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Arrays.asList(refAllele, refAllele)); - gb.AD(homRefCalc.AD_Ref_Any); - gb.DP(homRefCalc.getDP()); - - // genotype likelihood calculation - final GenotypeLikelihoods snpGLs = GenotypeLikelihoods.fromLog10Likelihoods(homRefCalc.genotypeLikelihoods); - final int nIndelInformativeReads = calcNIndelInformativeReads(pileup, refOffset, ref, indelInformativeDepthIndelSize); - final GenotypeLikelihoods indelGLs = getIndelPLs(nIndelInformativeReads); - - // now that we have the SNP and indel GLs, we take the one with the least confidence, - // as this is the most conservative estimate of our certainty that we are hom-ref. - // For example, if the SNP PLs are 0,10,100 and the indel PLs are 0,100,1000 - // we are very certain that there's no indel here, but the SNP confidence imply that we are - // far less confident that the ref base is actually the only thing here. So we take 0,10,100 - // as our GLs for the site. - final GenotypeLikelihoods leastConfidenceGLs = getGLwithWorstGQ(indelGLs, snpGLs); - - gb.GQ((int) (-10 * leastConfidenceGLs.getLog10GQ(GenotypeType.HOM_REF))); - gb.PL(leastConfidenceGLs.getAsPLs()); - //gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads); - - vcb.genotypes(gb.make()); - results.add(vcb.make()); -// logger.info(" => VariantContext " + vcb.make()); - } - } - - return results; - } - - /** - * Get the GenotypeLikelihoods with the least strong corresponding GQ value - * @param gl1 first to consider (cannot be null) - * @param gl2 second to consider (cannot be null) - * @return gl1 or gl2, whichever has the worst GQ - */ - protected final GenotypeLikelihoods getGLwithWorstGQ(final GenotypeLikelihoods gl1, final GenotypeLikelihoods gl2) { - return gl1.getLog10GQ(GenotypeType.HOM_REF) > gl2.getLog10GQ(GenotypeType.HOM_REF) ? 
gl1 : gl2; - } - - /** - * Get indel PLs corresponding to seeing N nIndelInformativeReads at this site - * - * @param nInformativeReads the number of reads that inform us about being ref without an indel at this site - * @return non-null GenotypeLikelihoods given N - */ - protected final GenotypeLikelihoods getIndelPLs(final int nInformativeReads) { - return indelPLCache[nInformativeReads > MAX_N_INDEL_INFORMATIVE_READS ? MAX_N_INDEL_INFORMATIVE_READS : nInformativeReads]; - } - - protected static final int MAX_N_INDEL_INFORMATIVE_READS = 40; // more than this is overkill because GQs are capped at 99 anyway - private static final GenotypeLikelihoods[] indelPLCache = new GenotypeLikelihoods[MAX_N_INDEL_INFORMATIVE_READS + 1]; - private static final double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp - - private void initializeIndelPLCache() { - for( int nInformativeReads = 0; nInformativeReads <= MAX_N_INDEL_INFORMATIVE_READS; nInformativeReads++ ) { - final double homRef = 0.0; - final double het = MathUtils.LOG_ONE_HALF * nInformativeReads; - final double homVar = INDEL_ERROR_RATE * nInformativeReads; - indelPLCache[nInformativeReads] = GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar}); - } - } - - /** - * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. 
alt - * - * @param pileup the read backed pileup containing the data we want to evaluate - * @param refBase the reference base at this pileup position - * @param minBaseQual the min base quality for a read in the pileup at the pileup position to be included in the calculation - * @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips - * @return a RefVsAnyResult genotype call - */ - public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) { - final RefVsAnyResult result = new RefVsAnyResult(); - - for( final PileupElement p : pileup ) { - final byte qual = (p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual()); - if( p.isDeletion() || qual > minBaseQual ) { - int AA = 0; final int AB = 1; int BB = 2; - if( p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { - AA = 2; - BB = 0; - if( hqSoftClips != null && p.isNextToSoftClip() ) { - hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28)); - } - result.AD_Ref_Any[1] += p.getRepresentativeCount(); - } else { - result.AD_Ref_Any[0] += p.getRepresentativeCount(); - } - result.genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual); - result.genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF ); - result.genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD; - } - } - - return result; - } - - /** - * Get a list of pileups that span the entire active region span, in order, one for each position 
- */ - private List getPileupsOverReference(final Haplotype refHaplotype, - final Collection calledHaplotypes, - final GenomeLoc paddedReferenceLoc, - final ActiveRegion activeRegion, - final GenomeLoc activeRegionSpan, - final Map stratifiedReadMap) { - - if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); - if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); - if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype"); - if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null"); - if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null"); - if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); - if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); - - List realignedReads; - - if( calledHaplotypes.size() == 1 ) { // only contains ref haplotype so an optimization is to just trust the alignments to the reference haplotype as provided by the aligner - realignedReads = activeRegion.getReads(); - } else { - final ReadDestination.ToList realignedReadsDest = new ReadDestination.ToList(header, "FOO"); - final HaplotypeBAMWriter writer = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, realignedReadsDest); - writer.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves - writer.setOnlyRealignInformativeReads(true); - writer.writeReadsAlignedToHaplotypes(calledHaplotypes, paddedReferenceLoc, stratifiedReadMap); - realignedReads = ReadUtils.sortReadsByCoordinate(realignedReadsDest.getReads()); - } - - if ( debuggingWriter != null ) - for ( final GATKSAMRecord read : 
realignedReads ) - debuggingWriter.addAlignment(read); - - final LocusIteratorByState libs = new LocusIteratorByState(realignedReads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING, - true, genomeLocParser, samples, false); - - final List pileups = new LinkedList<>(); - final int startPos = activeRegionSpan.getStart(); - AlignmentContext next = libs.advanceToLocus(startPos, true); - for ( int curPos = startPos; curPos <= activeRegionSpan.getStop(); curPos++ ) { - if ( next != null && next.getLocation().getStart() == curPos ) { - pileups.add(next.getBasePileup()); - next = libs.hasNext() ? libs.next() : null; - } else { - // no data, so we create empty pileups - pileups.add(new ReadBackedPileupImpl(genomeLocParser.createGenomeLoc(activeRegionSpan.getContig(), curPos))); - } - } - - return pileups; - } - - /** - * Return the rightmost variant context in maybeOverlapping that overlaps curPos - * - * @param curPos non-null genome loc - * @param maybeOverlapping a collection of variant contexts that might overlap curPos - * @return a VariantContext, or null if none overlaps - */ - protected final VariantContext getOverlappingVariantContext(final GenomeLoc curPos, final Collection maybeOverlapping) { - VariantContext overlaps = null; - for ( final VariantContext vc : maybeOverlapping ) { - if ( genomeLocParser.createGenomeLoc(vc).overlapsP(curPos) ) { - if ( overlaps == null || vc.getStart() > overlaps.getStart() ) { - overlaps = vc; - } - } - } - return overlaps; - } - - /** - * Compute the sum of mismatching base qualities for readBases aligned to refBases at readStart / refStart - * assuming no insertions or deletions in the read w.r.t. 
the reference - * - * @param readBases non-null bases of the read - * @param readQuals non-null quals of the read - * @param readStart the starting position of the read (i.e., that aligns it to a position in the reference) - * @param refBases the reference bases - * @param refStart the offset into refBases that aligns to the readStart position in readBases - * @param maxSum if the sum goes over this value, return immediately - * @return the sum of quality scores for readBases that mismatch their corresponding ref bases - */ - protected final int sumMismatchingQualities(final byte[] readBases, - final byte[] readQuals, - final int readStart, - final byte[] refBases, - final int refStart, - final int maxSum) { - final int n = Math.min(readBases.length - readStart, refBases.length - refStart); - int sum = 0; - - for ( int i = 0; i < n; i++ ) { - final byte readBase = readBases[readStart + i]; - final byte refBase = refBases[refStart + i]; - if ( readBase != refBase ) { - sum += readQuals[readStart + i]; - if ( sum > maxSum ) // abort early - return sum; - } - } - - return sum; - } - - /** - * Compute whether a read is informative to eliminate an indel of size <= maxIndelSize segregating at readStart/refStart - * - * @param readBases non-null bases of the read - * @param readQuals non-null quals of the read - * @param readStart the starting position of the read (i.e., that aligns it to a position in the reference) - * @param refBases the reference bases - * @param refStart the offset into refBases that aligns to the readStart position in readBases - * @param maxIndelSize the max indel size to consider for the read to be informative - * @return true if read can eliminate the possibility that there's an indel of size <= maxIndelSize segregating at refStart - */ - protected boolean isReadInformativeAboutIndelsOfSize(final byte[] readBases, - final byte[] readQuals, - final int readStart, - final byte[] refBases, - final int refStart, - final int maxIndelSize) { - // fast 
exit when n bases left < maxIndelSize - if( readBases.length - readStart < maxIndelSize || refBases.length - refStart < maxIndelSize ) { - return false; - } - - final int baselineMMSum = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart, Integer.MAX_VALUE); - - // consider each indel size up to max in term, checking if an indel that deletes either the ref bases (deletion - // or read bases (insertion) would fit as well as the origin baseline sum of mismatching quality scores - for ( int indelSize = 1; indelSize <= maxIndelSize; indelSize++ ) { - for ( final boolean checkInsertion : Arrays.asList(true, false) ) { - final int readI, refI; - if ( checkInsertion ) { - readI = readStart + indelSize; - refI = refStart; - } else { - readI = readStart; - refI = refStart + indelSize; - } - - final int score = sumMismatchingQualities(readBases, readQuals, readI, refBases, refI, baselineMMSum); - if ( score <= baselineMMSum ) - return false; - } - } - - return true; - } - - /** - * Calculate the number of indel informative reads at pileup - * - * @param pileup a pileup - * @param pileupOffsetIntoRef the position of the pileup in the reference - * @param ref the ref bases - * @param maxIndelSize maximum indel size to consider in the informativeness calculation - * @return an integer >= 0 - */ - protected final int calcNIndelInformativeReads(final ReadBackedPileup pileup, final int pileupOffsetIntoRef, final byte[] ref, final int maxIndelSize) { - int nInformative = 0; - for ( final PileupElement p : pileup ) { - final GATKSAMRecord read = p.getRead(); - final int offset = p.getOffset(); - - // doesn't count as evidence - if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() || p.isDeletion() ) - continue; - - // todo -- this code really should handle CIGARs directly instead of relying on the above tests - if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize) ) { - 
nInformative += p.getRepresentativeCount(); - if( nInformative > MAX_N_INDEL_INFORMATIVE_READS ) { - return MAX_N_INDEL_INFORMATIVE_READS; - } - } - } - return nInformative; - } - - /** - * Create a reference haplotype for an active region - * - * @param activeRegion the active region - * @param refBases the ref bases - * @param paddedReferenceLoc the location spanning of the refBases -- can be longer than activeRegion.getLocation() - * @return a reference haplotype - */ - public static Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final byte[] refBases, final GenomeLoc paddedReferenceLoc) { - final Haplotype refHaplotype = new Haplotype(refBases, true); - final int alignmentStart = activeRegion.getExtendedLoc().getStart() - paddedReferenceLoc.getStart(); - if ( alignmentStart < 0 ) throw new IllegalStateException("Bad alignment start in createReferenceHaplotype " + alignmentStart); - refHaplotype.setAlignmentStartHapwrtRef(alignmentStart); - final Cigar c = new Cigar(); - c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M)); - refHaplotype.setCigar(c); - return refHaplotype; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java deleted file mode 100644 index edd8dbb16..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java +++ /dev/null @@ -1,716 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import com.google.java.contract.Requires; -import org.apache.commons.lang.ArrayUtils; -import org.apache.log4j.Logger; -import org.jgrapht.EdgeFactory; -import org.jgrapht.graph.DefaultDirectedGraph; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.PrintStream; -import java.util.*; - -/** - * Created with IntelliJ IDEA. - * User: rpoplin - * Date: 2/6/13 - */ -@Invariant("!this.isAllowingMultipleEdges()") -public class BaseGraph extends DefaultDirectedGraph { - protected final static Logger logger = Logger.getLogger(BaseGraph.class); - protected final int kmerSize; - - /** - * Construct a TestGraph with kmerSize - * @param kmerSize - */ - public BaseGraph(final int kmerSize, final EdgeFactory edgeFactory) { - super(edgeFactory); - - if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize); - this.kmerSize = kmerSize; - } - - /** - * How big of a kmer did we use to create this graph? 
- * @return - */ - public int getKmerSize() { - return kmerSize; - } - - /** - * @param v the vertex to test - * @return true if this vertex is a reference node (meaning that it appears on the reference path in the graph) - */ - public boolean isReferenceNode( final V v ) { - if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } - - for ( final BaseEdge e : edgesOf(v) ) { - if ( e.isRef() ) { return true; } - } - - // edge case: if the graph only has one node then it's a ref node, otherwise it's not - return (vertexSet().size() == 1); - } - - /** - * @param v the vertex to test - * @return true if this vertex is a source node (in degree == 0) - */ - public boolean isSource( final V v ) { - if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } - return inDegreeOf(v) == 0; - } - - /** - * @param v the vertex to test - * @return true if this vertex is a sink node (out degree == 0) - */ - public boolean isSink( final V v ) { - if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } - return outDegreeOf(v) == 0; - } - - /** - * Get the set of source vertices of this graph - * @return a non-null set - */ - public Set getSources() { - final Set set = new LinkedHashSet(); - for ( final V v : vertexSet() ) - if ( isSource(v) ) - set.add(v); - return set; - } - - /** - * Get the set of sink vertices of this graph - * @return a non-null set - */ - public Set getSinks() { - final Set set = new LinkedHashSet(); - for ( final V v : vertexSet() ) - if ( isSink(v) ) - set.add(v); - return set; - } - - /** - * Pull out the additional sequence implied by traversing this node in the graph - * @param v the vertex from which to pull out the additional base sequence - * @return non-null byte array - */ - @Ensures({"result != null"}) - public byte[] getAdditionalSequence( final V v ) { - if( v == null ) { throw new IllegalArgumentException("Attempting to pull sequence from a 
null vertex."); } - return v.getAdditionalSequence(isSource(v)); - } - - /** - * @param v the vertex to test - * @return true if this vertex is a reference source - */ - public boolean isRefSource( final V v ) { - if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } - - // confirm that no incoming edges are reference edges - for ( final E edgeToTest : incomingEdgesOf(v) ) { - if ( edgeToTest.isRef() ) { return false; } - } - - // confirm that there is an outgoing reference edge - for ( final E edgeToTest : outgoingEdgesOf(v) ) { - if ( edgeToTest.isRef() ) { return true; } - } - - // edge case: if the graph only has one node then it's a ref sink, otherwise it's not - return (vertexSet().size() == 1); - } - - /** - * @param v the vertex to test - * @return true if this vertex is a reference sink - */ - public boolean isRefSink( final V v ) { - if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } - - // confirm that no outgoing edges are reference edges - for ( final E edgeToTest : outgoingEdgesOf(v) ) { - if ( edgeToTest.isRef() ) { return false; } - } - - // confirm that there is an incoming reference edge - for ( final E edgeToTest : incomingEdgesOf(v) ) { - if ( edgeToTest.isRef() ) { return true; } - } - - // edge case: if the graph only has one node then it's a ref source, otherwise it's not - return (vertexSet().size() == 1); - } - - /** - * @return the reference source vertex pulled from the graph, can be null if it doesn't exist in the graph - */ - public V getReferenceSourceVertex( ) { - for( final V v : vertexSet() ) { - if( isRefSource(v) ) { - return v; - } - } - return null; - } - - /** - * @return the reference sink vertex pulled from the graph, can be null if it doesn't exist in the graph - */ - public V getReferenceSinkVertex( ) { - for( final V v : vertexSet() ) { - if( isRefSink(v) ) { - return v; - } - } - return null; - } - - /** - * Traverse the graph and get the 
next reference vertex if it exists - * @param v the current vertex, can be null - * @return the next reference vertex if it exists - */ - public V getNextReferenceVertex( final V v ) { - if( v == null ) { return null; } - for( final E edgeToTest : outgoingEdgesOf(v) ) { - if( edgeToTest.isRef() ) { - return getEdgeTarget(edgeToTest); - } - } - return null; - } - - /** - * Traverse the graph and get the previous reference vertex if it exists - * @param v the current vertex, can be null - * @return the previous reference vertex if it exists - */ - public V getPrevReferenceVertex( final V v ) { - if( v == null ) { return null; } - for( final E edgeToTest : incomingEdgesOf(v) ) { - if( isReferenceNode(getEdgeSource(edgeToTest)) ) { - return getEdgeSource(edgeToTest); - } - } - return null; - } - - /** - * Does a reference path exist between the two vertices? - * @param fromVertex from this vertex, can be null - * @param toVertex to this vertex, can be null - * @return true if a reference path exists in the graph between the two vertices - */ - public boolean referencePathExists(final V fromVertex, final V toVertex) { - V v = fromVertex; - if( v == null ) { - return false; - } - v = getNextReferenceVertex(v); - if( v == null ) { - return false; - } - while( !v.equals(toVertex) ) { - v = getNextReferenceVertex(v); - if( v == null ) { - return false; - } - } - return true; - } - - /** - * Walk along the reference path in the graph and pull out the corresponding bases - * @param fromVertex starting vertex - * @param toVertex ending vertex - * @param includeStart should the starting vertex be included in the path - * @param includeStop should the ending vertex be included in the path - * @return byte[] array holding the reference bases, this can be null if there are no nodes between the starting and ending vertex (insertions for example) - */ - public byte[] getReferenceBytes( final V fromVertex, final V toVertex, final boolean includeStart, final boolean includeStop ) { - 
if( fromVertex == null ) { throw new IllegalArgumentException("Starting vertex in requested path cannot be null."); } - if( toVertex == null ) { throw new IllegalArgumentException("From vertex in requested path cannot be null."); } - - byte[] bytes = null; - V v = fromVertex; - if( includeStart ) { - bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v)); - } - v = getNextReferenceVertex(v); // advance along the reference path - while( v != null && !v.equals(toVertex) ) { - bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v)); - v = getNextReferenceVertex(v); // advance along the reference path - } - if( includeStop && v != null && v.equals(toVertex)) { - bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v)); - } - return bytes; - } - - /** - * Convenience function to add multiple vertices to the graph at once - * @param vertices one or more vertices to add - */ - public void addVertices(final V... vertices) { - for ( final V v : vertices ) - addVertex(v); - } - - /** - * Convenience function to add multiple vertices to the graph at once - * @param vertices one or more vertices to add - */ - public void addVertices(final Collection vertices) { - for ( final V v : vertices ) - addVertex(v); - } - - /** - * Convenience function to add multiple edges to the graph - * @param start the first vertex to connect - * @param remaining all additional vertices to connect - */ - public void addEdges(final V start, final V... remaining) { - V prev = start; - for ( final V next : remaining ) { - addEdge(prev, next); - prev = next; - } - } - - /** - * Convenience function to add multiple edges to the graph - * @param start the first vertex to connect - * @param remaining all additional vertices to connect - */ - public void addEdges(final E template, final V start, final V... remaining) { - V prev = start; - for ( final V next : remaining ) { - addEdge(prev, next, (E)(template.copy())); // TODO -- is there a better way to do this? 
- prev = next; - } - } - - /** - * Get the set of vertices connected by outgoing edges of V - * @param v a non-null vertex - * @return a set of vertices connected by outgoing edges from v - */ - public Set outgoingVerticesOf(final V v) { - final Set s = new LinkedHashSet(); - for ( final E e : outgoingEdgesOf(v) ) { - s.add(getEdgeTarget(e)); - } - return s; - } - - /** - * Get the set of vertices connected to v by incoming edges - * @param v a non-null vertex - * @return a set of vertices {X} connected X -> v - */ - public Set incomingVerticesOf(final V v) { - final Set s = new LinkedHashSet(); - for ( final E e : incomingEdgesOf(v) ) { - s.add(getEdgeSource(e)); - } - return s; - } - - /** - * Get the set of vertices connected to v by incoming or outgoing edges - * @param v a non-null vertex - * @return a set of vertices {X} connected X -> v or v -> Y - */ - public Set neighboringVerticesOf(final V v) { - final Set s = incomingVerticesOf(v); - s.addAll(outgoingVerticesOf(v)); - return s; - } - - /** - * Print out the graph in the dot language for visualization - * @param destination File to write to - */ - public void printGraph(final File destination, final int pruneFactor) { - PrintStream stream = null; - - try { - stream = new PrintStream(new FileOutputStream(destination)); - printGraph(stream, true, pruneFactor); - } catch ( FileNotFoundException e ) { - throw new RuntimeException(e); - } finally { - if ( stream != null ) stream.close(); - } - } - - public void printGraph(final PrintStream graphWriter, final boolean writeHeader, final int pruneFactor) { - if ( writeHeader ) - graphWriter.println("digraph assemblyGraphs {"); - - for( final E edge : edgeSet() ) { - graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? 
"style=dotted,color=grey," : "") + "label=\"" + edge.getDotLabel() + "\"];"); - if( edge.isRef() ) { - graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); - } - } - - for( final V v : vertexSet() ) { -// graphWriter.println("\t" + v.toString() + " [label=\"" + v + "\",shape=box]"); - graphWriter.println("\t" + v.toString() + " [label=\"" + new String(getAdditionalSequence(v)) + v.additionalInfo() + "\",shape=box]"); - } - - if ( writeHeader ) - graphWriter.println("}"); - } - - /** - * Remove edges that are connected before the reference source and after the reference sink - * - * Also removes all vertices that are orphaned by this process - */ - public void cleanNonRefPaths() { - if( getReferenceSourceVertex() == null || getReferenceSinkVertex() == null ) { - return; - } - - // Remove non-ref edges connected before and after the reference path - final Set edgesToCheck = new HashSet(); - edgesToCheck.addAll(incomingEdgesOf(getReferenceSourceVertex())); - while( !edgesToCheck.isEmpty() ) { - final E e = edgesToCheck.iterator().next(); - if( !e.isRef() ) { - edgesToCheck.addAll( incomingEdgesOf(getEdgeSource(e)) ); - removeEdge(e); - } - edgesToCheck.remove(e); - } - - edgesToCheck.addAll(outgoingEdgesOf(getReferenceSinkVertex())); - while( !edgesToCheck.isEmpty() ) { - final E e = edgesToCheck.iterator().next(); - if( !e.isRef() ) { - edgesToCheck.addAll( outgoingEdgesOf(getEdgeTarget(e)) ); - removeEdge(e); - } - edgesToCheck.remove(e); - } - - removeSingletonOrphanVertices(); - } - - /** - * Prune all chains from this graph where any edge in the path has multiplicity < pruneFactor - * - * @see LowWeightChainPruner for more information - * - * @param pruneFactor all edges with multiplicity < this factor that aren't ref edges will be removed - */ - public void pruneLowWeightChains( final int pruneFactor ) { - final LowWeightChainPruner pruner = new LowWeightChainPruner<>(pruneFactor); - 
pruner.pruneLowWeightChains(this); - } - - /** - * Remove all vertices in the graph that have in and out degree of 0 - */ - public void removeSingletonOrphanVertices() { - // Run through the graph and clean up singular orphaned nodes - final List verticesToRemove = new LinkedList<>(); - for( final V v : vertexSet() ) { - if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 && !isRefSource(v) ) { - verticesToRemove.add(v); - } - } - removeAllVertices(verticesToRemove); - } - - /** - * Remove all vertices on the graph that cannot be accessed by following any edge, - * regardless of its direction, from the reference source vertex - */ - public void removeVerticesNotConnectedToRefRegardlessOfEdgeDirection() { - final HashSet toRemove = new HashSet<>(vertexSet()); - - final V refV = getReferenceSourceVertex(); - if ( refV != null ) { - for ( final V v : new BaseGraphIterator<>(this, refV, true, true) ) { - toRemove.remove(v); - } - } - - removeAllVertices(toRemove); - } - - /** - * Remove all vertices in the graph that aren't on a path from the reference source vertex to the reference sink vertex - * - * More aggressive reference pruning algorithm than removeVerticesNotConnectedToRefRegardlessOfEdgeDirection, - * as it requires vertices to not only be connected by a series of directed edges but also prunes away - * paths that do not also meet eventually with the reference sink vertex - */ - public void removePathsNotConnectedToRef() { - if ( getReferenceSourceVertex() == null || getReferenceSinkVertex() == null ) { - throw new IllegalStateException("Graph must have ref source and sink vertices"); - } - - // get the set of vertices we can reach by going forward from the ref source - final Set onPathFromRefSource = new HashSet<>(vertexSet().size()); - for ( final V v : new BaseGraphIterator<>(this, getReferenceSourceVertex(), false, true) ) { - onPathFromRefSource.add(v); - } - - // get the set of vertices we can reach by going backward from the ref sink - final Set 
onPathFromRefSink = new HashSet<>(vertexSet().size()); - for ( final V v : new BaseGraphIterator<>(this, getReferenceSinkVertex(), true, false) ) { - onPathFromRefSink.add(v); - } - - // we want to remove anything that's not in both the sink and source sets - final Set verticesToRemove = new HashSet<>(vertexSet()); - onPathFromRefSource.retainAll(onPathFromRefSink); - verticesToRemove.removeAll(onPathFromRefSource); - removeAllVertices(verticesToRemove); - - // simple sanity checks that this algorithm is working. - if ( getSinks().size() > 1 ) { - throw new IllegalStateException("Should have eliminated all but the reference sink, but found " + getSinks()); - } - - if ( getSources().size() > 1 ) { - throw new IllegalStateException("Should have eliminated all but the reference source, but found " + getSources()); - } - } - - /** - * Semi-lenient comparison of two graphs, truing true if g1 and g2 have similar structure - * - * By similar this means that both graphs have the same number of vertices, where each vertex can find - * a vertex in the other graph that's seqEqual to it. A similar constraint applies to the edges, - * where all edges in g1 must have a corresponding edge in g2 where both source and target vertices are - * seqEqual - * - * @param g1 the first graph to compare - * @param g2 the second graph to compare - * @param the type of the nodes in those graphs - * @return true if g1 and g2 are equals - */ - public static boolean graphEquals(final BaseGraph g1, BaseGraph g2) { - final Set vertices1 = g1.vertexSet(); - final Set vertices2 = g2.vertexSet(); - final Set edges1 = g1.edgeSet(); - final Set edges2 = g2.edgeSet(); - - if ( vertices1.size() != vertices2.size() || edges1.size() != edges2.size() ) - return false; - - for ( final T v1 : vertices1 ) { - boolean found = false; - for ( final T v2 : vertices2 ) - found = found || v1.getSequenceString().equals(v2.getSequenceString()); - if ( ! 
found ) return false; - } - - for( final E e1 : g1.edgeSet() ) { - boolean found = false; - for( E e2 : g2.edgeSet() ) { - if( g1.seqEquals(e1, e2, g2) ) { found = true; break; } - } - if( !found ) { return false; } - } - for( final E e2 : g2.edgeSet() ) { - boolean found = false; - for( E e1 : g1.edgeSet() ) { - if( g2.seqEquals(e2, e1, g1) ) { found = true; break; } - } - if( !found ) { return false; } - } - return true; - } - - // For use when comparing edges across graphs! - private boolean seqEquals( final E edge1, final E edge2, final BaseGraph graph2 ) { - return (this.getEdgeSource(edge1).seqEquals(graph2.getEdgeSource(edge2))) && (this.getEdgeTarget(edge1).seqEquals(graph2.getEdgeTarget(edge2))); - } - - - /** - * Get the incoming edge of v. Requires that there be only one such edge or throws an error - * @param v our vertex - * @return the single incoming edge to v, or null if none exists - */ - public E incomingEdgeOf(final V v) { - return getSingletonEdge(incomingEdgesOf(v)); - } - - /** - * Get the outgoing edge of v. Requires that there be only one such edge or throws an error - * @param v our vertex - * @return the single outgoing edge from v, or null if none exists - */ - public E outgoingEdgeOf(final V v) { - return getSingletonEdge(outgoingEdgesOf(v)); - } - - /** - * Helper function that gets the a single edge from edges, null if edges is empty, or - * throws an error is edges has more than 1 element - * @param edges a set of edges - * @return a edge - */ - @Requires("edges != null") - private E getSingletonEdge(final Collection edges) { - if ( edges.size() > 1 ) throw new IllegalArgumentException("Cannot get a single incoming edge for a vertex with multiple incoming edges " + edges); - return edges.isEmpty() ? 
null : edges.iterator().next(); - } - - /** - * Add edge between source -> target if none exists, or add e to an already existing one if present - * - * @param source source vertex - * @param target vertex - * @param e edge to add - */ - public void addOrUpdateEdge(final V source, final V target, final E e) { - final E prev = getEdge(source, target); - if ( prev != null ) { - prev.add(e); - } else { - addEdge(source, target, e); - } - } - - @Override - public String toString() { - return "BaseGraph{" + - "kmerSize=" + kmerSize + - '}'; - } - - /** - * The base sequence for the given path. - * Note, this assumes that the path does not start with a source node. - * - * @param path the list of vertexes that make up the path - * @return non-null sequence of bases corresponding to the given path - */ - @Ensures({"result != null"}) - public byte[] getBasesForPath(final List path) { - if ( path == null ) throw new IllegalArgumentException("Path cannot be null"); - - final StringBuffer sb = new StringBuffer(); - for ( final DeBruijnVertex v : path ) - sb.append((char)v.getSuffix()); - - return sb.toString().getBytes(); - } - - /** - * Get the set of vertices within distance edges of source, regardless of edge direction - * - * @param source the source vertex to consider - * @param distance the distance - * @return a set of vertices within distance of source - */ - protected Set verticesWithinDistance(final V source, final int distance) { - if ( distance == 0 ) - return Collections.singleton(source); - - final Set found = new HashSet<>(); - found.add(source); - for ( final V v : neighboringVerticesOf(source) ) { - found.addAll(verticesWithinDistance(v, distance - 1)); - } - - return found; - } - - /** - * Get a graph containing only the vertices within distance edges of target - * @param target a vertex in graph - * @param distance the max distance - * @return a non-null graph - */ - public BaseGraph subsetToNeighbors(final V target, final int distance) { - if ( target == 
null ) throw new IllegalArgumentException("Target cannot be null"); - if ( ! containsVertex(target) ) throw new IllegalArgumentException("Graph doesn't contain vertex " + target); - if ( distance < 0 ) throw new IllegalArgumentException("Distance must be >= 0 but got " + distance); - - - final Set toKeep = verticesWithinDistance(target, distance); - final Set toRemove = new HashSet<>(vertexSet()); - toRemove.removeAll(toKeep); - - final BaseGraph result = (BaseGraph)clone(); - result.removeAllVertices(toRemove); - - return result; - } - - /** - * Get a subgraph of graph that contains only vertices within 10 edges of the ref source vertex - * @return a non-null subgraph of this graph - */ - public BaseGraph subsetToRefSource() { - return subsetToNeighbors(getReferenceSourceVertex(), 10); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java deleted file mode 100644 index ec2ccff20..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java +++ /dev/null @@ -1,125 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; - -import com.google.java.contract.Ensures; - -/** - * simple node class for storing kmer sequences - * - * User: ebanks, mdepristo - * Date: Mar 23, 2011 - */ -public class DeBruijnVertex extends BaseVertex { - private final static byte[][] sufficesAsByteArray = new byte[256][]; - static { - for ( int i = 0; i < sufficesAsByteArray.length; i++ ) - sufficesAsByteArray[i] = new byte[]{(byte)(i & 0xFF)}; - } - - public DeBruijnVertex( final byte[] sequence ) { - super(sequence); - } - - /** - * For testing purposes only - * @param sequence - */ - protected DeBruijnVertex( final String sequence ) { - this(sequence.getBytes()); - } - - /** - * Get the kmer size for this DeBruijnVertex - * @return integer >= 1 - */ - @Ensures("result >= 1") - public int getKmerSize() { - return sequence.length; - } - - /** - * Get the string representation of the suffix of this DeBruijnVertex - * @return a non-null non-empty string - */ - @Ensures({"result != null", "result.length() >= 1"}) - public String getSuffixString() { - return new String(getSuffixAsArray()); - } - - /** - * Get the suffix byte of this DeBruijnVertex - * - * The suffix byte is simply the last byte of the kmer sequence, so if this is holding sequence ACT - 
* getSuffix would return T - * - * @return a byte - */ - public byte getSuffix() { - return sequence[getKmerSize() - 1]; - } - - /** - * Optimized version that returns a byte[] for the single byte suffix of this graph without allocating memory. - * - * Should not be modified - * - * @return a byte[] that contains 1 byte == getSuffix() - */ - @Ensures({"result != null", "result.length == 1", "result[0] == getSuffix()"}) - private byte[] getSuffixAsArray() { - return sufficesAsByteArray[getSuffix()]; - } - - /** - * {@inheritDoc} - */ - @Override - public byte[] getAdditionalSequence(boolean source) { - return source ? super.getAdditionalSequence(source) : getSuffixAsArray(); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java deleted file mode 100644 index 3ba85dd92..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java +++ /dev/null @@ -1,185 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. 
NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; - -import com.google.common.collect.MinMaxPriorityQueue; -import com.google.java.contract.Ensures; - -import java.io.Serializable; -import java.util.*; - -/** - * Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph. - * This is different from most graph traversals because we want to test paths from any source node to any sink node. - * - * User: ebanks, rpoplin, mdepristo - * Date: Mar 23, 2011 - */ -public class KBestPaths { - private final boolean allowCycles; - - /** - * Create a new KBestPaths finder that follows cycles in the graph - */ - public KBestPaths() { - this(true); - } - - /** - * Create a new KBestPaths finder - * - * @param allowCycles should we allow paths that follow cycles in the graph? - */ - public KBestPaths(final boolean allowCycles) { - this.allowCycles = allowCycles; - } - - protected static class MyInt { public int val = 0; } - - /** - * Compare paths such that paths with greater weight are earlier in a list - */ - protected static class PathComparatorTotalScore implements Comparator, Serializable { - @Override - public int compare(final Path path1, final Path path2) { - return path2.getScore() - path1.getScore(); - } - } - - /** - * @see #getKBestPaths(BaseGraph, int) retriving the best 1000 paths - */ - public List> getKBestPaths( final BaseGraph graph ) { - return getKBestPaths(graph, 1000); - } - - /** - * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) retriving the first 1000 paths - * starting from all source vertices and ending with all sink vertices - */ - public List> getKBestPaths( final BaseGraph graph, final int k ) { - return getKBestPaths(graph, k, graph.getSources(), graph.getSinks()); - } - - /** - * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with k=1000 - */ - public List> getKBestPaths( final BaseGraph graph, final Set sources, final Set sinks ) { - 
return getKBestPaths(graph, 1000, sources, sinks); - } - - /** - * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with k=1000 - */ - public List> getKBestPaths( final BaseGraph graph, final T source, final T sink ) { - return getKBestPaths(graph, 1000, source, sink); - } - - /** - * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with singleton source and sink sets - */ - public List> getKBestPaths( final BaseGraph graph, final int k, final T source, final T sink ) { - return getKBestPaths(graph, k, Collections.singleton(source), Collections.singleton(sink)); - } - - /** - * Traverse the graph and pull out the best k paths. - * Paths are scored via their comparator function. The default being PathComparatorTotalScore() - * @param graph the graph from which to pull paths - * @param k the number of paths to find - * @param sources a set of vertices we want to start paths with - * @param sinks a set of vertices we want to end paths with - * @return a list with at most k top-scoring paths from the graph - */ - @Ensures({"result != null", "result.size() <= k"}) - public List> getKBestPaths( final BaseGraph graph, final int k, final Set sources, final Set sinks ) { - if( graph == null ) { throw new IllegalArgumentException("Attempting to traverse a null graph."); } - - // a min max queue that will collect the best k paths - final MinMaxPriorityQueue> bestPaths = MinMaxPriorityQueue.orderedBy(new PathComparatorTotalScore()).maximumSize(k).create(); - - // run a DFS for best paths - for ( final T source : sources ) { - final Path startingPath = new Path(source, graph); - findBestPaths(startingPath, sinks, bestPaths, new MyInt()); - } - - // the MinMaxPriorityQueue iterator returns items in an arbitrary order, so we need to sort the final result - final List> toReturn = new ArrayList>(bestPaths); - Collections.sort(toReturn, new PathComparatorTotalScore()); - return toReturn; - } - - /** - * Recursive algorithm to find the K best paths 
in the graph from the current path to any of the sinks - * @param path the current path progress - * @param sinks a set of nodes that are sinks. Will terminate and add a path if the last vertex of path is in this set - * @param bestPaths a path to collect completed paths. - * @param n used to limit the search by tracking the number of vertices visited across all paths - */ - private void findBestPaths( final Path path, final Set sinks, final Collection> bestPaths, final MyInt n ) { - if ( sinks.contains(path.getLastVertex())) { - bestPaths.add(path); - } else if( n.val > 10000 ) { - // do nothing, just return, as we've done too much work already - } else { - // recursively run DFS - final ArrayList edgeArrayList = new ArrayList(path.getOutgoingEdgesOfLastVertex()); - Collections.sort(edgeArrayList, new BaseEdge.EdgeWeightComparator()); - for ( final E edge : edgeArrayList ) { - final T target = path.getGraph().getEdgeTarget(edge); - // make sure the edge is not already in the path - final boolean alreadyVisited = allowCycles ? path.containsEdge(edge) : path.containsVertex(target); - if ( ! alreadyVisited ) { - final Path newPath = new Path(path, edge); - n.val++; - findBestPaths(newPath, sinks, bestPaths, n); - } - } - } - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java deleted file mode 100644 index 6901d16ef..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java +++ /dev/null @@ -1,466 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; - -import com.google.java.contract.Ensures; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.apache.commons.lang.ArrayUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.smithwaterman.*; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; - -import java.util.*; - -/** - * A path thought a BaseGraph - * - * class to keep track of paths - * - * User: depristo - * Date: 3/19/13 - * Time: 2:34 PM - * - */ -public class Path { - private final static String SW_PAD = "NNNNNNNNNN"; - private final static Logger logger = Logger.getLogger(Path.class); - - // the last vertex seen in the path - protected final T lastVertex; - - // the list of edges comprising the path - private Set edgesAsSet = null; - protected final ArrayList edgesInOrder; - - // the scores for the path - protected final int totalScore; - - // the graph from which this path originated - protected final BaseGraph graph; - - // used in the bubble state machine to apply Smith-Waterman to the bubble sequence - // these values were chosen via optimization against the NA12878 knowledge base - public static final Parameters NEW_SW_PARAMETERS = new 
Parameters(20.0, -15.0, -26.0, -1.1); - - /** - * Create a new Path containing no edges and starting at initialVertex - * @param initialVertex the starting vertex of the path - * @param graph the graph this path will follow through - */ - public Path(final T initialVertex, final BaseGraph graph) { - if ( initialVertex == null ) throw new IllegalArgumentException("initialVertex cannot be null"); - if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); - if ( ! graph.containsVertex(initialVertex) ) throw new IllegalArgumentException("Vertex " + initialVertex + " must be part of graph " + graph); - - lastVertex = initialVertex; - edgesInOrder = new ArrayList<>(0); - totalScore = 0; - this.graph = graph; - } - - /** - * Convenience constructor for testing that creates a path through vertices in graph - */ - protected static Path makePath(final List vertices, final BaseGraph graph) { - Path path = new Path(vertices.get(0), graph); - for ( int i = 1; i < vertices.size(); i++ ) - path = new Path(path, graph.getEdge(path.lastVertex, vertices.get(i))); - return path; - } - - /** - * Create a new path with the same field values. - * - * @param p the template path. - * - * @throws NullPointerException if {@code p} is {@code null}. - */ - protected Path(final Path p) { - this.edgesInOrder = p.edgesInOrder; - this.lastVertex = p.lastVertex; - this.edgesAsSet = p.edgesAsSet; - this.totalScore = p.totalScore; - this.graph = p.graph; - } - - /** - * Create a new Path extending p with edge - * - * @param p the path to extend. - * @param edge the edge to extend path with. - * - * @throws IllegalArgumentException if {@code p} or {@code edge} are {@code null}, or {@code edge} is - * not part of {@code p}'s graph, or {@code edge} does not have as a source the last vertex in {@code p}. 
- */ - public Path(final Path p, final E edge) { - if ( p == null ) throw new IllegalArgumentException("Path cannot be null"); - if ( edge == null ) throw new IllegalArgumentException("Edge cannot be null"); - if ( ! p.graph.containsEdge(edge) ) throw new IllegalArgumentException("Graph must contain edge " + edge + " but it doesn't"); - if ( ! p.graph.getEdgeSource(edge).equals(p.lastVertex) ) { throw new IllegalStateException("Edges added to path must be contiguous."); } - - graph = p.graph; - lastVertex = p.graph.getEdgeTarget(edge); - edgesInOrder = new ArrayList<>(p.length() + 1); - edgesInOrder.addAll(p.edgesInOrder); - edgesInOrder.add(edge); - totalScore = p.totalScore + edge.getMultiplicity(); - } - - /** - * Length of the path in edges. - * - * @return {@code 0} or greater. - */ - public int length() { - return edgesInOrder.size(); - } - - /** - * Prepend a path with an edge. - * - * @param edge the extending edge. - * @param p the original path. - * - * @throws IllegalArgumentException if {@code p} or {@code edge} are {@code null}, or {@code edge} is - * not part of {@code p}'s graph, or {@code edge} does not have as a target the first vertex in {@code p}. - */ - public Path(final E edge, final Path p) { - if ( p == null ) throw new IllegalArgumentException("Path cannot be null"); - if ( edge == null ) throw new IllegalArgumentException("Edge cannot be null"); - if ( ! p.graph.containsEdge(edge) ) throw new IllegalArgumentException("Graph must contain edge " + edge + " but it doesn't"); - if ( ! 
p.graph.getEdgeTarget(edge).equals(p.getFirstVertex())) { throw new IllegalStateException("Edges added to path must be contiguous."); } - graph = p.graph; - lastVertex = p.lastVertex; - edgesInOrder = new ArrayList<>(p.length() + 1); - edgesInOrder.add(edge); - edgesInOrder.addAll(p.getEdges()); - totalScore = p.totalScore + edge.getMultiplicity(); - } - - /** - * Get the collection of edges leaving the last vertex of this path - * @return a non-null collection - */ - public Collection getOutgoingEdgesOfLastVertex() { - return getGraph().outgoingEdgesOf(getLastVertex()); - } - - /** - * Does this path contain the given edge - * @param edge the given edge to test - * @return true if the edge is found in this path - */ - public boolean containsEdge( final E edge ) { - if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); } - if ( edgesInOrder.isEmpty() ) return false; - - // initialize contains cache if necessary - if ( edgesAsSet == null ) edgesAsSet = new HashSet(edgesInOrder); - return edgesAsSet.contains(edge); - } - - /** - * Does this path contain the given vertex? - * - * @param v a non-null vertex - * @return true if v occurs within this path, false otherwise - */ - public boolean containsVertex(final T v) { - if ( v == null ) throw new IllegalArgumentException("Vertex cannot be null"); - - // TODO -- warning this is expensive. Need to do vertex caching - return getVertices().contains(v); - } - - /** - * Checks whether a given path is a suffix of this path. - * - * @param other the path to compare against. - * @throws IllegalArgumentException if other is null, or the come from - * different graphs. - * @return true if other is a suffix of this path. 
- */ - public boolean isSuffix(final Path other) { - if ( other == null ) throw new IllegalArgumentException("path cannot be null"); - if (other.getGraph() != this.getGraph()) throw new IllegalArgumentException("the other path most belong to the same path"); - if (!lastVertex.equals(other.lastVertex)) - return false; - final ListIterator myIt = edgesInOrder.listIterator(edgesInOrder.size()); - final ListIterator otherIt = other.edgesInOrder.listIterator(other.edgesInOrder.size()); - while (myIt.hasPrevious() && otherIt.hasPrevious()) - if (otherIt.previous() != myIt.previous()) - return false; - return !otherIt.hasPrevious(); - } - - /** - * Check that two paths have the same edges and total score - * @param path the other path we might be the same as - * @return true if this and path are the same - */ - protected boolean pathsAreTheSame(Path path) { - return totalScore == path.totalScore && edgesInOrder.equals(path.edgesInOrder); - } - - @Override - public String toString() { - final StringBuilder b = new StringBuilder("Path{score=" + totalScore + ", path="); - boolean first = true; - for ( final T v : getVertices() ) { - if ( first ) - first = false; - else - b.append(" -> "); - b.append(v.getSequenceString()); - } - b.append('}'); - return b.toString(); - } - - /** - * Get the graph of this path - * @return a non-null graph - */ - @Ensures("result != null") - public BaseGraph getGraph() { - return graph; - } - - /** - * Get the edges of this path in order - * @return a non-null list of edges - */ - @Ensures("result != null") - public List getEdges() { return edgesInOrder; } - - /** - * Get the list of vertices in this path in order defined by the edges of the path - * @return a non-null, non-empty list of vertices - */ - @Ensures({"result != null", "!result.isEmpty()"}) - public List getVertices() { - if ( getEdges().isEmpty() ) - return Collections.singletonList(lastVertex); - else { - final LinkedList vertices = new LinkedList(); - boolean first = true; - for 
( final E e : getEdges() ) { - if ( first ) { - vertices.add(graph.getEdgeSource(e)); - first = false; - } - vertices.add(graph.getEdgeTarget(e)); - } - return vertices; - } - } - - /** - * Get the total score of this path (bigger is better) - * @return a positive integer - */ - @Ensures("result >= 0") - public int getScore() { return totalScore; } - - /** - * Get the final vertex of the path - * @return a non-null vertex - */ - @Ensures("result != null") - public T getLastVertex() { return lastVertex; } - - /** - * Get the first vertex in this path - * @return a non-null vertex - */ - public T getFirstVertex() { - if (edgesInOrder.size() == 0) { - return lastVertex; - } else { - return getGraph().getEdgeSource(edgesInOrder.get(0)); - } - } - - /** - * The base sequence for this path. Pull the full sequence for source nodes and then the suffix for all subsequent nodes - * @return non-null sequence of bases corresponding to this path - */ - @Ensures({"result != null"}) - public byte[] getBases() { - if( getEdges().isEmpty() ) { return graph.getAdditionalSequence(lastVertex); } - - byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edgesInOrder.get(0))); - for( final E e : edgesInOrder ) { - bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e))); - } - return bases; - } - - /** - * Calculate the cigar elements for this path against the reference sequence - * - * @param refSeq the reference sequence that all of the bases in this path should align to - * @return a Cigar mapping this path to refSeq, or null if no reasonable alignment could be found - */ - public Cigar calculateCigar(final byte[] refSeq) { - if ( getBases().length == 0 ) { - // horrible edge case from the unit tests, where this path has no bases - return new Cigar(Arrays.asList(new CigarElement(refSeq.length, CigarOperator.D))); - } - - final byte[] bases = getBases(); - final Cigar nonStandard; - - final String paddedRef = SW_PAD + new String(refSeq) + SW_PAD; 
- final String paddedPath = SW_PAD + new String(bases) + SW_PAD; - final SmithWaterman alignment = new SWPairwiseAlignment( paddedRef.getBytes(), paddedPath.getBytes(), NEW_SW_PARAMETERS ); - - if ( isSWFailure(alignment) ) - return null; - - // cut off the padding bases - final int baseStart = SW_PAD.length(); - final int baseEnd = paddedPath.length() - SW_PAD.length() - 1; // -1 because it's inclusive - nonStandard = AlignmentUtils.trimCigarByBases(alignment.getCigar(), baseStart, baseEnd); - - if ( nonStandard.getReferenceLength() != refSeq.length ) { - nonStandard.add(new CigarElement(refSeq.length - nonStandard.getReferenceLength(), CigarOperator.D)); - } - - // finally, return the cigar with all indels left aligned - return leftAlignCigarSequentially(nonStandard, refSeq, getBases(), 0, 0); - } - - /** - * Make sure that the SW didn't fail in some terrible way, and throw exception if it did - */ - private boolean isSWFailure(final SmithWaterman alignment) { - // check that the alignment starts at the first base, which it should given the padding - if ( alignment.getAlignmentStart2wrt1() > 0 ) { - return true; -// throw new IllegalStateException("SW failure ref " + paddedRef + " vs. " + paddedPath + " should always start at 0, but got " + alignment.getAlignmentStart2wrt1() + " with cigar " + alignment.getCigar()); - } - - // check that we aren't getting any S operators (which would be very bad downstream) - for ( final CigarElement ce : alignment.getCigar().getCigarElements() ) { - if ( ce.getOperator() == CigarOperator.S ) - return true; - // soft clips at the end of the alignment are really insertions -// throw new IllegalStateException("SW failure ref " + paddedRef + " vs. " + paddedPath + " should never contain S operators but got cigar " + alignment.getCigar()); - } - - return false; - } - - /** - * Left align the given cigar sequentially. This is needed because AlignmentUtils doesn't accept cigars with more than one indel in them. 
- * This is a target of future work to incorporate and generalize into AlignmentUtils for use by others. - * @param cigar the cigar to left align - * @param refSeq the reference byte array - * @param readSeq the read byte array - * @param refIndex 0-based alignment start position on ref - * @param readIndex 0-based alignment start position on read - * @return the left-aligned cigar - */ - @Ensures({"cigar != null", "refSeq != null", "readSeq != null", "refIndex >= 0", "readIndex >= 0"}) - protected static Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { - final Cigar cigarToReturn = new Cigar(); - Cigar cigarToAlign = new Cigar(); - for (int i = 0; i < cigar.numCigarElements(); i++) { - final CigarElement ce = cigar.getCigarElement(i); - if (ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I) { - cigarToAlign.add(ce); - final Cigar leftAligned = AlignmentUtils.leftAlignSingleIndel(cigarToAlign, refSeq, readSeq, refIndex, readIndex, false); - for ( final CigarElement toAdd : leftAligned.getCigarElements() ) { cigarToReturn.add(toAdd); } - refIndex += cigarToAlign.getReferenceLength(); - readIndex += cigarToAlign.getReadLength(); - cigarToAlign = new Cigar(); - } else { - cigarToAlign.add(ce); - } - } - if( !cigarToAlign.isEmpty() ) { - for( final CigarElement toAdd : cigarToAlign.getCigarElements() ) { - cigarToReturn.add(toAdd); - } - } - - final Cigar result = AlignmentUtils.consolidateCigar(cigarToReturn); - if( result.getReferenceLength() != cigar.getReferenceLength() ) - throw new IllegalStateException("leftAlignCigarSequentially failed to produce a valid CIGAR. Reference lengths differ. Initial cigar " + cigar + " left aligned into " + result); - return result; - } - - - /** - * Tests that this and other have the same score and vertices in the same order with the same seq - * @param other the other path to consider. 
Cannot be null - * @return true if this and path are equal, false otherwise - */ - public boolean equalScoreAndSequence(final Path other) { - if ( other == null ) throw new IllegalArgumentException("other cannot be null"); - return getScore() == other.getScore() && equalSequence(other); - } - - /** - * Tests that this and other have the same vertices in the same order with the same seq - * @param other the other path to consider. Cannot be null - * @return true if this and path are equal, false otherwise - */ - public boolean equalSequence(final Path other) { - final List mine = getVertices(); - final List yours = other.getVertices(); - if ( mine.size() == yours.size() ) { // hehehe - for ( int i = 0; i < mine.size(); i++ ) - if ( ! mine.get(i).seqEquals(yours.get(i)) ) - return false; - } - return true; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Route.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Route.java deleted file mode 100644 index 1cf986c00..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Route.java +++ /dev/null @@ -1,285 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; - - -import java.util.List; -import java.util.ListIterator; - -/** - * Represents a route or path through a graph. - *

- * In contrast with a {@link Path}, a route keeps track of the - * path taken at furcations in order to speed up some path comparisions like the - * one implemented by {@link #isSuffix}. - *

- * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class Route extends Path { - - protected final Route previousRouteWithLastVertexThatIsForkOrJoin; - protected final boolean lastVertexIsForkOrJoin; - - /** - * Create a zero length route with a start in a particular vertex: - * - * @param initialVertex the first vertex of the route. - * @param graph the new route's graph. - * - * @throws IllegalArgumentException if {@code initialVertex} or {@code graph} are {@code null}. - * or if {@code initialVertex} does not belong to {@code graph}. - */ - public Route(final V initialVertex, final BaseGraph graph) { - super(initialVertex, graph); - previousRouteWithLastVertexThatIsForkOrJoin = null; - lastVertexIsForkOrJoin = graph.inDegreeOf(initialVertex) > 1; - } - - @Override - public boolean equals(final Object other) { - if (other == null) return false; - if (other == this) return true; - if (! (other instanceof Route)) return false; - @SuppressWarnings("unchecked") - final Route otherRoute = (Route) other; - return otherRoute.length() == this.length() && isSuffix(otherRoute); - } - - /** - * Extends a route into a new instance. - * - * @param prefix the route to extend. - * @param nextVertex the vertex to extend the route to. - * - * @throws IllegalArgumentException if {@code prefix} is {@code null} or {@code nextVertex} is {@code null} - * or {@code nextVertex} does not belong to {@code prefix}'s graph or there is no edge that in the graph - * that would connect {@code prefix}'s last vertex with {@code nextVertex} directly. - */ - public Route(final Route prefix, final V nextVertex) { - this(prefix,resolveSuffixEdge(prefix,nextVertex)); - } - - - /** - * Extends a route into a new instance. - * - * @param prevVertex the vertex to extend the route to. - * @param suffix the route to extend. 
- * - * @throws IllegalArgumentException if {@code suffix} is {@code null} or {@code prevVertex} is {@code null} - * or {@code prevVertex} does not belong to {@code suffix}'s graph or there is no edge that in the graph - * that would connect {@code suffix}'s first vertex with {@code prevVertex} directly. - */ - public Route(final V prevVertex, final Route suffix) { - this(resolvePrefixEdge(prevVertex, suffix),suffix); - } - - /** - * Resolves the prefix edge as required by {@link Route(V,Route)}. - */ - private static E resolvePrefixEdge(final V prevVertex, final Route suffix) { - if (prevVertex == null) throw new NullPointerException(); - if (!suffix.getGraph().containsVertex(prevVertex)) throw new IllegalArgumentException(); - final E result = suffix.getGraph().getEdge(prevVertex,suffix.getFirstVertex()); - if (result == null) - throw new IllegalArgumentException("there is no such edge in the graph"); - return result; - } - - /** - * Resolves the suffix edge as required by {@link Route(Route,V)} - */ - private static E resolveSuffixEdge(final Route prefix, final V nextVertex) { - if (nextVertex == null) throw new IllegalArgumentException(); - if (!prefix.getGraph().containsVertex(nextVertex)) throw new IllegalArgumentException(); - final E result = prefix.getGraph().getEdge(prefix.getLastVertex(),nextVertex); - if (result == null) - throw new IllegalArgumentException("there is no such edge in the graph"); - return result; - } - - /** - * Extends a route by prefixing an edge. - * - * @param initialEdge the extending edge. - * @param suffix the original path. - * - * @throws IllegalArgumentException if {@code suffix} or {@code initialEdge} are {@code null}, or {@code initialEdge} is - * not part of {@code suffix}'s graph, or {@code initialEdge} does not have as a target the first vertex in {@code suffix}. 
- */ - public Route(final E initialEdge, final Route suffix) { - super(initialEdge,suffix); - final V firstVertex = getFirstVertex(); - if(suffix.length() == 0) { - lastVertexIsForkOrJoin = suffix.lastVertexIsForkOrJoin || graph.outDegreeOf(firstVertex) > 1; - previousRouteWithLastVertexThatIsForkOrJoin = graph.inDegreeOf(firstVertex) > 1 ? new Route<>(firstVertex,graph) : null; - } else { - lastVertexIsForkOrJoin = suffix.lastVertexIsForkOrJoin; - if (suffix.previousRouteWithLastVertexThatIsForkOrJoin != null) - previousRouteWithLastVertexThatIsForkOrJoin = new Route<>(initialEdge,suffix.previousRouteWithLastVertexThatIsForkOrJoin); - else - previousRouteWithLastVertexThatIsForkOrJoin = graph.outDegreeOf(firstVertex) > 1 ? - new Route<>(new Route<>(firstVertex,graph),edgesInOrder.get(0)) : - graph.inDegreeOf(firstVertex) > 1 ? new Route<>(firstVertex,graph) : null; - } - } - - /** - * Create copy of an existing route. - * @param route the route to copy - * - * @throws NullPointerException if {@code route} is {@code null}. - */ - protected Route(final Route route) { - super(route); - lastVertexIsForkOrJoin = route.lastVertexIsForkOrJoin; - previousRouteWithLastVertexThatIsForkOrJoin = route.previousRouteWithLastVertexThatIsForkOrJoin; - } - - /** - * Create a new Route extending another one with an edge - * - * @param route the route to extend. - * @param edge the edge to extend the route with. - * - * @throws IllegalArgumentException if {@code route} or {@code edge} are {@code null}, or {@code edge} is - * not part of {@code route}'s graph, or {@code edge} does not have as a source the last vertex in {@code route}. - */ - public Route(final Route route, final E edge) { - super(route, edge); - lastVertexIsForkOrJoin = graph.outDegreeOf(route.lastVertex) > 1 || graph.inDegreeOf(lastVertex) > 1; - previousRouteWithLastVertexThatIsForkOrJoin = route.lastVertexIsForkOrJoin ? 
route : route.previousRouteWithLastVertexThatIsForkOrJoin; - } - - @Override - public boolean isSuffix(final Path other) { - if (other == this) - return true; - else if (other == null) - throw new IllegalArgumentException("other path must not be null"); - else if (getGraph() != other.getGraph()) - throw new IllegalArgumentException("other path must be part of the same graph"); - else if (other instanceof Route) - return isRouteSuffix((Route)other); - else - return super.isSuffix(other); - } - - @Override - public String toString() { - return super.toString().replace("Path{", "Route{"); - } - - /** - * Faster version when comparing with a route. - */ - protected boolean isRouteSuffix(final Route other) { - if (other.getGraph() != this.getGraph()) - throw new IllegalArgumentException("you cannot compare routes on different graphs"); - else if (lastVertex != other.lastVertex) // obvious case. - return false; - else if (this.previousRouteWithLastVertexThatIsForkOrJoin == null - && other.previousRouteWithLastVertexThatIsForkOrJoin != null) // I am shorter or different path for sure. - return false; - else if (this.edgesInOrder.size() < other.edgesInOrder.size()) // I am shorter regardless of path, no way Jose! 
- return false; - else if (this.previousRouteWithLastVertexThatIsForkOrJoin == null || other.previousRouteWithLastVertexThatIsForkOrJoin == null) { - final ListIterator myEdges = edgesInOrder.listIterator(edgesInOrder.size()); - final ListIterator otherEdges = other.edgesInOrder.listIterator(other.edgesInOrder.size()); - while (otherEdges.hasPrevious()) - if (myEdges.previous() != otherEdges.previous()) - return false; - return true; - } else - return (other.previousRouteWithLastVertexThatIsForkOrJoin == this.previousRouteWithLastVertexThatIsForkOrJoin) - || (previousRouteWithLastVertexThatIsForkOrJoin.lastVertex == other.previousRouteWithLastVertexThatIsForkOrJoin.lastVertex - && previousRouteWithLastVertexThatIsForkOrJoin.isRouteSuffix(other.previousRouteWithLastVertexThatIsForkOrJoin)); - } - - /** - * Checks whether the last vertex in the route is a fork or a joining vertex. - * @return {@code true} iff so. - */ - public boolean lastVertexIsForkOrJoin() { - return lastVertexIsForkOrJoin; - } - - /** - * Returns the longest prefix route that has as a last vertex a join or furcation vertex. - * - * @return never {@code null}. - */ - public Route getPrefixRouteWithLastVertexThatIsForkOrJoin() { - return previousRouteWithLastVertexThatIsForkOrJoin; - } - - - - /** - * Splice out the first few vertices of the route. - * - * @param length how many vertices to splice out - * @return a new route without those spliced vertices. - * - * @throws IllegalArgumentException if {@code length} is equal to the route's length or greater or if it is negative. - * Notice that non-vertex route are no legal routes. 
- */ - public Route splicePrefix(final int length) { - if (length == 0) - return this; - if (length >= length()) - throw new IllegalArgumentException("prefix slicing to long"); - if (length < 0) - throw new IllegalArgumentException("prefix cannot be negative"); - - final List resultEdges = getEdges().subList(length,length()); - Route result = new Route<>(graph.getEdgeSource(resultEdges.get(0)),this); - for (final E edge : resultEdges) - result = new Route<>(result,edge); - return result; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java deleted file mode 100644 index 55ff2f978..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java +++ /dev/null @@ -1,1016 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeRoute; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; -import org.broadinstitute.sting.utils.SequenceComplexity; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.collections.CountSet; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.haplotype.Haplotype; - -import java.io.File; -import java.io.PrintStream; -import java.util.*; - -/** - * - * Threading graph subclass used to "re-thread" haplotypes instead of reads. - * - * Created with IntelliJ IDEA. - * User: valentin - * Date: 8/23/13 - * Time: 2:42 PM - * To change this template use File | Settings | File Templates. - */ -public class HaplotypeGraph extends ReadThreadingGraph { - - /** - * Maximum repeat unit length considered when looking for repeats that should not be considered as - * possible read anchor places along the reference path. - */ - protected static final int DEFAULT_MAX_REPEAT_UNIT_LENGTH = 4; - - /** - * Minimum repeat length to consider a region a repeat that should not be considered as possibl read anchor - * places along the reference path. - */ - protected static final int DEFAULT_MIN_REPEAT_LENGTH_IN_UNITS = 6; - - /** - * Reference haplotype - */ - private Haplotype referenceHaplotype; - - /** - * Reference haplotype bases - */ - private byte[] referenceBases; - - /** - * Sets of haplotypes in the graph. - */ - private Set haplotypes; - - /** - * Route of haplotypes in the graph. - */ - private HaplotypeRoute referenceRoute; - - /** - * Set of vertices along the reference route. - */ - private Set referenceVertices; - - /** - * Holds haplotype routes by haplotype. 
- */ - private Map haplotypeRouteByHaplotype; - - /** - * Holds haplotypes by contained vertices. - */ - private Map> haplotypesByVertex; - - /** - * Reference to the logger for this class. - */ - private static final Logger logger = Logger.getLogger(HaplotypeGraph.class); - - /** - * What is the maximum STR unit length. - */ - private int maxRepeatUnitLength = DEFAULT_MAX_REPEAT_UNIT_LENGTH; - - /** - * What is the minimum length in units for a STR. - */ - private int minRepeatLengthInUnits = DEFAULT_MIN_REPEAT_LENGTH_IN_UNITS; - - - /** - * Indicates that the haplotype data structures need update previous to querying. - */ - private boolean needToUpdateHaplotypeStructures = true; - private Set anchorableVertices; - - /** - * Constructs a haplotype graph from a describing string. - * - *

Used for testing

- * @param string the string representation of the haplotype graph. - */ - public HaplotypeGraph(final String string) { - super(string); - haplotypes = new LinkedHashSet<>(10); - referenceVertices = Collections.emptySet(); - } - - /** - * Constructs a new haplotype graph given its kmerSize. - * - * @param kmerSize 1 or greater, the targeted kmerSize - * - * @throws IllegalArgumentException if {@code kmerSize} is 0 or negative. - */ - public HaplotypeGraph(final int kmerSize) { - super(kmerSize); - haplotypes = new LinkedHashSet<>(10); - referenceVertices = Collections.emptySet(); - } - - - /** - * Set of vertices along the reference haplotype path. - * - * @return never {@code} null but perhaps empty. - */ - public Set getReferenceVertices() { - updateHaplotypeStructures(); - return referenceVertices; - } - - /** - * Returns the haplotype route given an haplotype. - * @param haplotype query haplotype - * @throws NullPointerException if {@code haplotype} is {@code null}. - * @throws IllegalArgumentException if {@code haplotype} is not a supported haplotype in the graph. - * @return never {@code null}. - */ - public HaplotypeRoute getHaplotypeRoute(final Haplotype haplotype) { - updateHaplotypeStructures(); - if (!haplotypes.contains(haplotype)) - throw new IllegalArgumentException("input haplotype must be part of the haplotype graph haplotype set"); - HaplotypeRoute result = haplotypeRouteByHaplotype.get(haplotype); - if (result == null) - haplotypeRouteByHaplotype.put(haplotype,result = buildHaplotypeRoute(haplotype)); - return result; - } - - /** - * Creates an haplotype route. - * @param haplotype the target haplotype - * @return {@code null} if there is no such a route in the graph. - */ - private HaplotypeRoute buildHaplotypeRoute(final Haplotype haplotype) { - final Route route = RouteFinder.findRoute(this,haplotype.getBases()); - if (route == null) - return null; - else - return new HaplotypeRoute(route); - } - - /** - * Bases along the reference path. 
- * - * @return {@code null} if there is no reference. - */ - @SuppressWarnings("unused") - public byte[] getReferenceBases() { - updateHaplotypeStructures(); - return referenceBases; - } - - /** - * Returns the reference haplotype - * @return {@code null} if there is no such a reference. - */ - public Haplotype getReferenceHaplotype() { - updateHaplotypeStructures(); - return referenceHaplotype; - } - - - - /** - * Construct a haplotype graph given the haplotype list and the elected kmerSize. - * - * @param haplotypes whose path to add to the graph. - * @param kmerSize the kmerSize use to compose the graph. - */ - public HaplotypeGraph(final int kmerSize, final List haplotypes) { - super(kmerSize); - referenceHaplotype = findReferenceHaplotypeOrFail(haplotypes); - this.haplotypes = new LinkedHashSet<>(haplotypes); - addSequence("anonymous", referenceHaplotype.getBases(), null, true); - for (final Haplotype h : haplotypes) { - if (h.isReference()) - continue; - if (h.length() < kmerSize) { - Utils.warnUser(logger, "haplotype shorter than kmerSize " + h.length() + " < " + kmerSize + " will be dropped"); - } else - addSequence("anonymous", h.getBases(), null, false); - - } - buildGraphIfNecessary(); - } - - /** - * Returns the reference haplotype within the input collection. - * - * @param haplotypes the query haplotype set. - * @throws IllegalArgumentException if there is no reference haplotype. - * @throws NullPointerException if {@code haplotypes} is {@code null} or contains some {@code null} value. - * @return never {@code} null, a haplotype that is reference. - */ - private Haplotype findReferenceHaplotypeOrFail(final List haplotypes) { - for (final Haplotype h : haplotypes) - if (h.isReference()) - return h; - throw new IllegalArgumentException("no reference haplotype present"); - } - - /** - * Constructs a new haplotype graph given a template read-threading graph and set of haplotypes - * - * @param template the template read-threading graph. 
- * @param haplotypes the haplotype set to consider - */ - public HaplotypeGraph(final ReadThreadingGraph template, final List haplotypes) { - this(template.getKmerSize()); - referenceHaplotype = findReferenceHaplotypeOrFail(haplotypes); - this.haplotypes = new HashSet<>(haplotypes); - template.buildGraphIfNecessary(); - uniqueKmers = new HashMap<>(); - nonUniqueKmers = new HashSet<>(); - // Copy vertices over. - addVertices(template.vertexSet()); - // Copy edges over. - for (final MultiSampleEdge edge : template.edgeSet()) { - final MultiSampleEdge newEdge = addEdge(template.getEdgeSource(edge), template.getEdgeTarget(edge)); - newEdge.setIsRef(newEdge.isRef()); - newEdge.setMultiplicity(edge.getMultiplicity()); - } - // Copy kmer lookup tables: - uniqueKmers.putAll(template.uniqueKmers); - nonUniqueKmers.addAll(template.nonUniqueKmers); - alreadyBuilt = true; - } - - /** - * Update the haplotype data structures based in current edges and vertices. - */ - private void updateHaplotypeStructures() { - if (!needToUpdateHaplotypeStructures) - return; - needToUpdateHaplotypeStructures = false; - haplotypeRouteByHaplotype = new LinkedHashMap<>(haplotypes.size()); - final Iterator haplotypeIterator = haplotypes.iterator(); - final Set nonFoundHaplotypes = new HashSet<>(haplotypes.size()); - while (haplotypeIterator.hasNext()) { - final Haplotype haplotype = haplotypeIterator.next(); - final HaplotypeRoute haplotypeRoute = buildHaplotypeRoute(haplotype); - if (haplotypeRoute == null) { - haplotypeIterator.remove(); - nonFoundHaplotypes.add(haplotype); - if (haplotype.isReference()) { - referenceHaplotype = null; - referenceRoute = null; - referenceVertices = Collections.emptySet(); - referenceBases = null; - } - } else { - if (haplotype.isReference()) { - referenceHaplotype = haplotype; - referenceRoute = haplotypeRoute; - referenceVertices = haplotypeRoute.vertexSet(); - referenceBases = haplotypeRoute.getBases(); - } - haplotypeRouteByHaplotype.put(haplotype, 
haplotypeRoute); - } - } - haplotypesByVertex = buildHaplotypesByVertex(); - anchorableVertices = calculateAnchorableVertexSet(); - logger.debug("some haplotypes do not have a path across the haplotype graph " + nonFoundHaplotypes.size()); - } - - /** - * Builds a map for each vertex to all the haplotype routes that pass thru it. - */ - private Map> buildHaplotypesByVertex() { - final Map> result = new HashMap<>(referenceVertices.size()); - final Set allHaplotypeRoutes = new LinkedHashSet<>(haplotypeRouteByHaplotype.values()); - for (final HaplotypeRoute haplotypeRoute : allHaplotypeRoutes) { - final Set singleton = Collections.singleton(haplotypeRoute); - for (final MultiDeBruijnVertex vertex : haplotypeRoute.vertexSet()) - if (!result.containsKey(vertex)) - result.put(vertex, singleton); - else { - final Set currentHrs = result.get(vertex); - if (currentHrs.size() == haplotypes.size() - 1) - result.put(vertex, allHaplotypeRoutes); - else if (currentHrs.size() == 1) { - final Set newHrs = new LinkedHashSet<>(allHaplotypeRoutes.size()); - newHrs.addAll(currentHrs); - newHrs.add(haplotypeRoute); - result.put(vertex, newHrs); - } else - currentHrs.add(haplotypeRoute); - } - } - return result; - } - - - /** - * Debug convenient method to print a graph into a file in the .dot format. - * @param fileName name of the output file. - * @throws NullPointerException if {@code fileName} is {@code null}. - */ - public void printGraph(final String fileName) { - super.printGraph(new File(fileName), 10000); - } - - - - - @Override - public void printGraph(final PrintStream graphWriter, final boolean writeHeader, final int pruneFactor) { - if ( writeHeader ) - graphWriter.println("digraph assemblyGraphs {"); - - - for( final MultiSampleEdge edge : edgeSet() ) { - graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? 
"style=dotted,color=grey," : "") + "label=\"" + edge.getDotLabel() + "\"];"); - if( edge.isRef() ) { - graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); - } - } - - for( final MultiDeBruijnVertex v : vertexSet() ) - graphWriter.println("\t" + v.toString() + " [label=\"" + v.getId() + ":" + new String(getAdditionalSequence(v)) + v.additionalInfo() + "\",shape=box]"); - - if ( writeHeader ) - graphWriter.println("}"); - } - - - @Override - public Pair findStart(final SequenceForKmers seqForKmers) { - return getOrCreateKmerVertex(seqForKmers.sequence, 0, true); - } - - /** - * Checks whether the graph has some sources or sink vertices that are not reference vertices. - * - * @return {@code true} iff so. - */ - public boolean hasNonReferenceEnds() { - for (final MultiDeBruijnVertex end : getSources()) - if (!isReferenceNode(end)) return true; - for (final MultiDeBruijnVertex end : getSinks()) - if (!isReferenceNode(end)) return true; - return false; - } - - /** - * Merges vertices that share exactly the same set of outgoing vertices. - *

- * This is done in reversed topological order and since the graph is a DAG it ensure to return a graph - * that such merge is any longer possible. I.e. there is no need to run this method more than once. - *

- * Notice that we will a record of distinct unique kmers that map to the same vertex that map now to the same - * merged vertex. Thus if vertices {@code X and Y} are merged then {@code findKmer(X.sequence) == findKmer(Y.sequence)}. - *

- * Examples: - *

    - *
  • - * {@code AAA -> AAC, CAA -> AAC} would become {@code NAA -> AAC}. - *
  • - * {@code AAA -> AAC, AAA -> AAG, CAA -> AAC, CAA -> AAG} would become {@code NAA -> AAG, NAA -> AAG} - *
  • - * {@code AAA -> AAC, AAA -> AAG, CAA -> AAC} would not change as {@code AAA} and {@code CAA} - * do not share {@code AAG} as outgoing vertex. - *
  • - *
  • - * {@code AAA -> AAC, AAC -> ACA, CAA -> AAC, GAC -> ACA } would become {@code NAA -> NAC, NAC -> ACA}. - *
  • - *
- */ - public void mergeCommonChains() { - final int vertexCount = vertexSet().size(); - final Set refVertices = new HashSet<>(vertexCount); - final Map indexByVertex = new HashMap<>(vertexCount); - final int[] pendingChildren = new int[vertexCount]; - final Deque readyVertices = new LinkedList<>(); - final Set merged = new HashSet<>(1 + vertexCount / 10 ); - - // Initialize traversal data structures. - mergeCommonChainsInitialize(refVertices, indexByVertex, pendingChildren, readyVertices); - - // Traversal in inverted topological order where children nodes are processed before their parents. - while (!readyVertices.isEmpty()) { - final MultiDeBruijnVertex currentVertex = readyVertices.remove(); - if (merged.contains(currentVertex)) continue; - - final Set mergeSet = new HashSet<>(2); - MultiDeBruijnVertex refVertex = mergeCommonChainsComposeMergeSet(refVertices, currentVertex, mergeSet); - mergeVertices(refVertex,mergeSet,indexByVertex,pendingChildren,readyVertices); - merged.addAll(mergeSet); - } - needToUpdateHaplotypeStructures = true; - } - - /** - * Given a seed vertex, determines the mergin set of nodes that will be collapsed into one. - * - * @param refVertices reference path vertices - * @param currentVertex current vertex. - * @param mergeSet where to store the final merging set. - * @return the reference node if present that needs to be preserved as such. It might be {@code null} - */ - private MultiDeBruijnVertex mergeCommonChainsComposeMergeSet(final Set refVertices, - final MultiDeBruijnVertex currentVertex, - final Set mergeSet) { - final boolean currentIsSource = isSource(currentVertex); - final Set children = outgoingVerticesOf(currentVertex); - if (children.size() == 0) - mergeSet.add(currentVertex); - else - for (final MultiDeBruijnVertex child : children) - mergeSet.addAll(incomingVerticesOf(child)); - - MultiDeBruijnVertex refVertex = refVertices.contains(currentVertex) ? 
currentVertex : null; - final Iterator candidatesIt = mergeSet.iterator(); - while (candidatesIt.hasNext()) { - final MultiDeBruijnVertex candidate = candidatesIt.next(); - if (candidate == currentVertex) continue; - if (isSource(candidate) != currentIsSource) { - candidatesIt.remove(); - continue; - } - if (currentIsSource && !candidate.getSequenceString().equals(currentVertex.getSequenceString())) { - candidatesIt.remove(); - continue; - } - if (!currentIsSource && candidate.getSuffix() != currentVertex.getSuffix()) { - candidatesIt.remove(); - continue; - } - final Set candidateChildren = outgoingVerticesOf(candidate); - if (candidateChildren.size() != children.size()) - candidatesIt.remove(); - else { - boolean removed = false; - for (final MultiDeBruijnVertex candidateChild : candidateChildren) - if (!children.contains(candidateChild)) { - candidatesIt.remove(); - removed = true; - break; - } - if (refVertex == null && !removed && refVertices.contains(candidate)) refVertex = candidate; - } - } - return refVertex; - } - - /** - * Initialize data-structures for {@link #mergeCommonChains} - * - * @param refVertices will contain reference path vertices. - * @param indexByVertex map vertex -> index in {@code pendingChildren}. - * @param pendingChildren number of children of a node that have not yet been processed. - * @param readyVertices vertices that are ready to be processed (all children have been processed). - */ - private void mergeCommonChainsInitialize(final Set refVertices, - final Map indexByVertex, - final int[] pendingChildren, - final Deque readyVertices) { - int nextIndex = 0; - for (final MultiDeBruijnVertex v : vertexSet()) { - indexByVertex.put(v,nextIndex++); - if (isReferenceNode(v)) refVertices.add(v); - } - - for (final Map.Entry entry : indexByVertex.entrySet()) - if ((pendingChildren[entry.getValue()] = outDegreeOf(entry.getKey())) == 0) - readyVertices.add(entry.getKey()); - } - - // Perform the actual merge. 
- private void mergeVertices(final MultiDeBruijnVertex refVertex, final Collection vertices, final Map indexByVertex, final int[] pendingChildrenCounts, final Deque ready) { - if (vertices.size() == 0) - throw new IllegalArgumentException(); - final MultiDeBruijnVertex vertexToKeep = refVertex == null ? vertices.iterator().next() : refVertex; - final byte[] sequence = vertexToKeep.getSequence(); - final Set uniqueKmersToUpdate = new HashSet<>(vertices.size()); - final Set parentVertices = new HashSet<>(inDegreeOf(vertexToKeep) * 2); - parentVertices.addAll(incomingVerticesOf(vertexToKeep)); - for (final MultiDeBruijnVertex p : parentVertices) - if (--pendingChildrenCounts[indexByVertex.get(p)] == 0) - ready.add(p); - - final Kmer mergedKmer = new Kmer(sequence); - if (uniqueKmers.containsKey(mergedKmer)) { - uniqueKmersToUpdate.add(new Kmer(mergedKmer.bases().clone())); - uniqueKmers.remove(mergedKmer); - } - boolean foundMergedVertex = false; - for (final MultiDeBruijnVertex v : vertices) - if (v == vertexToKeep) - foundMergedVertex = true; - else { - final byte[] seq = v.getSequence(); - final Kmer kmer = new Kmer(seq); - if (uniqueKmers.containsKey(kmer)) { - uniqueKmersToUpdate.add(kmer); - uniqueKmers.remove(kmer); - } - if (sequence.length != seq.length) throw new IllegalArgumentException("mismatched sizes " + sequence.length + " != " - + seq.length + " " + new String(sequence) + " " + new String(seq)); - for (int i = sequence.length - 1; i >= 0; i--) { - - if (sequence[i] != seq[i]) sequence[i] = 'N'; - } - for (final MultiDeBruijnVertex p : incomingVerticesOf(v)) { - if (--pendingChildrenCounts[indexByVertex.get(p)] == 0) - ready.add(p); - if (!parentVertices.contains(p)) { - parentVertices.add(p); - final MultiSampleEdge e = getEdge(p,v); - addEdge(p,vertexToKeep,new MultiSampleEdge(e.isRef(),e.getMultiplicity(),1)); - } else { - getEdge(p,vertexToKeep).incMultiplicity(getEdge(p,v).getMultiplicity()); - } - } - removeVertex(v); - } - if 
(!foundMergedVertex) - throw new IllegalArgumentException("merged vertex must be contained in the input set"); - for (final Kmer kmer : uniqueKmersToUpdate) - uniqueKmers.put(kmer,vertexToKeep); - } - - public Map uniqueKmerMap() { - return Collections.unmodifiableMap(uniqueKmers); - } - - @Override - public boolean equals(Object other) { - return (other instanceof HaplotypeGraph) && equals((HaplotypeGraph)other); - } - - - /** - * Simple debug representation of the haplotype graph. - * @return never {@code null} - */ - @Override - public String toString() { - return getClass().getSimpleName() + "[ks=" + kmerSize + "](vs=" + vertexSet().size() + "," + edgeSet().size() + "){...}"; - } - - /** - * Returns set of valid haplotypes. - * @return never {@code null} but perhaps empty. - */ - public Set getHaplotypes() { - updateHaplotypeStructures(); - return haplotypes; - } - - /** - * Returns a map between valid haplotypes and corresponding routes in the graph. - * @return never {@code null} but perhaps empty. - */ - public Map getHaplotypeRouteMap() { - updateHaplotypeStructures(); - return haplotypeRouteByHaplotype; - } - - /** - * Returns set of haplotype routes that enclose a vertex. - * @param vertex the query vertex. - * @return never {@code null} but perhaps empty set. - */ - public Set getEnclosingHaplotypeRoutes(final MultiDeBruijnVertex vertex) { - updateHaplotypeStructures(); - if (haplotypesByVertex == null) - return Collections.emptySet(); - final Set result = haplotypesByVertex.get(vertex); - if (result == null) - return Collections.emptySet(); - else - return result; - } - - /** - * Returns the reference route - * - * @return {@code null} if there is no valid reference haplotype. - */ - public HaplotypeRoute getReferenceRoute() { - updateHaplotypeStructures(); - return referenceRoute; - } - - /*********************************************** - * deep equals implementation, used in testing. 
* - ***********************************************/ - - /** - * Compare two haplotype threading graphs and it determines whether they have the same structure. - *

- * This method goes a long way to figure out the equality and no equality of both graphs. However there - * are "pathological" case in where it might fail to see a difference. This is due to the fact that there - * is no guarantee of the uniqueness of sequences at source vertex. - *

- * If there are more than one source vertex with the same sequence it try to match source vertices between both - * graphs matching all possible paths emanating from every pair of sources. - * - *

Note: in practice this is only used in for testing purposes - * - * @param other the other graph to compare against. - * @return never {@code null}. - */ - public boolean equals(HaplotypeGraph other) { - updateHaplotypeStructures(); - if (other == null) return false; - if (other == this) return true; - - if (!equals$ReferencePaths(this, other)) return false; - final Map thisSourcesBySequence = equalsBuildSourceBySequenceMap(this); - final Map otherSourcesBySequence = equalsBuildSourceBySequenceMap(other); - if (thisSourcesBySequence.size() != otherSourcesBySequence.size()) return false; - final List unmatchedLeft = new LinkedList<>(); - final List unmatchedRight = new LinkedList<>(); - final List> sourcePairs = equals$matchVertexBySequenceMaps(thisSourcesBySequence,otherSourcesBySequence,unmatchedLeft,unmatchedRight); - if (unmatchedLeft.size() > 0 || unmatchedRight.size() > 0) return false; - - - final Deque> pending = new LinkedList<>(sourcePairs); - final Set visited = new HashSet<>(vertexSet().size()); - while (!pending.isEmpty()) { - final Pair pair = pending.removeFirst(); - final MultiDeBruijnVertex leftVertex = pair.getFirst(); - final MultiDeBruijnVertex rightVertex = pair.getSecond(); - final List> childrenPairs = equals$matchVertexBySequenceMaps(equalsBuildChildrenBySuffixMap(this, leftVertex), - equalsBuildChildrenBySuffixMap(other, rightVertex), unmatchedLeft, unmatchedRight); - if (unmatchedLeft.size() > 0 || unmatchedRight.size() > 0) return false; - for (final Pair childPair : childrenPairs) { - final MultiDeBruijnVertex leftChild = childPair.getFirst(); - final MultiDeBruijnVertex rightChild = childPair.getSecond(); - final boolean leftVisited = visited.add(leftChild); - final boolean rightVisited = visited.add(rightChild); - if (leftVisited != rightVisited) return false; // visited before in different matchings. 
- if (leftVisited) continue; - pending.add(childPair); - visited.add(childPair.getFirst()); - visited.add(childPair.getSecond()); - } - } - return true; - } - - // Note: in practice only use by equals(HaplotypeGraph) for testing purposes. - private boolean equals$ReferencePaths(final HaplotypeGraph g1, final HaplotypeGraph g2) { - MultiDeBruijnVertex refVertex1 = g1.getReferenceSourceVertex(); - MultiDeBruijnVertex refVertex2 = g2.getReferenceSourceVertex(); - if (refVertex1 == null && refVertex2 == null) - return true; - if (refVertex1 == null || refVertex2 == null) - return false; - - if (!refVertex1.getSequenceString().equals(refVertex2.getSequenceString())) - return false; - - while (refVertex1 != null && refVertex2 != null) { - if (refVertex1.getSuffix() != refVertex2.getSuffix()) return false; - refVertex1 = g1.getNextReferenceVertex(refVertex1); - refVertex2 = g2.getNextReferenceVertex(refVertex2); - - } - return refVertex1 == refVertex2; - } - - // Note: in practice only use by equals(HaplotypeGraph) for testing purposes. - private static Map equalsBuildChildrenBySuffixMap(final HaplotypeGraph graph, - final MultiDeBruijnVertex vertex) { - final Map result = new HashMap<>(); - for (final MultiDeBruijnVertex child : graph.outgoingVerticesOf(vertex)) - result.put(new String(new byte[]{child.getSuffix()}), child); - return result; - } - - // Note: in practice only use by equals(HaplotypeGraph) for testing purposes. 
- private static List> equals$matchVertexBySequenceMaps( - final Map left, final Map right, - final Collection unmatchedLeft, final Collection unmatchedRight) { - final List> result = new LinkedList<>(); - for (final Map.Entry leftEntry : left.entrySet()) - if (right.containsKey(leftEntry.getKey())) - result.add(new Pair<>(leftEntry.getValue(),right.get(leftEntry.getKey()))); - else - unmatchedLeft.add(leftEntry.getValue()); - for (final Map.Entry rightEntry : right.entrySet()) - if (!left.containsKey(rightEntry.getKey())) - unmatchedRight.add(rightEntry.getValue()); - return result; - } - - // Note: in practice only use by equals(HaplotypeGraph) for testing purposes. - private static Map equalsBuildSourceBySequenceMap(final HaplotypeGraph other) { - - final Set sources = other.getSources(); - final Map result = new HashMap<>(sources.size()); - final Map> collisions = new HashMap<>(sources.size()); - for (final MultiDeBruijnVertex v : sources) { - final String sequence = v.getSequenceString(); - if (result.containsKey(sequence)) { // we need to handle collision due to lack of uniqueness. 
- final List collisionList; - if (collisions.containsKey(sequence)) - collisionList = collisions.get(sequence); - else - collisions.put(sequence,collisionList = new LinkedList<>()); - collisionList.add(v); - } else { - result.put(sequence,v); - } - } - if (collisions.size() == 0) - return result; - for (final String s : collisions.keySet()) { - result.remove(s); - final List vertices = collisions.remove(s); - int number = 0; - final List> extendedSequences = new LinkedList<>(); - for (final MultiDeBruijnVertex vertice : vertices) - extendedSequences.add(new Pair<>(vertice, equalsCollisionResolverExtendedSequence(other, vertice))); - Collections.sort(extendedSequences,new Comparator>(){ - public int compare(final Pair p1, final Pair p2) { - return p1.getSecond().compareTo(p2.getSecond()); - } - }); - for (final Pair p : extendedSequences) - result.put(p.getSecond() + '-' + (number++),p.getFirst()); - } - return result; - - } - - // Note: in practice only use by equals(HaplotypeGraph) for testing purposes. 
- private static String equalsCollisionResolverExtendedSequence(final HaplotypeGraph graph, final MultiDeBruijnVertex source) { - final StringBuilder buffer = new StringBuilder(1000); - final Set visited = new HashSet<>(graph.vertexSet().size()); - final Stack pending = new Stack<>(); - final Stack position = new Stack<>(); - position.ensureCapacity(graph.vertexSet().size()); - pending.ensureCapacity(graph.vertexSet().size()); - pending.add(source); - position.add(0); - int lastPos = -1; - while (!pending.isEmpty()) { - final MultiDeBruijnVertex next = pending.pop(); - if (visited.contains(next)) continue; - visited.add(next); - final int pos = position.pop(); - final CharSequence sequence; - if (graph.isSource(next)) { - if (next == source) { - sequence = new String(next.getSequence()); - } else { - sequence = new StringBuffer(next.getSequence().length).append(new String(next.getSequence())).reverse().append('$'); - } - } else { - sequence = new String(new byte[] { next.getSuffix()}); - } - - if (pos != lastPos + 1) { - buffer.append('[').append(Math.abs(pos)).append(']'); - } - buffer.append(sequence); - lastPos = pos + sequence.length() - 1; - - final List parents = new LinkedList<>(graph.incomingVerticesOf(next)); - Collections.sort(parents,new Comparator() { - @Override - public int compare(final MultiDeBruijnVertex o1, final MultiDeBruijnVertex o2) { - return Byte.compare(o1.getSuffix(),o2.getSuffix()); - } - }); - for (final MultiDeBruijnVertex parent : parents) { - pending.push(parent); - position.push(lastPos + 1); - } - - final List children = new LinkedList<>(graph.incomingVerticesOf(next)); - Collections.sort(children,new Comparator() { - @Override - public int compare(final MultiDeBruijnVertex o1, final MultiDeBruijnVertex o2) { - return Byte.compare(o1.getSuffix(),o2.getSuffix()); - } - }); - for (final MultiDeBruijnVertex child : graph.outgoingVerticesOf(next)) { - pending.push(child); - position.push(lastPos + 1); - } - } - - return 
buffer.toString(); - } - - - /** - * Calculates the subset of reference path vertices that are amenable to be anchoring vertices. - *

- *

- * For a vertex to be anchorable: - *

    - *
  • Should not include bases from a repeat
  • , - *
  • There should not be in a middle of a event block
  • - *
- *

- * - * @return never {@code null}. - */ - private Set calculateAnchorableVertexSet() { - updateHaplotypeStructures(); - if (referenceBases == null) - return Collections.emptySet(); - - // We first check what bases in the reference path bases are part of a repeat. - final boolean[] nonAnchorableDueToRepeats = SequenceComplexity.findBasesInShortUnitRepeats( - referenceBases, maxRepeatUnitLength, minRepeatLengthInUnits); - - final Set result = new HashSet<>(100); - final Map expectedRejoins = new HashMap<>(); - - - MultiDeBruijnVertex currentVertex = getReferenceRoute().getFirstVertex(); - final int sourceSequenceLength = currentVertex.getSequence().length; - - // Determine whether the reference source vertex in anchorable discarding repeats: - boolean sourceIsAnchorable = true; - for (int i = 0; i < sourceSequenceLength; i++) - if (nonAnchorableDueToRepeats[i]) { - sourceIsAnchorable = false; - break; - } - - // Update the nonAnchorableDueToRepeats array accordingly. - int index = currentVertex.getSequence().length - 1; - nonAnchorableDueToRepeats[index] = !sourceIsAnchorable; - - - // We keep record on all alternative path lengths: - final CountSet pathLengths = new CountSet(haplotypes.size()); - pathLengths.setTo(0); - - // Now we go through the reference path and determine which vertices are not part of event block. - // We keep track of open divergent paths in expectedRejoins. 
Thus only those vertices traversed - // when exptectedRejoins size 0 can be anchorable: - while (currentVertex != null) { - int inDegree = inDegreeOf(currentVertex); - if (inDegree > 1) - expectedRejoins.remove(currentVertex); - if (expectedRejoins.size() == 0 && !nonAnchorableDueToRepeats[index]) { - currentVertex.setAdditionalInfo(currentVertex.additionalInfo() + "*"); - result.add(currentVertex); - } - final Set nextEdges = outgoingEdgesOf(currentVertex); - MultiDeBruijnVertex nextReferenceVertex = null; - for (final MultiSampleEdge e : nextEdges) { - final MultiDeBruijnVertex nextVertex = getEdgeTarget(e); - if (e.isRef() && referenceVertices.contains(nextVertex)) - nextReferenceVertex = nextVertex; - else - calculateRejoins(nextVertex, expectedRejoins, referenceVertices, pathLengths, false, false); - } - currentVertex = nextReferenceVertex; - index++; - } - return result; - } - - - - /** - * Returns those vertices that can be used as anchors along the refererence route. - * @return never {@code null} but perhaps empty if there is no such a vertex. - */ - public Set getAnchorableVertices() { - updateHaplotypeStructures(); - return anchorableVertices; - } - - /** - * Finds non-reference wondering paths that will rejoin the reference path from a particular node. - *

- *

- * It only considers those paths that rejoin within the anchor points of a read. - *

- *

- *

- * Rather than reporting explicitly the path vertice sequence, this method report the length of the paths - * found. These are dumped into {@code expectedRejoins} where the keys are refernce path vertex where paths rejoin - * and the value is the set of path lengths. - *

- *

- *

The path lengths are calculated as the length from the startVertex plus the prefix sizes {@code prefixSizes}

- *

- *

You can also ask the method to exhaustively find all paths ({@code exhaustive == true}) or just consider - * intermediate nodes once ({@code exhustive == false}). If the latter only the shortest paths are considered.

- *

- *

Finally you also can check on paths backwards ({@code backwards == true}) or forwards ({@code backwards == false})

- * - * @param startVertex the origin node for those paths. - * @param expectedRejoins map where to place the found paths in a form of the rejoining non-reference vertex (key) and - * set of path lengths (value). - * @param referenceWithinBoundaries reference vertices found between read anchors. The key are the vertices, the values are - * the kmer's offset in the read. - * @param prefixSizes prefix path sizes to be added to the rejoin path sizes. - * @param exhaustive whether all paths should be considered or we only care about find out the rejoining vertices. - * @param backwards whether we want to find backward paths (inverse edge traversal). - * - * Note: it is marked as deprecated as this method signature may change in the future. It is public just because - * is currently shared by several other classes, however it would not be surprising if - * it gets refactored out at some point. So use with care. - */ - @Deprecated - public void calculateRejoins(final MultiDeBruijnVertex startVertex, final Map expectedRejoins, - final Set referenceWithinBoundaries, final CountSet prefixSizes, - final boolean exhaustive, final boolean backwards) { - Queue queue = new LinkedList<>(); - Queue depths = new LinkedList<>(); - queue.add(startVertex); - depths.add(prefixSizes); - - final Set visited = new HashSet<>(); - if (!exhaustive) visited.add(startVertex); - while (!queue.isEmpty()) { - final CountSet depth = depths.remove(); - final MultiDeBruijnVertex v = queue.remove(); - if (referenceVertices.contains(v)) { - if (referenceWithinBoundaries.contains(v)) { - final CountSet previous = expectedRejoins.get(v); - if (previous == null) - expectedRejoins.put(v, depth.clone()); - else - previous.addAll(depth); - } - } else { - final CountSet depthPlusOne = depth.clone(); - depthPlusOne.incAll(1); - final Set nextEdges = backwards ? incomingEdgesOf(v) : outgoingEdgesOf(v); - for (final MultiSampleEdge e : nextEdges) { - final MultiDeBruijnVertex w = backwards ? 
getEdgeSource(e) : getEdgeTarget(e); - if (visited.contains(w)) // avoid repetitive work. - continue; - if (!exhaustive) visited.add(w); - queue.add(w); - depths.add(depthPlusOne); - } - } - } - } - -} - - diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java deleted file mode 100644 index f33a4883f..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java +++ /dev/null @@ -1,235 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.AssemblyResult; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.io.File; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; - -public class ReadThreadingAssembler extends LocalAssemblyEngine { - private final static Logger logger = Logger.getLogger(ReadThreadingAssembler.class); - - private final static int DEFAULT_NUM_PATHS_PER_GRAPH = 128; - private final static int GGA_MODE_ARTIFICIAL_COUNTS = 1000; - private final static int KMER_SIZE_ITERATION_INCREASE = 10; - private final static int MAX_KMER_ITERATIONS_TO_ATTEMPT = 6; - - /** The min and max kmer sizes to try when building the graph. 
*/ - private final List kmerSizes; - private final int maxAllowedPathsForReadThreadingAssembler; - - private final boolean dontIncreaseKmerSizesForCycles; - private final int numPruningSamples; - private boolean requireReasonableNumberOfPaths = false; - protected boolean removePathsNotConnectedToRef = true; - private boolean justReturnRawGraph = false; - - /** for testing only */ - public ReadThreadingAssembler() { - this(DEFAULT_NUM_PATHS_PER_GRAPH, Arrays.asList(25)); - } - - public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List kmerSizes, final boolean dontIncreaseKmerSizesForCycles, final int numPruningSamples) { - super(maxAllowedPathsForReadThreadingAssembler); - this.kmerSizes = kmerSizes; - this.maxAllowedPathsForReadThreadingAssembler = maxAllowedPathsForReadThreadingAssembler; - this.dontIncreaseKmerSizesForCycles = dontIncreaseKmerSizesForCycles; - this.numPruningSamples = numPruningSamples; - } - - public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List kmerSizes) { - this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, true, 1); - } - - /** for testing purposes */ - protected void setJustReturnRawGraph(boolean justReturnRawGraph) { - this.justReturnRawGraph = justReturnRawGraph; - } - - private void addResult(final List results, final AssemblyResult maybeNullResult) { - if ( maybeNullResult != null ) - results.add(maybeNullResult); - } - - @Override - public List assemble(final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes) { - final List results = new LinkedList<>(); - - // first, try using the requested kmer sizes - for ( final int kmerSize : kmerSizes ) { - addResult(results, createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, dontIncreaseKmerSizesForCycles)); - } - - // if none of those worked, iterate over larger sizes if allowed to do so - if ( results.isEmpty() && !dontIncreaseKmerSizesForCycles ) { - int kmerSize = 
MathUtils.arrayMaxInt(kmerSizes) + KMER_SIZE_ITERATION_INCREASE; - int numIterations = 1; - while ( results.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) { - // on the last attempt we will allow low complexity graphs - addResult(results, createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT)); - kmerSize += KMER_SIZE_ITERATION_INCREASE; - numIterations++; - } - } - - return results; - } - - /** - * Creates the sequence graph for the given kmerSize - * - * @param reads reads to use - * @param refHaplotype reference haplotype - * @param kmerSize kmer size - * @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph - * @param allowLowComplexityGraphs if true, do not check for low-complexity graphs - * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity) - */ - protected AssemblyResult createGraph(final List reads, - final Haplotype refHaplotype, - final int kmerSize, - final List activeAlleleHaplotypes, - final boolean allowLowComplexityGraphs) { - if ( refHaplotype.length() < kmerSize ) { - // happens in cases where the assembled region is just too small - return new AssemblyResult(AssemblyResult.Status.FAILED, null); - } - - final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples); - - // add the reference sequence to the graph - rtgraph.addSequence("ref", refHaplotype.getBases(), null, true); - - // add the artificial GGA haplotypes to the graph - int hapCount = 0; - for ( final Haplotype h : activeAlleleHaplotypes ) { - final int[] counts = new int[h.length()]; - Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS); - rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false); - } - - // Next pull kmers out of every read and throw them on the graph - for( final GATKSAMRecord read : reads ) { 
- rtgraph.addRead(read); - } - - // actually build the read threading graph - rtgraph.buildGraphIfNecessary(); - - // sanity check: make sure there are no cycles in the graph - if ( rtgraph.hasCycles() ) { - if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it contains a cycle"); - return null; - } - - // sanity check: make sure the graph had enough complexity with the given kmer - if ( ! allowLowComplexityGraphs && rtgraph.isLowComplexity() ) { - if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it does not produce a graph with enough complexity"); - return null; - } - - printDebugGraphTransform(rtgraph, new File("" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.0.raw_readthreading_graph.dot")); - - // go through and prune all of the chains where all edges have <= pruneFactor. This must occur - // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering - // tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1 - rtgraph.pruneLowWeightChains(pruneFactor); - - // look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if - // we can recover them by merging some N bases from the chain back into the reference - if ( recoverDanglingTails ) rtgraph.recoverDanglingTails(pruneFactor); - - // remove all heading and trailing paths - if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef(); - - printDebugGraphTransform(rtgraph, new File("" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.1.cleaned_readthreading_graph.dot")); - - final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph(); - if (debugGraphTransformations) initialSeqGraph.printGraph(new File("" + refHaplotype.getGenomeLocation() + "-sequenceGraph." 
+ kmerSize + ".0.1.initial_seqgraph.dot"),10000); - - // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph - if ( justReturnRawGraph ) return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, initialSeqGraph); - - if (debug) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler"); - printDebugGraphTransform(initialSeqGraph, new File( "" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.2.initial_seqgraph.dot")); - initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction - - final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph); - final AssemblyResult.Status status = cleaned.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(cleaned.getGraph()) ? AssemblyResult.Status.FAILED : cleaned.getStatus(); - final AssemblyResult result = new AssemblyResult(status, cleaned.getGraph()); - result.setThreadingGraph(rtgraph); - return result; - } - - /** - * Did we find a reasonable number of paths in this graph? 
- * @param graph - * @return - */ - private boolean reasonableNumberOfPaths(final SeqGraph graph) { - final KBestPaths pathFinder = new KBestPaths<>(false); - final List> allPaths = pathFinder.getKBestPaths(graph, 100000); - logger.info("Found " + allPaths.size() + " paths through " + graph + " with maximum " + maxAllowedPathsForReadThreadingAssembler); - return allPaths.size() <= maxAllowedPathsForReadThreadingAssembler; - } - - @Override - public String toString() { - return "ReadThreadingAssembler{" + - "kmerSizes=" + kmerSizes + - '}'; - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java deleted file mode 100644 index dc057294e..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java +++ /dev/null @@ -1,1045 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; - -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerCounter; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; -import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet; -import org.broadinstitute.sting.utils.smithwaterman.SmithWaterman; -import org.jgrapht.EdgeFactory; -import org.jgrapht.alg.CycleDetector; - -import java.io.File; -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public class ReadThreadingGraph extends BaseGraph implements KmerSearchableGraph { - - /** - * Edge factory that encapsulates the numPruningSamples assembly parameter - */ - private static class MyEdgeFactory implements EdgeFactory { - final int numPruningSamples; - - public MyEdgeFactory(int numPruningSamples) { - this.numPruningSamples = numPruningSamples; - } - - @Override - public MultiSampleEdge createEdge(final MultiDeBruijnVertex sourceVertex, final MultiDeBruijnVertex targetVertex) { - return new MultiSampleEdge(false, 1, numPruningSamples); - } - - public MultiSampleEdge createEdge(final boolean isRef, final int multiplicity) { - return new MultiSampleEdge(isRef, multiplicity, numPruningSamples); - } - - } - - private final static Logger logger = Logger.getLogger(ReadThreadingGraph.class); - - private final static String ANONYMOUS_SAMPLE = "XXX_UNNAMED_XXX"; - private final static boolean WRITE_GRAPH = false; - 
private final static boolean DEBUG_NON_UNIQUE_CALC = false; - - private final static int MAX_CIGAR_COMPLEXITY = 3; - private final static int MIN_DANGLING_TAIL_LENGTH = 5; // SNP + 3 stabilizing nodes + the LCA - - /** for debugging info printing */ - private static int counter = 0; - - /** - * Sequences added for read threading before we've actually built the graph - */ - private final Map> pending = new LinkedHashMap<>(); - - /** - * A set of non-unique kmers that cannot be used as merge points in the graph - */ - protected Set nonUniqueKmers; - - /** - * A map from kmers -> their corresponding vertex in the graph - */ - protected Map uniqueKmers = new LinkedHashMap<>(); - - /** - * - */ - - final boolean debugGraphTransformations; - final byte minBaseQualityToUseInAssembly; - - protected boolean increaseCountsBackwards = true; - protected boolean increaseCountsThroughBranches = false; // this may increase the branches without bounds - - // -------------------------------------------------------------------------------- - // state variables, initialized in resetToInitialState() - // -------------------------------------------------------------------------------- - private Kmer refSource; - protected boolean alreadyBuilt; - - /** - * Constructs an empty read-threading-grpah provided the kmerSize. - * @param kmerSize 1 or greater. - * - * @throw IllegalArgumentException if (@code kmerSize) < 1. - */ - public ReadThreadingGraph(final int kmerSize) { - this(kmerSize, false, (byte)6, 1); - } - - - /** - * Return the collection of outgoing vertices that expand this vertex with a particular base. - * - * @param v original vertex. - * @param b expanding base. - * @return never null, but perhaps an empty set. You cannot assume that you can modify the result. 
- */ - protected Set getNextVertices(final MultiDeBruijnVertex v, final byte b) { - if (v == null) throw new IllegalArgumentException("the input vertex cannot be null"); - if (!vertexSet().contains(v)) throw new IllegalArgumentException("the vertex must be present in the graph"); - final List result = new LinkedList<>(); - for (final MultiDeBruijnVertex w : outgoingVerticesOf(v)) { - if (w.getSuffix() == b) - result.add(w); - } - switch (result.size()) { - case 0: return Collections.emptySet(); - case 1: return Collections.singleton(result.get(0)); - default: - return new HashSet<>(result); - } - } - - /** - * Create a new ReadThreadingAssembler using kmerSize for matching - * @param kmerSize must be >= 1 - */ - protected ReadThreadingGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples) { - super(kmerSize, new MyEdgeFactory(numPruningSamples)); - - if ( kmerSize < 1 ) throw new IllegalArgumentException("bad minkKmerSize " + kmerSize); - this.debugGraphTransformations = debugGraphTransformations; - this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly; - - resetToInitialState(); - } - - /** - * Reset this assembler to its initial state, so we can create another assembly with a different set of reads - */ - private void resetToInitialState() { - pending.clear(); - nonUniqueKmers = null; - uniqueKmers.clear(); - refSource = null; - alreadyBuilt = false; - } - - /** - * Add the all bases in sequence to the graph - * @param sequence a non-null sequence - * @param isRef is this the reference sequence? 
- */ - protected void addSequence(final byte[] sequence, final boolean isRef) { - addSequence("anonymous", sequence, null, isRef); - } - - /** - * Add all bases in sequence to this graph - * - * @see #addSequence(String, String, byte[], int, int, int[], boolean) for full information - */ - public void addSequence(final String seqName, final byte[] sequence, final int[] counts, final boolean isRef) { - addSequence(seqName, ANONYMOUS_SAMPLE, sequence, 0, sequence.length, counts, isRef); - } - - /** - * Add bases in sequence to this graph - * - * @param seqName a useful seqName for this read, for debugging purposes - * @param sequence non-null sequence of bases - * @param counts a vector of counts for each bases, indicating how many times that base was observed in the sequence. - * This allows us to support reduced reads in the ReadThreadingAssembler. Can be null, meaning that - * each base is only observed once. If not null, must have length == sequence.length. - * @param start the first base offset in sequence that we should use for constructing the graph using this sequence, inclusive - * @param stop the last base offset in sequence that we should use for constructing the graph using this sequence, exclusive - * @param isRef is this the reference sequence. 
- */ - public void addSequence(final String seqName, final String sampleName, final byte[] sequence, final int start, final int stop, final int[] counts, final boolean isRef) { - // note that argument testing is taken care of in SequenceForKmers - if ( alreadyBuilt ) throw new IllegalStateException("Graph already built"); - - // get the list of sequences for this sample - List sampleSequences = pending.get(sampleName); - if ( sampleSequences == null ) { // need to create - sampleSequences = new LinkedList<>(); - pending.put(sampleName, sampleSequences); - } - - // add the new sequence to the list of sequences for sample - sampleSequences.add(new SequenceForKmers(seqName, sequence, start, stop, counts, isRef)); - } - - /** - * Return a count appropriate for a kmer starting at kmerStart in sequence for kmers - * - * @param seqForKmers a non-null sequence for kmers object - * @param kmerStart the position where the kmer starts in sequence - * @return a count for a kmer from start -> start + kmerSize in seqForKmers - */ - private int getCountGivenKmerStart(final SequenceForKmers seqForKmers, final int kmerStart) { - return seqForKmers.getCount(kmerStart + kmerSize - 1); - } - - /** - * Thread sequence seqForKmers through the current graph, updating the graph as appropriate - * @param seqForKmers a non-null sequence - */ - private void threadSequence(final SequenceForKmers seqForKmers) { - final Pair startingInfo = findStart(seqForKmers); - if ( startingInfo == null ) - return; - - final MultiDeBruijnVertex startingVertex = startingInfo.getFirst(); - final int uniqueStartPos = startingInfo.getSecond(); - - // increase the counts of all edges incoming into the starting vertex supported by going back in sequence - if ( increaseCountsBackwards ) - increaseCountsInMatchedKmers(seqForKmers, startingVertex, startingVertex.getSequence(), kmerSize - 2); - - if ( debugGraphTransformations ) startingVertex.addRead(seqForKmers.name); - - // keep track of information about the 
reference source - if ( seqForKmers.isRef ) { - if ( refSource != null ) throw new IllegalStateException("Found two refSources! prev: " + refSource + ", new: " + startingVertex); - refSource = new Kmer(seqForKmers.sequence, seqForKmers.start, kmerSize); - } - - // loop over all of the bases in sequence, extending the graph by one base at each point, as appropriate - MultiDeBruijnVertex vertex = startingVertex; - for ( int i = uniqueStartPos + 1; i <= seqForKmers.stop - kmerSize; i++ ) { - final int count = getCountGivenKmerStart(seqForKmers, i); - - vertex = extendChainByOne(vertex, seqForKmers.sequence, i, count, seqForKmers.isRef); - if ( debugGraphTransformations ) vertex.addRead(seqForKmers.name); - } - } - - /** - * Class to keep track of the important dangling tail merging data - */ - protected final class DanglingTailMergeResult { - final List danglingPath, referencePath; - final byte[] danglingPathString, referencePathString; - final Cigar cigar; - - public DanglingTailMergeResult(final List danglingPath, - final List referencePath, - final byte[] danglingPathString, - final byte[] referencePathString, - final Cigar cigar) { - this.danglingPath = danglingPath; - this.referencePath = referencePath; - this.danglingPathString = danglingPathString; - this.referencePathString = referencePathString; - this.cigar = cigar; - } - } - - /** - * Attempt to attach vertex with out-degree == 0 to the graph - * - * @param vertex the vertex to recover - * @param pruneFactor the prune factor to use in ignoring chain pieces - * @return 1 if we successfully recovered the vertex and 0 otherwise - */ - protected int recoverDanglingChain(final MultiDeBruijnVertex vertex, final int pruneFactor) { - if ( outDegreeOf(vertex) != 0 ) throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0"); - - // generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths - final DanglingTailMergeResult 
danglingTailMergeResult = generateCigarAgainstReferencePath(vertex, pruneFactor); - - // if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path - if ( danglingTailMergeResult == null || ! cigarIsOkayToMerge(danglingTailMergeResult.cigar) ) - return 0; - - // merge - return mergeDanglingTail(danglingTailMergeResult); - } - - /** - * Determine whether the provided cigar is okay to merge into the reference path - * - * @param cigar the cigar to analyze - * @return true if it's okay to merge, false otherwise - */ - protected boolean cigarIsOkayToMerge(final Cigar cigar) { - - final List elements = cigar.getCigarElements(); - final int numElements = elements.size(); - - // don't allow more than a couple of different ops - if ( numElements > MAX_CIGAR_COMPLEXITY ) - return false; - - // the last element must be an M - if ( elements.get(numElements - 1).getOperator() != CigarOperator.M ) - return false; - - // TODO -- do we want to check whether the Ms mismatch too much also? 
- - return true; - } - - /** - * Actually merge the dangling tail if possible - * - * @param danglingTailMergeResult the result from generating a Cigar for the dangling tail against the reference - * @return 1 if merge was successful, 0 otherwise - */ - protected int mergeDanglingTail(final DanglingTailMergeResult danglingTailMergeResult) { - - final List elements = danglingTailMergeResult.cigar.getCigarElements(); - final CigarElement lastElement = elements.get(elements.size() - 1); - if ( lastElement.getOperator() != CigarOperator.M ) - throw new IllegalArgumentException("The last Cigar element must be an M"); - - final int lastRefIndex = danglingTailMergeResult.cigar.getReferenceLength() - 1; - final int matchingSuffix = Math.min(GraphUtils.longestSuffixMatch(danglingTailMergeResult.referencePathString, danglingTailMergeResult.danglingPathString, lastRefIndex), lastElement.getLength()); - if ( matchingSuffix == 0 ) - return 0; - - final int altIndexToMerge = Math.max(danglingTailMergeResult.cigar.getReadLength() - matchingSuffix - 1, 0); - - // there is an important edge condition that we need to handle here: Smith-Waterman correctly calculates that there is a - // deletion, that deletion is left-aligned such that the LCA node is part of that deletion, and the rest of the dangling - // tail is a perfect match to the suffix of the reference path. In this case we need to push the reference index to merge - // down one position so that we don't incorrectly cut a base off of the deletion. - final boolean firstElementIsDeletion = elements.get(0).getOperator() == CigarOperator.D; - final boolean mustHandleLeadingDeletionCase = firstElementIsDeletion && (elements.get(0).getLength() + matchingSuffix == lastRefIndex + 1); - final int refIndexToMerge = lastRefIndex - matchingSuffix + 1 + (mustHandleLeadingDeletionCase ? 
1 : 0); - - addEdge(danglingTailMergeResult.danglingPath.get(altIndexToMerge), danglingTailMergeResult.referencePath.get(refIndexToMerge), ((MyEdgeFactory)getEdgeFactory()).createEdge(false, 1)); - - return 1; - } - - /** - * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the - * provided vertex is the sink) and the reference path. - * - * @param vertex the sink of the dangling tail - * @param pruneFactor the prune factor to use in ignoring chain pieces - * @return a SmithWaterman object which can be null if no proper alignment could be generated - */ - protected DanglingTailMergeResult generateCigarAgainstReferencePath(final MultiDeBruijnVertex vertex, final int pruneFactor) { - - // find the lowest common ancestor path between vertex and the reference sink if available - final List altPath = findPathToLowestCommonAncestorOfReference(vertex, pruneFactor); - if ( altPath == null || isRefSource(altPath.get(0)) || altPath.size() < MIN_DANGLING_TAIL_LENGTH ) - return null; - - // now get the reference path from the LCA - final List refPath = getReferencePath(altPath.get(0)); - - // create the Smith-Waterman strings to use - final byte[] refBases = getBasesForPath(refPath); - final byte[] altBases = getBasesForPath(altPath); - - // run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting) - final SmithWaterman alignment = new SWPairwiseAlignment(refBases, altBases, SWParameterSet.STANDARD_NGS, SWPairwiseAlignment.OVERHANG_STRATEGY.LEADING_INDEL); - return new DanglingTailMergeResult(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar())); - } - - /** - * Finds the path upwards in the graph from this vertex to the reference sequence, including the lowest common ancestor vertex. - * Note that nodes are excluded if their pruning weight is less than the pruning factor. 
- * - * @param vertex the original vertex - * @param pruneFactor the prune factor to use in ignoring chain pieces - * @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or - * has an ancestor with multiple incoming edges before hitting the reference path - */ - protected List findPathToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex, final int pruneFactor) { - final LinkedList path = new LinkedList<>(); - - MultiDeBruijnVertex v = vertex; - while ( ! isReferenceNode(v) && inDegreeOf(v) == 1 ) { - final MultiSampleEdge edge = incomingEdgeOf(v); - // if it has too low a weight, don't use it (or previous vertexes) for the path - if ( edge.getPruningMultiplicity() < pruneFactor ) - path.clear(); - // otherwise it is safe to use - else - path.addFirst(v); - v = getEdgeSource(edge); - } - path.addFirst(v); - - return isReferenceNode(v) ? path : null; - } - - /** - * Finds the path downwards in the graph from this vertex to the reference sink, including this vertex - * - * @param start the reference vertex to start from - * @return the path (non-null, non-empty) - */ - protected List getReferencePath(final MultiDeBruijnVertex start) { - if ( ! isReferenceNode(start) ) throw new IllegalArgumentException("Cannot construct the reference path from a vertex that is not on that path"); - - final List path = new ArrayList<>(); - - MultiDeBruijnVertex v = start; - while ( v != null ) { - path.add(v); - v = getNextReferenceVertex(v); - } - - return path; - } - - /** - * Build the read threaded assembly graph if it hasn't already been constructed from the sequences that have - * been added to the graph. 
- */ - public void buildGraphIfNecessary() { - if ( alreadyBuilt ) return; - - // determine the kmer size we'll use, and capture the set of nonUniques for that kmer size - final NonUniqueResult result = determineKmerSizeAndNonUniques(kmerSize, kmerSize); - nonUniqueKmers = result.nonUniques; - - if ( DEBUG_NON_UNIQUE_CALC ) { - logger.info("using " + kmerSize + " kmer size for this assembly with the following non-uniques"); - } - - // go through the pending sequences, and add them to the graph - for ( final List sequencesForSample : pending.values() ) { - for ( final SequenceForKmers sequenceForKmers : sequencesForSample ) { - threadSequence(sequenceForKmers); - if ( WRITE_GRAPH ) printGraph(new File("threading." + counter++ + "." + sequenceForKmers.name.replace(" ", "_") + ".dot"), 0); - } - - // flush the single sample edge values from the graph - for ( final MultiSampleEdge e : edgeSet() ) e.flushSingleSampleMultiplicity(); - } - - // clear - pending.clear(); - alreadyBuilt = true; - for (final MultiDeBruijnVertex v : uniqueKmers.values()) - v.setAdditionalInfo(v.additionalInfo() + "+"); - } - - - @Override - public boolean removeVertex(MultiDeBruijnVertex V) { - final boolean result = super.removeVertex(V); - if (result) { - final byte[] sequence = V.getSequence(); - final Kmer kmer = new Kmer(sequence); - uniqueKmers.remove(kmer); - } - return result; - } - - - public void removeSingletonOrphanVertices() { - // Run through the graph and clean up singular orphaned nodes - final List verticesToRemove = new LinkedList<>(); - for( final MultiDeBruijnVertex v : vertexSet() ) { - if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) { - verticesToRemove.add(v); - } - } - this.removeVertex(null); - removeAllVertices(verticesToRemove); - } - - /** - * @return true if the graph has cycles, false otherwise - */ - public boolean hasCycles() { - return new CycleDetector<>(this).detectCycles(); - } - - /** - * Does the graph not have enough complexity? 
We define low complexity as a situation where the number - * of non-unique kmers is more than 20% of the total number of kmers. - * - * @return true if the graph has low complexity, false otherwise - */ - public boolean isLowComplexity() { - return nonUniqueKmers.size() * 4 > uniqueKmers.size(); - } - - /** - * Try to recover dangling tails - * - * @param pruneFactor the prune factor to use in ignoring chain pieces - */ - public void recoverDanglingTails(final int pruneFactor) { - if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built"); - - int attempted = 0; - int nRecovered = 0; - for ( final MultiDeBruijnVertex v : vertexSet() ) { - if ( outDegreeOf(v) == 0 && ! isRefSink(v) ) { - attempted++; - nRecovered += recoverDanglingChain(v, pruneFactor); - } - } - - if ( debugGraphTransformations ) logger.info("Recovered " + nRecovered + " of " + attempted + " dangling tails"); - } - - /** structure that keeps track of the non-unique kmers for a given kmer size */ - private static class NonUniqueResult { - final Set nonUniques; - final int kmerSize; - - private NonUniqueResult(Set nonUniques, int kmerSize) { - this.nonUniques = nonUniques; - this.kmerSize = kmerSize; - } - } - - /** - * Compute the smallest kmer size >= minKmerSize and <= maxKmerSize that has no non-unique kmers - * among all sequences added to the current graph. Will always return a result for maxKmerSize if - * all smaller kmers had non-unique kmers. 
- * - * @param minKmerSize the minimum kmer size to consider when constructing the graph - * @param maxKmerSize the maximum kmer size to consider - * @return a non-null NonUniqueResult - */ - protected NonUniqueResult determineKmerSizeAndNonUniques(final int minKmerSize, final int maxKmerSize) { - final Collection withNonUniques = getAllPendingSequences(); - final Set nonUniqueKmers = new HashSet(); - - // go through the sequences and determine which kmers aren't unique within each read - int kmerSize = minKmerSize; - for ( ; kmerSize <= maxKmerSize; kmerSize++) { - // clear out set of non-unique kmers - nonUniqueKmers.clear(); - - // loop over all sequences that have non-unique kmers in them from the previous iterator - final Iterator it = withNonUniques.iterator(); - while ( it.hasNext() ) { - final SequenceForKmers sequenceForKmers = it.next(); - - // determine the non-unique kmers for this sequence - final Collection nonUniquesFromSeq = determineNonUniqueKmers(sequenceForKmers, kmerSize); - if ( nonUniquesFromSeq.isEmpty() ) { - // remove this sequence from future consideration - it.remove(); - } else { - // keep track of the non-uniques for this kmerSize, and keep it in the list of sequences that have non-uniques - nonUniqueKmers.addAll(nonUniquesFromSeq); - } - } - - if ( nonUniqueKmers.isEmpty() ) - // this kmerSize produces no non-unique sequences, so go ahead and use it for our assembly - break; - } - - // necessary because the loop breaks with kmerSize = max + 1 - return new NonUniqueResult(nonUniqueKmers, Math.min(kmerSize, maxKmerSize)); - } - - /** - * Get the collection of all sequences for kmers across all samples in no particular order - * @return non-null Collection - */ - private Collection getAllPendingSequences() { - final LinkedList result = new LinkedList(); - for ( final List oneSampleWorth : pending.values() ) result.addAll(oneSampleWorth); - return result; - } - - /** - * Get the collection of non-unique kmers from sequence for kmer size 
kmerSize - * @param seqForKmers a sequence to get kmers from - * @param kmerSize the size of the kmers - * @return a non-null collection of non-unique kmers in sequence - */ - private Collection determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize) { - // count up occurrences of kmers within each read - final KMerCounter counter = new KMerCounter(kmerSize); - final int stopPosition = seqForKmers.stop - kmerSize; - for ( int i = 0; i <= stopPosition; i++ ) { - final Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize); - counter.addKmer(kmer, 1); - } - - return counter.getKmersWithCountsAtLeast(2); - } - - /** - * Convert this kmer graph to a simple sequence graph. - * - * Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer - * graph. Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence - * - * @return a newly allocated SequenceGraph - */ - // TODO -- should override base class method - public SeqGraph convertToSequenceGraph() { - buildGraphIfNecessary(); - - final SeqGraph seqGraph = new SeqGraph(kmerSize); - final Map vertexMap = new HashMap(); - - - // create all of the equivalent seq graph vertices - for ( final MultiDeBruijnVertex dv : vertexSet() ) { - final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv))); - sv.setAdditionalInfo(dv.additionalInfo()); - vertexMap.put(dv, sv); - seqGraph.addVertex(sv); - } - - // walk through the nodes and connect them to their equivalent seq vertices - for( final MultiSampleEdge e : edgeSet() ) { - final SeqVertex seqInV = vertexMap.get(getEdgeSource(e)); - final SeqVertex seqOutV = vertexMap.get(getEdgeTarget(e)); - //logger.info("Adding edge " + seqInV + " -> " + seqOutV); - seqGraph.addEdge(seqInV, seqOutV, new BaseEdge(e.isRef(), e.getMultiplicity())); - } - - return seqGraph; - } - - private void increaseCountsInMatchedKmers(final SequenceForKmers seqForKmers, - final MultiDeBruijnVertex vertex, 
- final byte[] originalKmer, - final int offset) { - if ( offset == -1 ) return; - - for ( final MultiSampleEdge edge : incomingEdgesOf(vertex) ) { - final MultiDeBruijnVertex prev = getEdgeSource(edge); - final byte suffix = prev.getSuffix(); - final byte seqBase = originalKmer[offset]; -// logger.warn(String.format("Increasing counts for %s -> %s via %s at %d with suffix %s vs. %s", -// prev, vertex, edge, offset, (char)suffix, (char)seqBase)); - if ( suffix == seqBase && (increaseCountsThroughBranches || inDegreeOf(vertex) == 1) ) { - edge.incMultiplicity(seqForKmers.getCount(offset)); - increaseCountsInMatchedKmers(seqForKmers, prev, originalKmer, offset-1); - } - } - } - - /** - * Find vertex and its position in seqForKmers where we should start assembling seqForKmers - * - * @param seqForKmers the sequence we want to thread into the graph - * @return a pair of the starting vertex and its position in seqForKmer - */ - protected Pair findStart(final SequenceForKmers seqForKmers) { - final int uniqueStartPos = seqForKmers.isRef ? 
0 : findUniqueStartPosition(seqForKmers.sequence, seqForKmers.start, seqForKmers.stop); - - if ( uniqueStartPos == -1 ) - return null; - - return getOrCreateKmerVertex(seqForKmers.sequence, uniqueStartPos, true); - } - - /** - * Find a starting point in sequence that begins a unique kmer among all kmers in the graph - * @param sequence the sequence of bases - * @param start the first base to use in sequence - * @param stop the last base to use in sequence - * @return the index into sequence that begins a unique kmer of size kmerSize, or -1 if none could be found - */ - private int findUniqueStartPosition(final byte[] sequence, final int start, final int stop) { - for ( int i = start; i < stop - kmerSize; i++ ) { - final Kmer kmer1 = new Kmer(sequence, i, kmerSize); - if ( uniqueKmers.containsKey(kmer1) ) - return i; - } - return -1; - } - - /** - * Get the vertex for the kmer in sequence starting at start - * @param sequence the sequence - * @param start the position of the kmer start - * @param allowRefSource if true, we will allow matches to the kmer that represents the reference starting kmer - * @return a non-null vertex - */ - protected Pair getOrCreateKmerVertex(final byte[] sequence, final int start, final boolean allowRefSource) { - final Kmer kmer = new Kmer(sequence, start, kmerSize); - final MultiDeBruijnVertex vertex = getUniqueKmerVertex(kmer, allowRefSource); - if ( vertex != null ) { - return new Pair<>(vertex, start); - } else { - return new Pair<>(createVertex(kmer), start); - } - } - - /** - * Get the unique vertex for kmer, or null if not possible. - * - * @param allowRefSource if true, we will allow kmer to match the reference source vertex - * @return a vertex for kmer, or null if it's not unique - */ - private MultiDeBruijnVertex getUniqueKmerVertex(final Kmer kmer, final boolean allowRefSource) { - if ( ! allowRefSource && kmer.equals(refSource) ) return null; - - return uniqueKmers.get(kmer); - } - - - /** - * Create a new vertex for kmer. 
Add it to the uniqueKmers map if appropriate. - * - * kmer must not have a entry in unique kmers, or an error will be thrown - * - * @param kmer the kmer we want to create a vertex for - * @return the non-null created vertex - */ - private MultiDeBruijnVertex createVertex(final Kmer kmer) { - final MultiDeBruijnVertex newVertex = new MultiDeBruijnVertex(kmer.bases()); - final int prevSize = vertexSet().size(); - addVertex(newVertex); - - // make sure we aren't adding duplicates (would be a bug) - if ( vertexSet().size() != prevSize + 1) throw new IllegalStateException("Adding vertex " + newVertex + " to graph didn't increase the graph size"); - - // add the vertex to the unique kmer map, if it is in fact unique - if ( ! nonUniqueKmers.contains(kmer) && ! uniqueKmers.containsKey(kmer) ) // TODO -- not sure this last test is necessary - uniqueKmers.put(kmer, newVertex); - - return newVertex; - } - - /** - * Workhorse routine of the assembler. Given a sequence whose last vertex is anchored in the graph, extend - * the graph one bp according to the bases in sequence. - * - * @param prevVertex a non-null vertex where sequence was last anchored in the graph - * @param sequence the sequence we're threading through the graph - * @param kmerStart the start of the current kmer in graph we'd like to add - * @param count the number of observations of this kmer in graph (can be > 1 for reduced reads) - * @param isRef is this the reference sequence? 
- * @return a non-null vertex connecting prevVertex to in the graph based on sequence - */ - private MultiDeBruijnVertex extendChainByOne(final MultiDeBruijnVertex prevVertex, final byte[] sequence, final int kmerStart, final int count, final boolean isRef) { - final Set outgoingEdges = outgoingEdgesOf(prevVertex); - - final int nextPos = kmerStart + kmerSize - 1; - for ( final MultiSampleEdge outgoingEdge : outgoingEdges ) { - final MultiDeBruijnVertex target = getEdgeTarget(outgoingEdge); - if ( target.getSuffix() == sequence[nextPos] ) { - // we've got a match in the chain, so simply increase the count of the edge by 1 and continue - outgoingEdge.incMultiplicity(count); - return target; - } - } - - // none of our outgoing edges had our unique suffix base, so we check for an opportunity to merge back in - final Kmer kmer = new Kmer(sequence, kmerStart, kmerSize); - final MultiDeBruijnVertex uniqueMergeVertex = getUniqueKmerVertex(kmer, false); - - if ( isRef && uniqueMergeVertex != null ) - throw new IllegalStateException("Found a unique vertex to merge into the reference graph " + prevVertex + " -> " + uniqueMergeVertex); - - // either use our unique merge vertex, or create a new one in the chain - final MultiDeBruijnVertex nextVertex = uniqueMergeVertex == null ? createVertex(kmer) : uniqueMergeVertex; - addEdge(prevVertex, nextVertex, ((MyEdgeFactory)getEdgeFactory()).createEdge(isRef, count)); - return nextVertex; - } - - /** - * Add the given read to the sequence graph. Ultimately the read will get sent through addSequence(), but first - * this method ensures we only use high quality bases and accounts for reduced reads, etc. 
- * - * @param read a non-null read - */ - protected void addRead(final GATKSAMRecord read) { - final byte[] sequence = read.getReadBases(); - final byte[] qualities = read.getBaseQualities(); - final int[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced - - int lastGood = -1; // the index of the last good base we've seen - for( int end = 0; end <= sequence.length; end++ ) { - if ( end == sequence.length || ! baseIsUsableForAssembly(sequence[end], qualities[end]) ) { - // the first good base is at lastGood, can be -1 if last base was bad - final int start = lastGood; - // the stop base is end - 1 (if we're not at the end of the sequence) - final int len = end - start; - - if ( start != -1 && len >= kmerSize ) { - // if the sequence is long enough to get some value out of, add it to the graph - final String name = read.getReadName() + "_" + start + "_" + end; - addSequence(name, read.getReadGroup().getSample(), read.getReadBases(), start, end, reducedReadCounts, false); - } - - lastGood = -1; // reset the last good base - } else if ( lastGood == -1 ) { - lastGood = end; // we're at a good base, the last good one is us - } - } - } - - /** - * Determines whether a base can safely be used for assembly. - * Currently disallows Ns and/or those with low quality - * - * @param base the base under consideration - * @param qual the quality of that base - * @return true if the base can be used for assembly, false otherwise - */ - protected boolean baseIsUsableForAssembly(final byte base, final byte qual) { - return base != BaseUtils.Base.N.base && qual >= minBaseQualityToUseInAssembly; - } - - /** - * Get the set of non-unique kmers in this graph. 
For debugging purposes - * @return a non-null set of kmers - */ - protected Set getNonUniqueKmers() { - return nonUniqueKmers; - } - - @Override - public String toString() { - return "ReadThreadingAssembler{" + - "kmerSize=" + kmerSize + - '}'; - } - - - @Override - public MultiDeBruijnVertex findKmer(final Kmer k) { - return uniqueKmers.get(k); - } - - /************************************************************* - * Simple string representation support for testing purposes * - *************************************************************/ - - private static final Pattern PROPERTIES_PATTERN = Pattern.compile("^\\s*\\[[^\\]]*\\]"); - private static final Pattern PATH_PATTERN = Pattern.compile("\\{((\\S+):)?([^\\}]*)\\}"); - private static final Pattern KMERSIZE_EXTRACTOR_PATTERN = Pattern.compile("^\\s*\\[[^\\]]*(ks|kmerSize)\\s*=\\s*(\\d+)\\s*[,\\]]"); - - - /** - * Constructs a read-threadingg-graph for a string representation. - * - *

- * Note: only used for testing. - * Checkout {@link HaplotypeGraphUnitTest} for examples. - *

- * @param s the string representation of the graph {@code null}. - */ - public ReadThreadingGraph(final String s) { - super(kmerSizeFromString(s),new MyEdgeFactory(1)); - debugGraphTransformations = false; - minBaseQualityToUseInAssembly = 0; - applyString(s); - alreadyBuilt = true; - } - - /** - * Obtain the kmer size for the string representation. - * @param str the source string representation. - * @return 1 or greater. - * @throws IllegalArgumentException if {@code} str does not contain a valid representation. - */ - private static int kmerSizeFromString(final String str) { - final Matcher matcher = KMERSIZE_EXTRACTOR_PATTERN.matcher(str); - if (matcher.find()) { - return Integer.parseInt(matcher.group(2)); - } else - throw new IllegalArgumentException("the input graph spec does not indicate the kmerSize"); - } - - /** - * Apply description string into the graph. - * - *

- * Note: this is done just for testing purposes. - * Checkout {@link HaplotypeGraphUnitTest} for examples. - *

- * @param str the string representation. - */ - private void applyString(final String str) { - final Matcher propertiesSectionMatcher = PROPERTIES_PATTERN.matcher(str); - final int pathStart = propertiesSectionMatcher.find() ? propertiesSectionMatcher.end() : 0; - - final String pathString = str.substring(pathStart); - final Matcher pathMatcher = PATH_PATTERN.matcher(pathString); - - boolean referenceFound = false; - final Map vertexById = new HashMap<>(); - - // Loop between path strings and add them one by one. - while (pathMatcher.find()) { - final String label = pathMatcher.group(2); - final boolean isReference = (label != null && label.equals("REF")); - if (referenceFound) { - if (isReference) - throw new IllegalArgumentException("there are two reference paths"); - - } else - referenceFound |= isReference; - - // Divide each path into its elements getting a list of sequences and labels if applies: - final String elementsString = pathMatcher.group(3); - final String[] elements = elementsString.split("\\s*->\\s*"); - if (elements.length == 0) - throw new IllegalArgumentException("empty path not allowed"); - final String[] seqs = new String[elements.length]; - final String[] ids = new String[elements.length]; - for (int i = 0; i < elements.length; i++) { - ids[i] = pathElementId(elements[i]); - seqs[i] = pathElementSeq(elements[i]); - if (seqs[i].isEmpty() && ids[i] == null) - throw new IllegalArgumentException("path with empty element without an id"); - } - final boolean isSource = ids[0] == null || !vertexById.containsKey(ids[0]); - if (isSource && seqs[0].length() != kmerSize) - throw new IllegalArgumentException("source sequence length must be the same as the kmerSize " - + ids[0] + " " + seqs[0] + " " + pathMatcher.group()); - final MultiDeBruijnVertex firstVertex; - if (ids[0] != null && vertexById.containsKey(ids[0])) - firstVertex = vertexById.get(ids[0]); - else { - firstVertex = new MultiDeBruijnVertex(seqs[0].getBytes()); - addVertex(firstVertex); - 
if (ids[0] != null) - vertexById.put(ids[0],firstVertex); - } - if (!seqs[0].isEmpty() && - ((isSource && !firstVertex.getSequenceString().equals(seqs[0])) - || (!isSource && firstVertex.getSuffix() != seqs[0].getBytes()[0]))) - throw new IllegalArgumentException("mismatched first element sequence"); - - MultiDeBruijnVertex lastVertex = firstVertex; - for (int i = 1; i < elements.length; i++) { - if (seqs[i].length() > 1) - throw new IllegalArgumentException("non-source vertex sequence must have length 1"); - final MultiDeBruijnVertex nextVertex; - if (ids[i] == null || !vertexById.containsKey(ids[i])) { - final Set nextVertices = getNextVertices(lastVertex,seqs[i].getBytes()[0]); - if (nextVertices.size() == 0) { - nextVertex = new MultiDeBruijnVertex(extendSequence(lastVertex.getSequence(),seqs[i].getBytes()[0])); - addVertex(nextVertex); - } else { - nextVertex = nextVertices.iterator().next(); - } - if (ids[i] != null) - vertexById.put(ids[i],nextVertex); - } else { - nextVertex = vertexById.get(ids[i]); - } - final MultiSampleEdge edge = addEdge(lastVertex,nextVertex); - if (isReference) edge.setIsRef(true); - lastVertex = nextVertex; - } - } - } - - private static String pathElementId(final String element) { - final int parentesysPos = element.indexOf('('); - - if (parentesysPos == -1) - return null; - - final int closeParentesysPos = element.lastIndexOf(')'); - if (closeParentesysPos == -1) - throw new IllegalArgumentException("non-closed id parantesys found in element: " + element); - final String result = element.substring(parentesysPos + 1,closeParentesysPos).trim(); - if (result.isEmpty()) - throw new IllegalArgumentException("empty id found in element: " + element); - return result; - } - - /** - * Returns the lenght of a path element in the string representation. - * @param element the query element. - * @return 0 or greater. 
- */ - private static String pathElementSeq(final String element) { - final int parentesysPos = element.indexOf('('); - - if (parentesysPos == -1) - return element.trim(); - - return element.substring(0,parentesysPos).trim(); - } - - /** - * Add a base to the end of a byte sequence. - * @param sequence sequence where to add the base to. - * @param b base to add. - * @return never {@code null}, a new array each time. - */ - private static byte[] extendSequence(final byte[] sequence, final byte b) { - final byte[] result = new byte[sequence.length]; - System.arraycopy(sequence,1,result,0,sequence.length - 1); - result[result.length - 1] = b; - return result; - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java deleted file mode 100644 index a4bc0c1c8..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java +++ /dev/null @@ -1,93 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. 
NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; - -/** - * Keeps track of the information needed to add a sequence to the read threading assembly graph - * - * User: depristo - * Date: 4/18/13 - * Time: 8:59 AM - * To change this template use File | Settings | File Templates. - */ -final class SequenceForKmers { - final String name; - final byte[] sequence; - final int start, stop; - final private int[] counts; - final boolean isRef; - - /** - * Create a new sequence for creating kmers - */ - SequenceForKmers(final String name, byte[] sequence, int start, int stop, int[] counts, boolean ref) { - if ( start < 0 ) throw new IllegalArgumentException("Invalid start " + start); - if ( stop < start ) throw new IllegalArgumentException("Invalid stop " + stop); - if ( sequence == null ) throw new IllegalArgumentException("Sequence is null "); - if ( counts != null && counts.length != sequence.length ) throw new IllegalArgumentException("Sequence and counts don't have the same length " + sequence.length + " vs " + counts.length); - - this.name = name; - this.sequence = sequence; - this.start = start; - this.stop = stop; - this.isRef = ref; - this.counts = counts; - } - - /** - * Get the number of observations of the kmer starting at i in this sequence - * - * Can we > 1 because sequence may be a reduced read and therefore count as N observations - * - * @param i the offset into sequence for the start of the kmer - * @return a count >= 1 that indicates the number of observations of kmer starting at i in this sequence. - */ - public int getCount(final int i) { - if ( i < 0 || i > sequence.length ) throw new ArrayIndexOutOfBoundsException("i must be >= 0 and <= " + sequence.length + " but got " + i); - return counts == null ? 
1 : counts[i]; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java deleted file mode 100644 index c0848663e..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ /dev/null @@ -1,1611 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.indels; - -import net.sf.samtools.*; -import net.sf.samtools.util.RuntimeIOException; -import net.sf.samtools.util.SequenceUtil; -import net.sf.samtools.util.StringUtil; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.iterators.ReadTransformer; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.BAQMode; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.smithwaterman.Parameters; -import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.NWaySAMFileWriter; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.sting.utils.text.TextFormattingUtils; -import org.broadinstitute.sting.utils.text.XReadLines; -import 
org.broadinstitute.variant.variantcontext.VariantContext; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileWriter; -import java.io.IOException; -import java.util.*; - -/** - * Performs local realignment of reads to correct misalignments due to the presence of indels. - * - *

- * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases - * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion - * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching - * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, - * it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are - * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, - * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus - * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an - * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and - * specifically identify indels. - *

- *
    There are 2 steps to the realignment process: - *
  1. Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool)
  2. - *
  3. Running the realigner over those intervals (IndelRealigner)
  4. - *
- *

- * For more details, see http://www.broadinstitute.org/gatk/guide/article?id=38 - *

- * - *

Input

- *

- * One or more aligned BAM files and optionally one or more lists of known indels. - *

- * - *

Output

- *

- * A realigned version of your input BAM file(s). - *

- * - *

Example

- *
- * java -Xmx4g -jar GenomeAnalysisTK.jar \
- *   -T IndelRealigner \
- *   -R ref.fasta \
- *   -I input.bam \
- *   -targetIntervals intervalListFromRTC.intervals \
- *   -o realignedBam.bam \
- *   [-known /path/to/indels.vcf] \
- *   [-compress 0]    (this argument recommended to speed up the process *if* this is only a temporary file; otherwise, use the default value)
- * 
- * - *

Caveats

- * - *
  • - * An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step. - *
  • - * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them - * (or with reads from similar technologies). - *
- * - * @author ebanks - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) -@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.ON_OUTPUT) -public class IndelRealigner extends ReadWalker { - - public static final String ORIGINAL_CIGAR_TAG = "OC"; - public static final String ORIGINAL_POSITION_TAG = "OP"; - public static final String PROGRAM_RECORD_NAME = "GATK IndelRealigner"; - - public enum ConsensusDeterminationModel { - /** - * Uses only indels from a provided ROD of known indels. - */ - KNOWNS_ONLY, - /** - * Additionally uses indels already present in the original alignments of the reads. - */ - USE_READS, - /** - * Additionally uses 'Smith-Waterman' to generate alternate consenses. - */ - USE_SW - } - - /** - * Any number of VCF files representing known indels to be used for constructing alternate consenses. - * Could be e.g. dbSNP and/or official 1000 Genomes indel calls. Non-indel variants in these files will be ignored. - */ - @Input(fullName="knownAlleles", shortName = "known", doc="Input VCF file(s) with known indels", required=false) - public List> known = Collections.emptyList(); - - /** - * The interval list output from the RealignerTargetCreator tool using the same bam(s), reference, and known indel file(s). - */ - @Input(fullName="targetIntervals", shortName="targetIntervals", doc="Intervals file output from RealignerTargetCreator", required=true) - protected IntervalBinding intervalsFile = null; - - /** - * This term is equivalent to "significance" - i.e. is the improvement significant enough to merit realignment? Note that this number - * should be adjusted based on your particular data set. For low coverage and/or when looking for indels with low allele frequency, - * this number should be smaller. 
- */ - @Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false) - protected double LOD_THRESHOLD = 5.0; - - /** - * The realigned bam file. - */ - @Output(required=false, doc="Output bam", defaultToStdout=false) - protected StingSAMFileWriter writer = null; - protected ConstrainedMateFixingManager manager = null; - protected SAMFileWriter writerToUse = null; - - /** - * We recommend that users run with USE_READS when trying to realign high quality longer read data mapped with a gapped aligner; - * Smith-Waterman is really only necessary when using an ungapped aligner (e.g. MAQ in the case of single-end read data). - */ - @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "Determines how to compute the possible alternate consenses", required = false) - public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_READS; - - - // ADVANCED OPTIONS FOLLOW - - /** - * For expert users only! This is similar to the argument in the RealignerTargetCreator walker. The point here is that the realigner - * will only proceed with the realignment (even above the given threshold) if it minimizes entropy among the reads (and doesn't simply - * push the mismatch column to another position). This parameter is just a heuristic and should be adjusted based on your particular data set. - */ - @Advanced - @Argument(fullName="entropyThreshold", shortName="entropy", doc="Percentage of mismatches at a locus to be considered having high entropy (0.0 < entropy <= 1.0)", required=false) - protected double MISMATCH_THRESHOLD = 0.15; - - /** - * For expert users only! To minimize memory consumption you can lower this number (but then the tool may skip realignment on regions with too much coverage; - * and if the number is too low, it may generate errors during realignment). Just make sure to give Java enough memory! 4Gb should be enough with the default value. 
- */ - @Advanced - @Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter", required=false) - protected int MAX_RECORDS_IN_MEMORY = 150000; - - /** - * For expert users only! - */ - @Advanced - @Argument(fullName="maxIsizeForMovement", shortName="maxIsize", doc="maximum insert size of read pairs that we attempt to realign", required=false) - protected int MAX_ISIZE_FOR_MOVEMENT = 3000; - - /** - * For expert users only! - */ - @Advanced - @Argument(fullName="maxPositionalMoveAllowed", shortName="maxPosMove", doc="Maximum positional move in basepairs that a read can be adjusted during realignment", required=false) - protected int MAX_POS_MOVE_ALLOWED = 200; - - /** - * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. - */ - @Advanced - @Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="Max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false) - protected int MAX_CONSENSUSES = 30; - - /** - * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. - */ - @Advanced - @Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="Max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false) - protected int MAX_READS_FOR_CONSENSUSES = 120; - - /** - * For expert users only! If this value is exceeded at a given interval, realignment is not attempted and the reads are passed to the output file(s) as-is. - * If you need to allow more reads (e.g. with very deep coverage) regardless of memory, use a higher number. 
- */ - @Advanced - @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="Max reads allowed at an interval for realignment", required=false) - protected int MAX_READS = 20000; - - @Advanced - @Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam") - protected boolean NO_ORIGINAL_ALIGNMENT_TAGS = false; - - /** - * Reads from all input files will be realigned together, but then each read will be saved in the output file corresponding to the input file that - * the read came from. There are two ways to generate output bam file names: 1) if the value of this argument is a general string (e.g. '.cleaned.bam'), - * then extensions (".bam" or ".sam") will be stripped from the input file names and the provided string value will be pasted on instead; 2) if the - * value ends with a '.map' (e.g. input_output.map), then the two-column tab-separated file with the specified name must exist and list unique output - * file name (2nd column) for each input file name (1st column). - * - * Note that some GATK arguments do NOT work in conjunction with nWayOut (e.g. --disable_bam_indexing). - */ - @Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file (not compatible with -output)") - protected String N_WAY_OUT = null; - - @Hidden - @Argument(fullName="generate_nWayOut_md5s",doc="Generate md5sums for BAMs") - protected boolean generateMD5s = false; - - // DEBUGGING OPTIONS FOLLOW - - @Hidden - @Argument(fullName="check_early",shortName="check_early",required=false,doc="Do early check of reads against existing consensuses") - protected boolean CHECKEARLY = false; - - @Hidden - @Argument(fullName="noPGTag", shortName="noPG", required=false, - doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. 
This option is required in order to pass integration tests.") - protected boolean NO_PG_TAG = false; - - @Hidden - @Argument(fullName="keepPGTags", shortName="keepPG", required=false, - doc="Keep older PG tags left in the bam header by previous runs of this tool (by default, all these "+ - "historical tags will be replaced by the latest tag generated in the current run).") - protected boolean KEEP_ALL_PG_RECORDS = false; - - @Hidden - @Output(fullName="indelsFileForDebugging", shortName="indels", required=false, defaultToStdout=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY") - protected String OUT_INDELS = null; - - @Hidden - @Output(fullName="statisticsFileForDebugging", shortName="stats", doc="print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required=false, defaultToStdout=false) - protected String OUT_STATS = null; - - @Hidden - @Output(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false, defaultToStdout=false) - protected String OUT_SNPS = null; - - // fasta reference reader to supplement the edges of the reference sequence - private CachingIndexedFastaSequenceFile referenceReader; - - // the intervals input by the user - private Iterator intervals = null; - - // the current interval in the list - private GenomeLoc currentInterval = null; - private boolean sawReadInCurrentInterval = false; - - // the reads and known indels that fall into the current interval - private ReadBin readsToClean; - private final ArrayList readsNotToClean = new ArrayList(); - private final ArrayList knownIndelsToTry = new ArrayList(); - private final HashSet indelRodsSeen = new HashSet(); - private final HashSet readsActuallyCleaned = new HashSet(); - - private static final int MAX_QUAL = 99; - - // fraction of mismatches that need to no longer mismatch for a column to be considered cleaned - private 
static final double MISMATCH_COLUMN_CLEANED_FRACTION = 0.75; - - private final static Parameters swParameters = new Parameters(30.0, -10.0, -10.0, -2.0); - - // reference base padding size - // TODO -- make this a command-line argument if the need arises - private static final int REFERENCE_PADDING = 30; - - // other output files - private FileWriter indelOutput = null; - private FileWriter statsOutput = null; - private FileWriter snpsOutput = null; - - //###protected Map nwayWriters = null; - - - // debug info for lazy SW evaluation: - private long exactMatchesFound = 0; // how many reads exactly matched a consensus we already had - private long SWalignmentRuns = 0; // how many times (=for how many reads) we ran SW alignment - private long SWalignmentSuccess = 0; // how many SW alignments were "successful" (i.e. found a workable indel and resulted in non-null consensus) - - private Map loadFileNameMap(String mapFile) { - Map fname_map = new HashMap(); - - try { - - XReadLines reader = new XReadLines(new File(mapFile),true); - for ( String line : reader ) { - if ( line.length() == 0 ) continue; - - String fields[] = line.split("\t"); - - if ( fields.length != 2 ) - throw new UserException.BadInput("Input-output map file must have exactly two columns. Offending line:\n"+line); - if ( fields[0].length() == 0 || fields[1].length() == 0 ) - throw new UserException.BadInput("Input-output map file can not have empty strings in either column. 
Offending line:\n"+line); - - if ( fname_map.containsKey(fields[0]) ) - throw new UserException.BadInput("Input-output map file contains duplicate entries for input name "+fields[0]); - if ( fname_map.containsValue(fields[1]) ) - throw new UserException.BadInput("Input-output map file maps multiple entries onto single output name "+fields[1]); - - fname_map.put(fields[0],fields[1]); - } - } catch (IOException e) { - throw new StingException("I/O Error while reading input-output map file "+N_WAY_OUT+": "+e.getMessage()); - } - return fname_map; - } - - public void initialize() { - readsToClean = new ReadBin(getToolkit().getGenomeLocParser(), REFERENCE_PADDING); - - if ( N_WAY_OUT == null && writer == null ) { - throw new UserException.CommandLineException("Either -o or -nWayOut must be specified"); - } - if ( N_WAY_OUT != null && writer != null ) { - throw new UserException.CommandLineException("-o and -nWayOut can not be used simultaneously"); - } - if ( LOD_THRESHOLD < 0.0 ) - throw new RuntimeException("LOD threshold cannot be a negative number"); - if ( MISMATCH_THRESHOLD <= 0.0 || MISMATCH_THRESHOLD > 1.0 ) - throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1"); - - try { - referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile,ex); - } - - intervals = intervalsFile.getIntervals(getToolkit()).iterator(); - - currentInterval = intervals.hasNext() ? 
intervals.next() : null; - - if ( N_WAY_OUT != null ) { - boolean createIndex = true; - - if ( N_WAY_OUT.toUpperCase().endsWith(".MAP") ) { - writerToUse = new NWaySAMFileWriter(getToolkit(),loadFileNameMap(N_WAY_OUT), - SAMFileHeader.SortOrder.coordinate,true, createIndex, generateMD5s,createProgramRecord(),KEEP_ALL_PG_RECORDS); - } else { - writerToUse = new NWaySAMFileWriter(getToolkit(),N_WAY_OUT,SAMFileHeader.SortOrder.coordinate,true, - createIndex, generateMD5s,createProgramRecord(),KEEP_ALL_PG_RECORDS); - } - } else { - // set up the output writer - setupWriter(getToolkit().getSAMFileHeader()); - writerToUse = writer; - } - manager = new ConstrainedMateFixingManager(writerToUse, getToolkit().getGenomeLocParser(), MAX_ISIZE_FOR_MOVEMENT, MAX_POS_MOVE_ALLOWED, MAX_RECORDS_IN_MEMORY); - - if ( OUT_INDELS != null ) { - try { - indelOutput = new FileWriter(new File(OUT_INDELS)); - } catch (Exception e) { - logger.error("Failed to create output file "+ OUT_INDELS+". Indel output will be suppressed"); - logger.error(e.getMessage()); - indelOutput = null; - } - } - if ( OUT_STATS != null ) { - try { - statsOutput = new FileWriter(new File(OUT_STATS)); - } catch (Exception e) { - logger.error("Failed to create output file "+ OUT_STATS+". Cleaning stats output will be suppressed"); - logger.error(e.getMessage()); - statsOutput = null; - } - } - if ( OUT_SNPS != null ) { - try { - snpsOutput = new FileWriter(new File(OUT_SNPS)); - } catch (Exception e) { - logger.error("Failed to create output file "+ OUT_SNPS+". 
Cleaning snps output will be suppressed"); - logger.error(e.getMessage()); - snpsOutput = null; - } - } - } - - private void setupWriter(SAMFileHeader header) { - - if ( !NO_PG_TAG ) { - final SAMProgramRecord programRecord = createProgramRecord(); - - List oldRecords = header.getProgramRecords(); - List newRecords = new ArrayList(oldRecords.size()+1); - for ( SAMProgramRecord record : oldRecords ) { - if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) || KEEP_ALL_PG_RECORDS ) - newRecords.add(record); - } - newRecords.add(programRecord); - header.setProgramRecords(newRecords); - } - - writer.writeHeader(header); - writer.setPresorted(true); - } - - - private SAMProgramRecord createProgramRecord() { - if ( NO_PG_TAG ) return null; - - final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); - final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); - try { - final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version"); - programRecord.setProgramVersion(version); - } catch (MissingResourceException e) { - // this is left empty on purpose (perhaps Andrey knows why?) 
- } - programRecord.setCommandLine(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this)); - return programRecord; - } - - private void emit(final GATKSAMRecord read) { - - // check to see whether the read was modified by looking at the temporary tag - boolean wasModified = readsActuallyCleaned.contains(read); - - try { - manager.addRead(read, wasModified); - } catch (RuntimeIOException e) { - throw new UserException.ErrorWritingBamFile(e.getMessage()); - } - } - - private void emitReadLists() { - // pre-merge lists to sort them in preparation for constrained SAMFileWriter - readsNotToClean.addAll(readsToClean.getReads()); - ReadUtils.sortReadsByCoordinate(readsNotToClean); - manager.addReads(readsNotToClean, readsActuallyCleaned); - readsToClean.clear(); - readsNotToClean.clear(); - readsActuallyCleaned.clear(); - } - - public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { - if ( currentInterval == null ) { - emit(read); - return 0; - } - - // edge case: when the last target interval abuts the end of the genome, we'll get one of the - // unmapped reads while the currentInterval still isn't null. We need to trigger the cleaning - // at this point without trying to create a GenomeLoc. 
- if ( read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ) { - cleanAndCallMap(ref, read, metaDataTracker, null); - return 0; - } - - GenomeLoc readLoc = getToolkit().getGenomeLocParser().createGenomeLoc(read); - // hack to get around unmapped reads having screwy locations - if ( readLoc.getStop() == 0 ) - readLoc = getToolkit().getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart()); - - if ( readLoc.isBefore(currentInterval) ) { - if ( !sawReadInCurrentInterval ) - emit(read); - else - readsNotToClean.add(read); - } - else if ( readLoc.overlapsP(currentInterval) ) { - sawReadInCurrentInterval = true; - - if ( doNotTryToClean(read) ) { - readsNotToClean.add(read); - } else { - readsToClean.add(read); - - // add the rods to the list of known variants - populateKnownIndels(metaDataTracker); - } - - if ( readsToClean.size() + readsNotToClean.size() >= MAX_READS ) { - logger.info("Not attempting realignment in interval " + currentInterval + " because there are too many reads."); - abortCleanForCurrentInterval(); - } - } - else { // the read is past the current interval - logger.debug(currentInterval.toString() + "\t" + read.getAlignmentStart() ); - cleanAndCallMap(ref, read, metaDataTracker, readLoc); - } - - return 0; - } - - private void abortCleanForCurrentInterval() { - emitReadLists(); - currentInterval = intervals.hasNext() ? 
intervals.next() : null; - sawReadInCurrentInterval = false; - } - - private boolean doNotTryToClean(GATKSAMRecord read) { - return read.getReadUnmappedFlag() || - read.getNotPrimaryAlignmentFlag() || - read.getReadFailsVendorQualityCheckFlag() || - read.getMappingQuality() == 0 || - read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || - ConstrainedMateFixingManager.iSizeTooBigToMove(read, MAX_ISIZE_FOR_MOVEMENT) || - ReadUtils.is454Read(read) || - ReadUtils.isIonRead(read); - // TODO -- it would be nice if we could use indels from 454/Ion reads as alternate consenses - } - - private void cleanAndCallMap(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker, GenomeLoc readLoc) { - if ( readsToClean.size() > 0 ) { - GenomeLoc earliestPossibleMove = getToolkit().getGenomeLocParser().createGenomeLoc(readsToClean.getReads().get(0)); - if ( manager.canMoveReads(earliestPossibleMove) ) - clean(readsToClean); - } - knownIndelsToTry.clear(); - indelRodsSeen.clear(); - - emitReadLists(); - try { - do { - currentInterval = intervals.hasNext() ? intervals.next() : null; - - } while ( currentInterval != null && (readLoc == null || currentInterval.isBefore(readLoc)) ); - } catch (ReviewedStingException e) { - throw new UserException.MissortedFile(new File(intervalsFile.getSource()), " *** Are you sure that your interval file is sorted? If not, you must use the --targetIntervalsAreNotSorted argument. 
***", e); - } - sawReadInCurrentInterval = false; - - // call back into map now that the state has been updated - map(ref, read, metaDataTracker); - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - public void onTraversalDone(Integer result) { - if ( readsToClean.size() > 0 ) { - GenomeLoc earliestPossibleMove = getToolkit().getGenomeLocParser().createGenomeLoc(readsToClean.getReads().get(0)); - if ( manager.canMoveReads(earliestPossibleMove) ) - clean(readsToClean); - emitReadLists(); - } else if ( readsNotToClean.size() > 0 ) { - emitReadLists(); - } - - knownIndelsToTry.clear(); - indelRodsSeen.clear(); - - if ( OUT_INDELS != null ) { - try { - indelOutput.close(); - } catch (Exception e) { - logger.error("Failed to close "+OUT_INDELS+" gracefully. Data may be corrupt."); - } - } - if ( OUT_STATS != null ) { - try { - statsOutput.close(); - } catch (Exception e) { - logger.error("Failed to close "+OUT_STATS+" gracefully. Data may be corrupt."); - } - } - if ( OUT_SNPS != null ) { - try { - snpsOutput.close(); - } catch (Exception e) { - logger.error("Failed to close "+OUT_SNPS+" gracefully. 
Data may be corrupt."); - } - } - - manager.close(); - if ( N_WAY_OUT != null ) writerToUse.close(); - - if ( CHECKEARLY ) { - logger.info("SW alignments runs: "+SWalignmentRuns); - logger.info("SW alignments successfull: "+SWalignmentSuccess + " ("+SWalignmentSuccess/SWalignmentRuns+"% of SW runs)"); - logger.info("SW alignments skipped (perfect match): "+exactMatchesFound); - logger.info("Total reads SW worked for: "+(SWalignmentSuccess + exactMatchesFound)+ - " ("+(SWalignmentSuccess+exactMatchesFound)/(SWalignmentRuns+exactMatchesFound)+"% of all reads requiring SW)"); - } - } - - private void populateKnownIndels(RefMetaDataTracker metaDataTracker) { - for ( final VariantContext vc : metaDataTracker.getValues(known) ) { - if ( indelRodsSeen.contains(vc) ) - continue; - indelRodsSeen.add(vc); - knownIndelsToTry.add(vc); - } - } - - private static int mismatchQualitySumIgnoreCigar(final AlignedRead aRead, final byte[] refSeq, int refIndex, int quitAboveThisValue) { - final byte[] readSeq = aRead.getReadBases(); - final byte[] quals = aRead.getBaseQualities(); - int sum = 0; - for (int readIndex = 0 ; readIndex < readSeq.length ; refIndex++, readIndex++ ) { - if ( refIndex >= refSeq.length ) { - sum += MAX_QUAL; - // optimization: once we pass the threshold, stop calculating - if ( sum > quitAboveThisValue ) - return sum; - } else { - byte refChr = refSeq[refIndex]; - byte readChr = readSeq[readIndex]; - if ( !BaseUtils.isRegularBase(readChr) || !BaseUtils.isRegularBase(refChr) ) - continue; // do not count Ns/Xs/etc ? 
- if ( readChr != refChr ) { - sum += (int)quals[readIndex]; - // optimization: once we pass the threshold, stop calculating - if ( sum > quitAboveThisValue ) - return sum; - } - } - } - return sum; - } - - private void clean(ReadBin readsToClean) { - - final List reads = readsToClean.getReads(); - if ( reads.size() == 0 ) - return; - - byte[] reference = readsToClean.getReference(referenceReader); - int leftmostIndex = readsToClean.getLocation().getStart(); - - final ArrayList refReads = new ArrayList(); // reads that perfectly match ref - final ArrayList altReads = new ArrayList(); // reads that don't perfectly match - final LinkedList altAlignmentsToTest = new LinkedList(); // should we try to make an alt consensus from the read? - final Set altConsenses = new LinkedHashSet(); // list of alt consenses - - // if there are any known indels for this region, get them and create alternate consenses - generateAlternateConsensesFromKnownIndels(altConsenses, leftmostIndex, reference); - - // decide which reads potentially need to be cleaned; - // if there are reads with a single indel in them, add that indel to the list of alternate consenses - long totalRawMismatchSum = determineReadsThatNeedCleaning(reads, refReads, altReads, altAlignmentsToTest, altConsenses, leftmostIndex, reference); - - // use 'Smith-Waterman' to create alternate consenses from reads that mismatch the reference, using totalRawMismatchSum as the random seed - if ( consensusModel == ConsensusDeterminationModel.USE_SW ) - generateAlternateConsensesFromReads(altAlignmentsToTest, altConsenses, reference, leftmostIndex); - - // if ( debugOn ) System.out.println("------\nChecking consenses...\n--------\n"); - - Consensus bestConsensus = null; - - for (Consensus consensus : altConsenses) { - //logger.debug("Trying new consensus: " + consensus.cigar + " " + new String(consensus.str)); - -// if ( DEBUG ) { -// System.out.println("Checking consensus with alignment at "+consensus.positionOnReference+" cigar 
"+consensus.cigar); -// System.out.println(new String(consensus.str)); -// int z = 0; -// for ( ; z < consensus.positionOnReference; z++ ) System.out.print('.'); -// for ( z=0 ; z < consensus.cigar.getCigarElement(0).getLength() ; z++ ) System.out.print('.'); -// if ( consensus.cigar.getCigarElement(1).getOperator() == CigarOperator.I ) for ( z= 0; z < consensus.cigar.getCigarElement(1).getLength(); z++ ) System.out.print('I'); -// System.out.println(); -// } - - // if ( debugOn ) System.out.println("Consensus: "+consensus.str); - - for (int j = 0; j < altReads.size(); j++) { - AlignedRead toTest = altReads.get(j); - Pair altAlignment = findBestOffset(consensus.str, toTest, leftmostIndex); - - // the mismatch score is the min of its alignment vs. the reference and vs. the alternate - int myScore = altAlignment.second; - - if (myScore > toTest.getAlignerMismatchScore() || myScore >= toTest.getMismatchScoreToReference()) - myScore = toTest.getMismatchScoreToReference(); - // keep track of reads that align better to the alternate consensus. - // By pushing alignments with equal scores to the alternate, it means we'll over-call (het -> hom non ref) but are less likely to under-call (het -> ref, het non ref -> het) - else - consensus.readIndexes.add(new Pair(j, altAlignment.first)); - - //logger.debug(consensus.cigar + " vs. " + toTest.getRead().getReadName() + "-" + toTest.getRead().getReadString() + " => " + myScore + " vs. " + toTest.getMismatchScoreToReference()); - if (!toTest.getRead().getDuplicateReadFlag()) - consensus.mismatchSum += myScore; - - // optimization: once the mismatch sum is higher than the best consensus, quit since this one can't win - // THIS MUST BE DISABLED IF WE DECIDE TO ALLOW MORE THAN ONE ALTERNATE CONSENSUS! 
- if (bestConsensus != null && consensus.mismatchSum > bestConsensus.mismatchSum) - break; - } - - //logger.debug("Mismatch sum of new consensus: " + consensus.mismatchSum); - if (bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) { - // we do not need this alt consensus, release memory right away!! - if (bestConsensus != null) - bestConsensus.readIndexes.clear(); - bestConsensus = consensus; - //logger.debug("New consensus " + bestConsensus.cigar + " is now best consensus"); - } else { - // we do not need this alt consensus, release memory right away!! - consensus.readIndexes.clear(); - } - } - - // if: - // 1) the best alternate consensus has a smaller sum of quality score mismatches than the aligned version of the reads, - // 2) beats the LOD threshold for the sum of quality score mismatches of the raw version of the reads, - // 3) didn't just move around the mismatching columns (i.e. it actually reduces entropy), - // then clean! - final double improvement = (bestConsensus == null ? 
-1 : ((double)(totalRawMismatchSum - bestConsensus.mismatchSum))/10.0); - if ( improvement >= LOD_THRESHOLD ) { - - bestConsensus.cigar = AlignmentUtils.leftAlignIndel(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference, true); - - // start cleaning the appropriate reads - for ( Pair indexPair : bestConsensus.readIndexes ) { - AlignedRead aRead = altReads.get(indexPair.first); - if ( !updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.second, aRead, leftmostIndex) ) - return; - } - if ( consensusModel != ConsensusDeterminationModel.KNOWNS_ONLY && !alternateReducesEntropy(altReads, reference, leftmostIndex) ) { - if ( statsOutput != null ) { - try { - statsOutput.write(currentInterval.toString()); - statsOutput.write("\tFAIL (bad indel)\t"); // if improvement > LOD_THRESHOLD *BUT* entropy is not reduced (SNPs still exist) - statsOutput.write(Double.toString(improvement)); - statsOutput.write("\n"); - statsOutput.flush(); - } catch (Exception e) { - throw new UserException.CouldNotCreateOutputFile("statsOutput", "Failed to write stats output file", e); - } - } - } else { - //logger.debug("CLEAN: " + bestConsensus.cigar + " " + bestConsensus.str.toString() + " " + bestConsensus.cigar.numCigarElements() ); - if ( indelOutput != null && bestConsensus.cigar.numCigarElements() > 1 ) { - // NOTE: indels are printed out in the format specified for the low-coverage pilot1 - // indel calls (tab-delimited): chr position size type sequence - StringBuilder str = new StringBuilder(); - str.append(reads.get(0).getReferenceName()); - int position = bestConsensus.positionOnReference + bestConsensus.cigar.getCigarElement(0).getLength(); - str.append("\t").append(leftmostIndex + position - 1); - CigarElement ce = bestConsensus.cigar.getCigarElement(1); - str.append("\t").append(ce.getLength()).append("\t").append(ce.getOperator()).append("\t"); - int length = ce.getLength(); - if ( 
ce.getOperator() == CigarOperator.D ) { - for ( int i = 0; i < length; i++) - str.append((char)reference[position+i]); - } else { - for ( int i = 0; i < length; i++) - str.append((char)bestConsensus.str[position+i]); - } - str.append("\t").append((((double) (totalRawMismatchSum - bestConsensus.mismatchSum)) / 10.0)).append("\n"); - try { - indelOutput.write(str.toString()); - indelOutput.flush(); - } catch (Exception e) { - throw new UserException.CouldNotCreateOutputFile("indelOutput", "Failed to write indel output file", e); - } - } - if ( statsOutput != null ) { - try { - statsOutput.write(currentInterval.toString()); - statsOutput.write("\tCLEAN"); // if improvement > LOD_THRESHOLD *AND* entropy is reduced - if ( bestConsensus.cigar.numCigarElements() > 1 ) - statsOutput.write(" (found indel)"); - statsOutput.write("\t"); - statsOutput.write(Double.toString(improvement)); - statsOutput.write("\n"); - statsOutput.flush(); - } catch (Exception e) { - throw new UserException.CouldNotCreateOutputFile("statsOutput", "Failed to write stats output file", e); - } - } - - // finish cleaning the appropriate reads - for ( Pair indexPair : bestConsensus.readIndexes ) { - final AlignedRead aRead = altReads.get(indexPair.first); - if ( aRead.finalizeUpdate() ) { - // We need to update the mapping quality score of the cleaned reads; - // however we don't have enough info to use the proper MAQ scoring system. - // For now, we will just arbitrarily add 10 to the mapping quality. [EB, 6/7/2010]. 
- // TODO -- we need a better solution here - GATKSAMRecord read = aRead.getRead(); - if ( read.getMappingQuality() != 255 ) // 255 == Unknown, so don't modify it - read.setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + 10, 254)); - - // before we fix the attribute tags we first need to make sure we have enough of the reference sequence - int neededBasesToLeft = leftmostIndex - read.getAlignmentStart(); - int neededBasesToRight = read.getAlignmentEnd() - leftmostIndex - reference.length + 1; - int neededBases = Math.max(neededBasesToLeft, neededBasesToRight); - if ( neededBases > 0 ) { - int padLeft = Math.max(leftmostIndex-neededBases, 1); - int padRight = Math.min(leftmostIndex+reference.length+neededBases, referenceReader.getSequenceDictionary().getSequence(currentInterval.getContig()).getSequenceLength()); - reference = referenceReader.getSubsequenceAt(currentInterval.getContig(), padLeft, padRight).getBases(); - leftmostIndex = padLeft; - } - - // now, fix the attribute tags - // TODO -- get rid of this try block when Picard does the right thing for reads aligned off the end of the reference - try { - if ( read.getAttribute(SAMTag.NM.name()) != null ) - read.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(read, reference, leftmostIndex - 1)); - if ( read.getAttribute(SAMTag.UQ.name()) != null ) - read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, reference, leftmostIndex-1)); - } catch (Exception e) { - // ignore it - } - // TODO -- this is only temporary until Tim adds code to recalculate this value - if ( read.getAttribute(SAMTag.MD.name()) != null ) - read.setAttribute(SAMTag.MD.name(), null); - - // mark that it was actually cleaned - readsActuallyCleaned.add(read); - } - } - } - - // END IF ( improvement >= LOD_THRESHOLD ) - - } else if ( statsOutput != null ) { - try { - statsOutput.write(String.format("%s\tFAIL\t%.1f%n", - currentInterval.toString(), improvement)); - statsOutput.flush(); - } 
catch (Exception e) { - throw new UserException.CouldNotCreateOutputFile("statsOutput", "Failed to write stats output file", e); - } - } - } - - private void generateAlternateConsensesFromKnownIndels(final Set altConsensesToPopulate, final int leftmostIndex, final byte[] reference) { - for ( VariantContext knownIndel : knownIndelsToTry ) { - if ( knownIndel == null || !knownIndel.isIndel() || knownIndel.isComplexIndel() ) - continue; - final byte[] indelStr; - if ( knownIndel.isSimpleInsertion() ) { - final byte[] fullAllele = knownIndel.getAlternateAllele(0).getBases(); - indelStr = Arrays.copyOfRange(fullAllele, 1, fullAllele.length); // remove ref padding - } else { - indelStr = Utils.dupBytes((byte)'-', knownIndel.getReference().length() - 1); - } - int start = knownIndel.getStart() - leftmostIndex + 1; - Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel); - if ( c != null ) - altConsensesToPopulate.add(c); - } - } - - private long determineReadsThatNeedCleaning(final List reads, - final ArrayList refReadsToPopulate, - final ArrayList altReadsToPopulate, - final LinkedList altAlignmentsToTest, - final Set altConsenses, - final int leftmostIndex, - final byte[] reference) { - - long totalRawMismatchSum = 0L; - for ( final GATKSAMRecord read : reads ) { - - // we can not deal with screwy records - if ( read.getCigar().numCigarElements() == 0 ) { - refReadsToPopulate.add(read); - continue; - } - - final AlignedRead aRead = new AlignedRead(read); - - // first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence - int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read); - if ( numBlocks == 2 ) { - Cigar newCigar = AlignmentUtils.leftAlignIndel(unclipCigar(read.getCigar()), reference, read.getReadBases(), read.getAlignmentStart()-leftmostIndex, 0, true); - aRead.setCigar(newCigar, false); - } - - final int startOnRef = read.getAlignmentStart()-leftmostIndex; - final int rawMismatchScore = 
mismatchQualitySumIgnoreCigar(aRead, reference, startOnRef, Integer.MAX_VALUE); - - // if this doesn't match perfectly to the reference, let's try to clean it - if ( rawMismatchScore > 0 ) { - altReadsToPopulate.add(aRead); - //logger.debug("Adding " + read.getReadName() + " with raw mismatch score " + rawMismatchScore + " to non-ref reads"); - - if ( !read.getDuplicateReadFlag() ) - totalRawMismatchSum += rawMismatchScore; - aRead.setMismatchScoreToReference(rawMismatchScore); - aRead.setAlignerMismatchScore(AlignmentUtils.mismatchingQualities(aRead.getRead(), reference, startOnRef)); - - // if it has an indel, let's see if that's the best consensus - if ( consensusModel != ConsensusDeterminationModel.KNOWNS_ONLY && numBlocks == 2 ) { - Consensus c = createAlternateConsensus(startOnRef, aRead.getCigar(), reference, aRead.getReadBases()); - if ( c != null ) - altConsenses.add(c); - } else { - altAlignmentsToTest.add(aRead); - } - } - // otherwise, we can emit it as is - else { - //logger.debug("Adding " + read.getReadName() + " with raw mismatch score " + rawMismatchScore + " to ref reads"); - refReadsToPopulate.add(read); - } - } - - return totalRawMismatchSum; - } - - private void generateAlternateConsensesFromReads(final LinkedList altAlignmentsToTest, - final Set altConsensesToPopulate, - final byte[] reference, - final int leftmostIndex) { - - // if we are under the limit, use all reads to generate alternate consenses - if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) { - for ( AlignedRead aRead : altAlignmentsToTest ) { - if ( CHECKEARLY ) createAndAddAlternateConsensus1(aRead, altConsensesToPopulate, reference,leftmostIndex); - else createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference); - } - } - // otherwise, choose reads for alternate consenses randomly - else { - int readsSeen = 0; - while ( readsSeen++ < MAX_READS_FOR_CONSENSUSES && altConsensesToPopulate.size() <= MAX_CONSENSUSES) { - int index = 
GenomeAnalysisEngine.getRandomGenerator().nextInt(altAlignmentsToTest.size()); - AlignedRead aRead = altAlignmentsToTest.remove(index); - if ( CHECKEARLY ) createAndAddAlternateConsensus1(aRead, altConsensesToPopulate, reference,leftmostIndex); - else createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference); - } - } - } - - private void createAndAddAlternateConsensus(final byte[] read, final Set altConsensesToPopulate, final byte[] reference) { - - // do a pairwise alignment against the reference - SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read, swParameters); - Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read); - if ( c != null ) - altConsensesToPopulate.add(c); - } - - private void createAndAddAlternateConsensus1(AlignedRead read, final Set altConsensesToPopulate, - final byte[] reference, final int leftmostIndex) { - - for ( Consensus known : altConsensesToPopulate ) { - Pair altAlignment = findBestOffset(known.str, read, leftmostIndex); - // the mismatch score is the min of its alignment vs. the reference and vs. 
the alternate - int myScore = altAlignment.second; - if ( myScore == 0 ) {exactMatchesFound++; return; }// read matches perfectly to a known alt consensus - no need to run SW, we already know the answer - } - // do a pairwise alignment against the reference - SWalignmentRuns++; - SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read.getReadBases(), swParameters); - Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read.getReadBases()); - if ( c != null ) { - altConsensesToPopulate.add(c); - SWalignmentSuccess++; - } - } - - // create a Consensus from cigar/read strings which originate somewhere on the reference - private Consensus createAlternateConsensus(final int indexOnRef, final Cigar c, final byte[] reference, final byte[] readStr) { - if ( indexOnRef < 0 ) - return null; - - // if there are no indels, we do not need this consensus, can abort early: - if ( c.numCigarElements() == 1 && c.getCigarElement(0).getOperator() == CigarOperator.M ) return null; - - // create the new consensus - ArrayList elements = new ArrayList(c.numCigarElements()-1); - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < indexOnRef; i++) - sb.append((char)reference[i]); - - int indelCount = 0; - int altIdx = 0; - int refIdx = indexOnRef; - boolean ok_flag = true; - for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { - CigarElement ce = c.getCigarElement(i); - int elementLength = ce.getLength(); - switch( ce.getOperator() ) { - case D: - refIdx += elementLength; - indelCount++; - elements.add(ce); - break; - case M: - case EQ: - case X: - altIdx += elementLength; - case N: - if ( reference.length < refIdx + elementLength ) - ok_flag = false; - else { - for (int j = 0; j < elementLength; j++) - sb.append((char)reference[refIdx+j]); - } - refIdx += elementLength; - elements.add(new CigarElement(elementLength, CigarOperator.M)); - break; - case I: - for (int j = 0; j < elementLength; j++) { - if 
( ! BaseUtils.isRegularBase(readStr[altIdx+j]) ) { - // Insertions with N's in them cause real problems sometimes; it's better to drop them altogether - ok_flag=false; - break; - } - sb.append((char)readStr[altIdx + j]); - } - altIdx += elementLength; - indelCount++; - elements.add(ce); - break; - case S: - default: - break; - } - } - // make sure that there is at most only a single indel and it aligns appropriately! - if ( !ok_flag || indelCount != 1 || reference.length < refIdx ) - return null; - - for (int i = refIdx; i < reference.length; i++) - sb.append((char)reference[i]); - byte[] altConsensus = StringUtil.stringToBytes(sb.toString()); // alternative consensus sequence we just built from the current read - - return new Consensus(altConsensus, new Cigar(elements), indexOnRef); - } - - // create a Consensus from just the indel string that falls on the reference - private Consensus createAlternateConsensus(final int indexOnRef, final byte[] reference, final byte[] indelStr, final VariantContext indel) { - if ( indexOnRef < 0 || indexOnRef >= reference.length ) - return null; - - // create the new consensus - StringBuilder sb = new StringBuilder(); - Cigar cigar = new Cigar(); - int refIdx; - - for (refIdx = 0; refIdx < indexOnRef; refIdx++) - sb.append((char)reference[refIdx]); - if ( indexOnRef > 0 ) - cigar.add(new CigarElement(indexOnRef, CigarOperator.M)); - - if ( indel.isSimpleDeletion() ) { - refIdx += indelStr.length; - cigar.add(new CigarElement(indelStr.length, CigarOperator.D)); - } - else if ( indel.isSimpleInsertion() ) { - for ( byte b : indelStr ) - sb.append((char)b); - cigar.add(new CigarElement(indelStr.length, CigarOperator.I)); - } else { - throw new IllegalStateException("Creating an alternate consensus from a complex indel is not allows"); - } - - if ( reference.length - refIdx > 0 ) - cigar.add(new CigarElement(reference.length - refIdx, CigarOperator.M)); - for (; refIdx < reference.length; refIdx++) - 
sb.append((char)reference[refIdx]); - byte[] altConsensus = StringUtil.stringToBytes(sb.toString()); // alternative consensus sequence we just built from the current read - - return new Consensus(altConsensus, cigar, 0); - } - - private Pair findBestOffset(final byte[] ref, final AlignedRead read, final int leftmostIndex) { - - // optimization: try the most likely alignment first (to get a low score to beat) - int originalAlignment = read.getOriginalAlignmentStart() - leftmostIndex; - int bestScore = mismatchQualitySumIgnoreCigar(read, ref, originalAlignment, Integer.MAX_VALUE); - int bestIndex = originalAlignment; - - // optimization: we can't get better than 0, so we can quit now - if ( bestScore == 0 ) - return new Pair(bestIndex, 0); - - // optimization: the correct alignment shouldn't be too far from the original one (or else the read wouldn't have aligned in the first place) - for ( int i = 0; i < originalAlignment; i++ ) { - int score = mismatchQualitySumIgnoreCigar(read, ref, i, bestScore); - if ( score < bestScore ) { - bestScore = score; - bestIndex = i; - } - // optimization: we can't get better than 0, so we can quit now - if ( bestScore == 0 ) - return new Pair(bestIndex, 0); - } - - final int maxPossibleStart = ref.length - read.getReadLength(); - for ( int i = originalAlignment + 1; i <= maxPossibleStart; i++ ) { - int score = mismatchQualitySumIgnoreCigar(read, ref, i, bestScore); - if ( score < bestScore ) { - bestScore = score; - bestIndex = i; - } - // optimization: we can't get better than 0, so we can quit now - if ( bestScore == 0 ) - return new Pair(bestIndex, 0); - } - - return new Pair(bestIndex, bestScore); - } - - - private boolean updateRead(final Cigar altCigar, final int altPosOnRef, final int myPosOnAlt, final AlignedRead aRead, final int leftmostIndex) { - Cigar readCigar = new Cigar(); - - // special case: there is no indel - if ( altCigar.getCigarElements().size() == 1 ) { - aRead.setAlignmentStart(leftmostIndex + myPosOnAlt); - 
readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M)); - aRead.setCigar(readCigar); - return true; - } - - CigarElement altCE1 = altCigar.getCigarElement(0); - CigarElement altCE2 = altCigar.getCigarElement(1); - - int leadingMatchingBlockLength = 0; // length of the leading M element or 0 if the leading element is I - - CigarElement indelCE; - if ( altCE1.getOperator() == CigarOperator.I ) { - indelCE=altCE1; - if ( altCE2.getOperator() != CigarOperator.M ) { - logger.warn("When the first element of the alt consensus is I, the second one must be M. Actual: " + altCigar.toString() + ". Skipping this site..."); - return false; - } - } - else { - if ( altCE1.getOperator() != CigarOperator.M ) { - logger.warn("First element of the alt consensus cigar must be M or I. Actual: " + altCigar.toString() + ". Skipping this site..."); - return false; - } - if ( altCE2.getOperator() == CigarOperator.I || altCE2.getOperator() == CigarOperator.D ) { - indelCE=altCE2; - } else { - logger.warn("When first element of the alt consensus is M, the second one must be I or D. Actual: " + altCigar.toString() + ". 
Skipping this site..."); - return false; - } - leadingMatchingBlockLength = altCE1.getLength(); - } - - // the easiest thing to do is to take each case separately - int endOfFirstBlock = altPosOnRef + leadingMatchingBlockLength; - boolean sawAlignmentStart = false; - - // for reads starting before the indel - if ( myPosOnAlt < endOfFirstBlock) { - aRead.setAlignmentStart(leftmostIndex + myPosOnAlt); - sawAlignmentStart = true; - - // for reads ending before the indel - if ( myPosOnAlt + aRead.getReadLength() <= endOfFirstBlock) { - //readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M)); - //aRead.setCigar(readCigar); - aRead.setCigar(null); // reset to original alignment - return true; - } - readCigar.add(new CigarElement(endOfFirstBlock - myPosOnAlt, CigarOperator.M)); - } - - // forward along the indel - //int indelOffsetOnRef = 0, indelOffsetOnRead = 0; - if ( indelCE.getOperator() == CigarOperator.I ) { - // for reads that end in an insertion - if ( myPosOnAlt + aRead.getReadLength() < endOfFirstBlock + indelCE.getLength() ) { - int partialInsertionLength = myPosOnAlt + aRead.getReadLength() - endOfFirstBlock; - // if we also started inside the insertion, then we need to modify the length - if ( !sawAlignmentStart ) - partialInsertionLength = aRead.getReadLength(); - readCigar.add(new CigarElement(partialInsertionLength, CigarOperator.I)); - aRead.setCigar(readCigar); - return true; - } - - // for reads that start in an insertion - if ( !sawAlignmentStart && myPosOnAlt < endOfFirstBlock + indelCE.getLength() ) { - aRead.setAlignmentStart(leftmostIndex + endOfFirstBlock); - readCigar.add(new CigarElement(indelCE.getLength() - (myPosOnAlt - endOfFirstBlock), CigarOperator.I)); - //indelOffsetOnRead = myPosOnAlt - endOfFirstBlock; - sawAlignmentStart = true; - } else if ( sawAlignmentStart ) { - readCigar.add(indelCE); - //indelOffsetOnRead = indelCE.getLength(); - } - } else if ( indelCE.getOperator() == CigarOperator.D ) { - if ( 
sawAlignmentStart ) - readCigar.add(indelCE); - //indelOffsetOnRef = indelCE.getLength(); - } - - // for reads that start after the indel - if ( !sawAlignmentStart ) { - //aRead.setAlignmentStart(leftmostIndex + myPosOnAlt + indelOffsetOnRef - indelOffsetOnRead); - //readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M)); - //aRead.setCigar(readCigar); - aRead.setCigar(null); // reset to original alignment - return true; - } - - int readRemaining = aRead.getReadBases().length; - for ( CigarElement ce : readCigar.getCigarElements() ) { - if ( ce.getOperator() != CigarOperator.D ) - readRemaining -= ce.getLength(); - } - if ( readRemaining > 0 ) - readCigar.add(new CigarElement(readRemaining, CigarOperator.M)); - aRead.setCigar(readCigar); - - return true; - } - - private boolean alternateReducesEntropy(final List reads, final byte[] reference, final int leftmostIndex) { - final int[] originalMismatchBases = new int[reference.length]; - final int[] cleanedMismatchBases = new int[reference.length]; - final int[] totalOriginalBases = new int[reference.length]; - final int[] totalCleanedBases = new int[reference.length]; - - // set to 1 to prevent dividing by zero - for ( int i=0; i < reference.length; i++ ) - originalMismatchBases[i] = totalOriginalBases[i] = cleanedMismatchBases[i] = totalCleanedBases[i] = 0; - - for (final AlignedRead read : reads) { - if (read.getRead().getAlignmentBlocks().size() > 1) - continue; - - int refIdx = read.getOriginalAlignmentStart() - leftmostIndex; - final byte[] readStr = read.getReadBases(); - final byte[] quals = read.getBaseQualities(); - - for (int j = 0; j < readStr.length; j++, refIdx++) { - if (refIdx < 0 || refIdx >= reference.length) { - //System.out.println( "Read: "+read.getRead().getReadName() + "; length = " + readStr.length() ); - //System.out.println( "Ref left: "+ leftmostIndex +"; ref length=" + reference.length() + "; read alignment start: "+read.getOriginalAlignmentStart() ); - break; - } - 
totalOriginalBases[refIdx] += quals[j]; - if (readStr[j] != reference[refIdx]) - originalMismatchBases[refIdx] += quals[j]; - } - - // reset and now do the calculation based on the cleaning - refIdx = read.getAlignmentStart() - leftmostIndex; - int altIdx = 0; - Cigar c = read.getCigar(); - for (int j = 0; j < c.numCigarElements(); j++) { - CigarElement ce = c.getCigarElement(j); - int elementLength = ce.getLength(); - switch (ce.getOperator()) { - case M: - case EQ: - case X: - for (int k = 0; k < elementLength; k++, refIdx++, altIdx++) { - if (refIdx >= reference.length) - break; - totalCleanedBases[refIdx] += quals[altIdx]; - if (readStr[altIdx] != reference[refIdx]) - cleanedMismatchBases[refIdx] += quals[altIdx]; - } - break; - case I: - altIdx += elementLength; - break; - case D: - refIdx += elementLength; - break; - case S: - default: - break; - } - } - } - - int originalMismatchColumns = 0, cleanedMismatchColumns = 0; - StringBuilder sb = new StringBuilder(); - for ( int i=0; i < reference.length; i++ ) { - if ( cleanedMismatchBases[i] == originalMismatchBases[i] ) - continue; - boolean didMismatch = false, stillMismatches = false; - if ( originalMismatchBases[i] > totalOriginalBases[i] * MISMATCH_THRESHOLD ) { - didMismatch = true; - originalMismatchColumns++; - if ( totalCleanedBases[i] > 0 && ((double)cleanedMismatchBases[i] / (double)totalCleanedBases[i]) > ((double)originalMismatchBases[i] / (double)totalOriginalBases[i]) * (1.0 - MISMATCH_COLUMN_CLEANED_FRACTION) ) { - stillMismatches = true; - cleanedMismatchColumns++; - } - } else if ( cleanedMismatchBases[i] > totalCleanedBases[i] * MISMATCH_THRESHOLD ) { - cleanedMismatchColumns++; - } - if ( snpsOutput != null ) { - if ( didMismatch ) { - sb.append(reads.get(0).getRead().getReferenceName()).append(":").append(leftmostIndex + i); - if ( stillMismatches ) - sb.append(" SAME_SNP\n"); - else - sb.append(" NOT_SNP\n"); - } - } - } - - //logger.debug("Original mismatch columns = " + 
originalMismatchColumns + "; cleaned mismatch columns = " + cleanedMismatchColumns); - - final boolean reduces = (originalMismatchColumns == 0 || cleanedMismatchColumns < originalMismatchColumns); - if ( reduces && snpsOutput != null ) { - try { - snpsOutput.write(sb.toString()); - snpsOutput.flush(); - } catch (Exception e) { - throw new UserException.CouldNotCreateOutputFile("snpsOutput", "Failed to write SNPs output file", e); - } - } - return reduces; - } - - protected static Cigar unclipCigar(Cigar cigar) { - ArrayList elements = new ArrayList(cigar.numCigarElements()); - for ( CigarElement ce : cigar.getCigarElements() ) { - if ( !isClipOperator(ce.getOperator()) ) - elements.add(ce); - } - return new Cigar(elements); - } - - private static boolean isClipOperator(CigarOperator op) { - return op == CigarOperator.S || op == CigarOperator.H || op == CigarOperator.P; - } - - protected static Cigar reclipCigar(Cigar cigar, SAMRecord read) { - ArrayList elements = new ArrayList(); - - int i = 0; - int n = read.getCigar().numCigarElements(); - while ( i < n && isClipOperator(read.getCigar().getCigarElement(i).getOperator()) ) - elements.add(read.getCigar().getCigarElement(i++)); - - elements.addAll(cigar.getCigarElements()); - - i++; - while ( i < n && !isClipOperator(read.getCigar().getCigarElement(i).getOperator()) ) - i++; - - while ( i < n && isClipOperator(read.getCigar().getCigarElement(i).getOperator()) ) - elements.add(read.getCigar().getCigarElement(i++)); - - return new Cigar(elements); - } - - private class AlignedRead { - private final GATKSAMRecord read; - private byte[] readBases = null; - private byte[] baseQuals = null; - private Cigar newCigar = null; - private int newStart = -1; - private int mismatchScoreToReference = 0; - private long alignerMismatchScore = 0; - - public AlignedRead(GATKSAMRecord read) { - this.read = read; - mismatchScoreToReference = 0; - } - - public GATKSAMRecord getRead() { - return read; - } - - public int getReadLength() { 
- return readBases != null ? readBases.length : read.getReadLength(); - } - - public byte[] getReadBases() { - if ( readBases == null ) - getUnclippedBases(); - return readBases; - } - - public byte[] getBaseQualities() { - if ( baseQuals == null ) - getUnclippedBases(); - return baseQuals; - } - - // pull out the bases that aren't clipped out - private void getUnclippedBases() { - readBases = new byte[getReadLength()]; - baseQuals = new byte[getReadLength()]; - byte[] actualReadBases = read.getReadBases(); - byte[] actualBaseQuals = read.getBaseQualities(); - int fromIndex = 0, toIndex = 0; - - for ( CigarElement ce : read.getCigar().getCigarElements() ) { - int elementLength = ce.getLength(); - switch ( ce.getOperator() ) { - case S: - fromIndex += elementLength; - break; - case M: - case EQ: - case X: - case I: - System.arraycopy(actualReadBases, fromIndex, readBases, toIndex, elementLength); - System.arraycopy(actualBaseQuals, fromIndex, baseQuals, toIndex, elementLength); - fromIndex += elementLength; - toIndex += elementLength; - default: - break; - } - } - - // if we got clipped, trim the array - if ( fromIndex != toIndex ) { - byte[] trimmedRB = new byte[toIndex]; - byte[] trimmedBQ = new byte[toIndex]; - System.arraycopy(readBases, 0, trimmedRB, 0, toIndex); - System.arraycopy(baseQuals, 0, trimmedBQ, 0, toIndex); - readBases = trimmedRB; - baseQuals = trimmedBQ; - } - } - - public Cigar getCigar() { - return (newCigar != null ? newCigar : read.getCigar()); - } - - public void setCigar(Cigar cigar) { - setCigar(cigar, true); - } - - // tentatively sets the new Cigar, but it needs to be confirmed later - public void setCigar(Cigar cigar, boolean fixClippedCigar) { - if ( cigar == null ) { - newCigar = null; - return; - } - - if ( fixClippedCigar && getReadBases().length < read.getReadLength() ) - cigar = reclipCigar(cigar); - - // no change? - if ( read.getCigar().equals(cigar) ) { - newCigar = null; - return; - } - - // no indel? 
- String str = cigar.toString(); - if ( !str.contains("D") && !str.contains("I") ) { - logger.debug("Modifying a read with no associated indel; although this is possible, it is highly unlikely. Perhaps this region should be double-checked: " + read.getReadName() + " near " + read.getReferenceName() + ":" + read.getAlignmentStart()); - // newCigar = null; - // return; - } - - newCigar = cigar; - } - - // pull out the bases that aren't clipped out - private Cigar reclipCigar(Cigar cigar) { - return IndelRealigner.reclipCigar(cigar, read); - } - - // tentatively sets the new start, but it needs to be confirmed later - public void setAlignmentStart(int start) { - newStart = start; - } - - public int getAlignmentStart() { - return (newStart != -1 ? newStart : read.getAlignmentStart()); - } - - public int getOriginalAlignmentStart() { - return read.getAlignmentStart(); - } - - // finalizes the changes made. - // returns true if this record actually changes, false otherwise - public boolean finalizeUpdate() { - // if we haven't made any changes, don't do anything - if ( newCigar == null ) - return false; - if ( newStart == -1 ) - newStart = read.getAlignmentStart(); - else if ( Math.abs(newStart - read.getAlignmentStart()) > MAX_POS_MOVE_ALLOWED ) { - logger.debug(String.format("Attempting to realign read %s at %d more than %d bases to %d.", read.getReadName(), read.getAlignmentStart(), MAX_POS_MOVE_ALLOWED, newStart)); - return false; - } - - // annotate the record with the original cigar (and optionally the alignment start) - if ( !NO_ORIGINAL_ALIGNMENT_TAGS ) { - read.setAttribute(ORIGINAL_CIGAR_TAG, read.getCigar().toString()); - if ( newStart != read.getAlignmentStart() ) - read.setAttribute(ORIGINAL_POSITION_TAG, read.getAlignmentStart()); - } - - read.setCigar(newCigar); - read.setAlignmentStart(newStart); - - return true; - } - - public void setMismatchScoreToReference(int score) { - mismatchScoreToReference = score; - } - - public int 
getMismatchScoreToReference() { - return mismatchScoreToReference; - } - - public void setAlignerMismatchScore(long score) { - alignerMismatchScore = score; - } - - public long getAlignerMismatchScore() { - return alignerMismatchScore; - } - } - - private static class Consensus { - public final byte[] str; - public final ArrayList> readIndexes; - public final int positionOnReference; - public int mismatchSum; - public Cigar cigar; - - public Consensus(byte[] str, Cigar cigar, int positionOnReference) { - this.str = str; - this.cigar = cigar; - this.positionOnReference = positionOnReference; - mismatchSum = 0; - readIndexes = new ArrayList>(); - } - - @Override - public boolean equals(Object o) { - return ( this == o || (o instanceof Consensus && Arrays.equals(this.str,(((Consensus)o).str)) ) ); - } - - public boolean equals(Consensus c) { - return ( this == c || Arrays.equals(this.str,c.str) ) ; - } - - @Override - public int hashCode() { - return Arrays.hashCode(this.str); - } - } - -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java deleted file mode 100644 index a273cf01d..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ /dev/null @@ -1,521 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.indels; - -import com.google.java.contract.Ensures; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM; -import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM; -import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; -import org.broadinstitute.sting.utils.pairhmm.PairHMM; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.variant.variantcontext.Allele; - -import java.util.Arrays; -import java.util.LinkedHashMap; -import java.util.LinkedList; -import java.util.Map; - - -public class PairHMMIndelErrorModel { - public static final int BASE_QUAL_THRESHOLD = 20; - - private boolean DEBUG = false; - - private static final int 
MAX_CACHED_QUAL = 127; - - private static final double baseMatchArray[]; - private static final double baseMismatchArray[]; - - private static final int START_HRUN_GAP_IDX = 4; - private static final int MAX_HRUN_GAP_IDX = 20; - - private static final byte MIN_GAP_OPEN_PENALTY = 30; - private static final byte MIN_GAP_CONT_PENALTY = 10; - private static final byte GAP_PENALTY_HRUN_STEP = 1; // each increase in hrun decreases gap penalty by this. - - private final byte[] GAP_OPEN_PROB_TABLE; - private final byte[] GAP_CONT_PROB_TABLE; - - private final PairHMM pairHMM; - - ///////////////////////////// - // Private Member Variables - ///////////////////////////// - - static { - baseMatchArray = new double[MAX_CACHED_QUAL+1]; - baseMismatchArray = new double[MAX_CACHED_QUAL+1]; - for (int k=1; k <= MAX_CACHED_QUAL; k++) { - double baseProb = Math.pow(10, -k/10.); - - - baseMatchArray[k] = Math.log10(1-baseProb); - baseMismatchArray[k] = Math.log10(baseProb); - } - } - - public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, final PairHMM.HMM_IMPLEMENTATION hmmType ) { - this.DEBUG = deb; - - switch (hmmType) { - case EXACT: - pairHMM = new Log10PairHMM(true); - break; - case ORIGINAL: - pairHMM = new Log10PairHMM(false); - break; - case LOGLESS_CACHING: - pairHMM = new LoglessPairHMM(); - break; - case ARRAY_LOGLESS: - pairHMM = new ArrayLoglessPairHMM(); - break; - default: - throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. 
Acceptable options are ORIGINAL, EXACT, LOGLESS_CACHING, or ARRAY_LOGLESS."); - } - - // fill gap penalty table, affine naive model: - this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; - this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; - - for (int i = 0; i < START_HRUN_GAP_IDX; i++) { - GAP_OPEN_PROB_TABLE[i] = indelGOP; - GAP_CONT_PROB_TABLE[i] = indelGCP; - } - - double step = GAP_PENALTY_HRUN_STEP/10.0; - - // initialize gop and gcp to their default values - byte gop = indelGOP; - byte gcp = indelGCP; - - // all of the following is computed in QUal-space - for (int i=START_HRUN_GAP_IDX; i < MAX_HRUN_GAP_IDX; i++) { - gop -= GAP_PENALTY_HRUN_STEP; - if (gop < MIN_GAP_OPEN_PENALTY) - gop = MIN_GAP_OPEN_PENALTY; - - gcp -= step; - if(gcp < MIN_GAP_CONT_PENALTY) - gcp = MIN_GAP_CONT_PENALTY; - GAP_OPEN_PROB_TABLE[i] = gop; - GAP_CONT_PROB_TABLE[i] = gcp; - } - - } - - static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) { - // compute forward hrun length, example: - // AGGTGACCCCCCTGAGAG - // 001000012345000000 - hrunArray[0] = 0; - int[] hforward = new int[hrunArray.length]; - int[] hreverse = new int[hrunArray.length]; - - for (int i = 1; i < refBytes.length; i++) { - if (refBytes[i] == refBytes[i-1]) - hforward[i] = hforward[i-1]+1; - else - hforward[i] = 0; - } - - // do similar thing for reverse length, example: - // AGGTGACCCCCCTGAGAG - // 021000543210000000 - // and then accumulate with forward values. 
- // Total: - // AGGTGACCCCCCTGAGAG - // 022000555555000000 - for (int i=refBytes.length-1; i > 0; i--) { - if (refBytes[i-1] == refBytes[i]) - hreverse[i-1] += hreverse[i]+1; - } - - for (int i = 1; i < refBytes.length; i++) - hrunArray[i] = hforward[i]+hreverse[i]; - } - - - private void fillGapProbabilities(final int[] hrunProfile, - final byte[] contextLogGapOpenProbabilities, - final byte[] contextLogGapContinuationProbabilities) { - // fill based on lookup table - for (int i = 0; i < hrunProfile.length; i++) { - if (hrunProfile[i] >= MAX_HRUN_GAP_IDX) { - contextLogGapOpenProbabilities[i] = GAP_OPEN_PROB_TABLE[MAX_HRUN_GAP_IDX-1]; - contextLogGapContinuationProbabilities[i] = GAP_CONT_PROB_TABLE[MAX_HRUN_GAP_IDX-1]; - } - else { - contextLogGapOpenProbabilities[i] = GAP_OPEN_PROB_TABLE[hrunProfile[i]]; - contextLogGapContinuationProbabilities[i] = GAP_CONT_PROB_TABLE[hrunProfile[i]]; - } - } - } - - private LinkedHashMap trimHaplotypes(final LinkedHashMap haplotypeMap, - long startLocationInRefForHaplotypes, - long stopLocationInRefForHaplotypes, - final ReferenceContext ref){ - - final LinkedHashMap trimmedHaplotypeMap = new LinkedHashMap<>(); - for (final Allele a: haplotypeMap.keySet()) { - - final Haplotype haplotype = haplotypeMap.get(a); - - if (stopLocationInRefForHaplotypes > haplotype.getStopPosition()) - stopLocationInRefForHaplotypes = haplotype.getStopPosition(); - - if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) - startLocationInRefForHaplotypes = haplotype.getStartPosition(); - else if (startLocationInRefForHaplotypes > haplotype.getStopPosition()) - startLocationInRefForHaplotypes = haplotype.getStopPosition(); - - final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); - final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); - - if (DEBUG) - System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d\n", - indStart, indStop, 
ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); - - // get the trimmed haplotype-bases array and create a new haplotype based on it. Pack this into the new map - final byte[] trimmedHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); - final Haplotype trimmedHaplotype = new Haplotype(trimmedHaplotypeBases, haplotype.isReference()); - trimmedHaplotypeMap.put(a, trimmedHaplotype); - } - return trimmedHaplotypeMap; - } - - - public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup, - final LinkedHashMap haplotypeMap, - final ReferenceContext ref, - final int eventLength, - final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, - final double downsamplingFraction) { - final int numHaplotypes = haplotypeMap.size(); - - final int readCounts[] = new int[pileup.getNumberOfElements()]; - final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts); - perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); - return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); - - } - - /** - * Should we clip a downstream portion of a read because it spans off the end of a haplotype? - * - * @param read the read in question - * @param refWindowStop the end of the reference window - * @return true if the read needs to be clipped, false otherwise - */ - protected static boolean mustClipDownstream(final GATKSAMRecord read, final int refWindowStop) { - return ( !read.isEmpty() && read.getSoftStart() < refWindowStop && read.getSoftStart() + read.getReadLength() > refWindowStop ); - } - - /** - * Should we clip a upstream portion of a read because it spans off the end of a haplotype? 
- * - * @param read the read in question - * @param refWindowStart the start of the reference window - * @return true if the read needs to be clipped, false otherwise - */ - protected static boolean mustClipUpstream(final GATKSAMRecord read, final int refWindowStart) { - return ( !read.isEmpty() && read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart ); - } - - @Ensures("result != null && result.length == pileup.getNumberOfElements()") - public synchronized double[][] computeGeneralReadHaplotypeLikelihoods(final ReadBackedPileup pileup, - final LinkedHashMap haplotypeMap, - final ReferenceContext ref, - final int eventLength, - final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, - final int[] readCounts) { - final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; - - final LinkedList readList = new LinkedList<>(); - final Map readGCPArrayMap = new LinkedHashMap<>(); - int readIdx=0; - for (PileupElement p: pileup) { - // > 1 when the read is a consensus read representing multiple independent observations - readCounts[readIdx] = p.getRepresentativeCount(); - - // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location) - if (perReadAlleleLikelihoodMap.containsPileupElement(p)) { - Map el = perReadAlleleLikelihoodMap.getLikelihoodsAssociatedWithPileupElement(p); - int j=0; - for (Allele a: haplotypeMap.keySet()) { - readLikelihoods[readIdx][j++] = el.get(a); - } - } - else { - // extra padding on candidate haplotypes to make sure reads are always strictly contained - // in them - a value of 1 will in theory do but we use a slightly higher one just for safety sake, mostly - // in case bases at edge of reads have lower quality. 
- final int trailingBases = 3; - final int refWindowStart = ref.getWindow().getStart() + trailingBases; - final int refWindowStop = ref.getWindow().getStop() - trailingBases; - - if (DEBUG) { - System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString()); - } - - GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); - - // if the read extends beyond the downstream (right) end of the reference window, clip it - if ( mustClipDownstream(read, refWindowStop) ) - read = ReadClipper.hardClipByReadCoordinates(read, read.getSoftStart() + read.getReadLength() - refWindowStop + 1, read.getReadLength() - 1); - - // if the read extends beyond the upstream (left) end of the reference window, clip it - if ( mustClipUpstream(read, refWindowStart) ) - read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, refWindowStart); - - if (read.isEmpty()) - continue; - - // hard-clip low quality ends - this may introduce extra H elements in CIGAR string - read = ReadClipper.hardClipLowQualEnds(read, (byte) BASE_QUAL_THRESHOLD ); - - if (read.isEmpty()) - continue; - - // get bases of candidate haplotypes that overlap with reads - final long readStart = read.getSoftStart(); - final long readEnd = read.getSoftEnd(); - - // see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match, - // but they're actually consistent with the insertion! - // Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning. - // Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read. 
- final long eventStartPos = ref.getLocus().getStart(); - - // compute total number of clipped bases (soft or hard clipped) and only use them if necessary - final boolean softClips = useSoftClippedBases(read, eventStartPos, eventLength); - final int numStartSoftClippedBases = softClips ? read.getAlignmentStart()- read.getSoftStart() : 0; - final int numEndSoftClippedBases = softClips ? read.getSoftEnd()- read.getAlignmentEnd() : 0 ; - final byte [] unclippedReadBases = read.getReadBases(); - final byte [] unclippedReadQuals = read.getBaseQualities(); - - /** - * Compute genomic locations that candidate haplotypes will span. - * Read start and stop locations (variables readStart and readEnd) are the original unclipped positions from SAMRecord, - * adjusted by hard clips from Cigar string and by qual-based soft-clipping performed above. - * We will propose haplotypes that overlap the read with some padding. - * True read start = readStart + numStartSoftClippedBases - ReadUtils.getFirstInsertionOffset(read) - * Last term is because if a read starts with an insertion then these bases are not accounted for in readStart. 
- * trailingBases is a padding constant(=3) and we additionally add abs(eventLength) to both sides of read to be able to - * differentiate context between two haplotypes - */ - final int absEventLength = Math.abs(eventLength); - long startLocationInRefForHaplotypes = Math.max(readStart + numStartSoftClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read) - absEventLength, 0); - long stopLocationInRefForHaplotypes = readEnd - numEndSoftClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read) + absEventLength; - - if (DEBUG) - System.out.format("orig Start:%d orig stop: %d\n", startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); - - int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases; - - if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) { - startLocationInRefForHaplotypes = ref.getWindow().getStart(); // read starts before haplotype: read will have to be cut numStartSoftClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes; - } - else if (startLocationInRefForHaplotypes > ref.getWindow().getStop()) { - startLocationInRefForHaplotypes = ref.getWindow().getStop(); // read starts after haplotype: read will have to be clipped completely; - } - - // candidate haplotype cannot go beyond reference context - if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) { - stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context - } - - if (stopLocationInRefForHaplotypes <= startLocationInRefForHaplotypes + readLength) { - stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1; // if there's an insertion in the read, the read stop position will be less than start + read legnth, but we want to compute likelihoods in the whole region that a read might overlap - } - - // ok, we now figured out the total number of clipped bases on both ends. 
- // Figure out where we want to place the haplotype to score read against - - if (DEBUG) - System.out.format("numStartSoftClippedBases: %d numEndSoftClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", - numStartSoftClippedBases, numEndSoftClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength()); - - // LinkedHashMap readEl = new LinkedHashMap(); - - /** - * Check if we'll end up with an empty read once all clipping is done - */ - if (numStartSoftClippedBases + numEndSoftClippedBases >= unclippedReadBases.length) { - int j=0; - for (Allele a: haplotypeMap.keySet()) { - perReadAlleleLikelihoodMap.add(p,a,0.0); - readLikelihoods[readIdx][j++] = 0.0; - } - } - else { - final int endOfCopy = unclippedReadBases.length - numEndSoftClippedBases; - final byte[] readBases = Arrays.copyOfRange(unclippedReadBases, numStartSoftClippedBases, endOfCopy); - final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals, numStartSoftClippedBases, endOfCopy); - - int j=0; - - final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; - final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length]; - - // get homopolymer length profile for current haplotype - final int[] hrunProfile = new int[readBases.length]; - getContextHomopolymerLength(readBases,hrunProfile); - fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); - - // get the base insertion and deletion qualities to use - final byte[] baseInsertionQualities, baseDeletionQualities; - if ( read.hasBaseIndelQualities() ) { - baseInsertionQualities = Arrays.copyOfRange(read.getBaseInsertionQualities(), numStartSoftClippedBases, endOfCopy); - baseDeletionQualities = Arrays.copyOfRange(read.getBaseDeletionQualities(), numStartSoftClippedBases, endOfCopy); - } else { - baseInsertionQualities = 
contextLogGapOpenProbabilities; - baseDeletionQualities = contextLogGapOpenProbabilities; - } - - // Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM - final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities); - readList.add(processedRead); - - // Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM - readGCPArrayMap.put(processedRead,contextLogGapContinuationProbabilities); - - // Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the appropriate genomic locations - final Map trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref); - - // Get the likelihoods for our clipped read against each of our trimmed haplotypes. - final PerReadAlleleLikelihoodMap singleReadRawLikelihoods = pairHMM.computeLikelihoods(readList, trimmedHaplotypeMap, readGCPArrayMap); - - // Pack the original pilup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array - for (Allele a: trimmedHaplotypeMap.keySet()){ - double readLikelihood = singleReadRawLikelihoods.getLikelihoodAssociatedWithReadAndAllele(processedRead, a); - perReadAlleleLikelihoodMap.add(p, a, readLikelihood); - readLikelihoods[readIdx][j++] = readLikelihood; - } - // The readList for sending to the HMM should only ever contain 1 read, as each must be clipped individually - readList.remove(processedRead); - - // The same is true for the read/GCP-array map - readGCPArrayMap.remove(processedRead); - } - } - readIdx++; - } - - if (DEBUG) { - System.out.println("\nLikelihood summary"); - for (readIdx=0; readIdx < pileup.getNumberOfElements(); readIdx++) { - System.out.format("Read Index: %d ",readIdx); - for (int i=0; i < readLikelihoods[readIdx].length; i++) - System.out.format("L%d: 
%f ",i,readLikelihoods[readIdx][i]); - System.out.println(); - } - - } - - return readLikelihoods; - } - - private boolean useSoftClippedBases(GATKSAMRecord read, long eventStartPos, int eventLength) { - return !((read.getAlignmentStart() >= eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)); - } - -// private int computeFirstDifferingPosition(byte[] b1, byte[] b2) { -// if (b1.length != b2.length) -// return 0; // sanity check -// -// for (int i=0; i < b1.length; i++ ){ -// if ( b1[i]!= b2[i] ) -// return i; -// } -// return b1.length; -// } - - private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { - final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; - - // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix - for (int i=0; i < numHaplotypes; i++) { - for (int j=i; j < numHaplotypes; j++){ - // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] - // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2) - //readLikelihoods[k][j] has log10(Pr(R_k) | H[j] ) - for (int readIdx = 0; readIdx < readLikelihoods.length; readIdx++) { - // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) - // First term is approximated by Jacobian log with table lookup. 
- if (Double.isInfinite(readLikelihoods[readIdx][i]) && Double.isInfinite(readLikelihoods[readIdx][j])) - continue; - final double li = readLikelihoods[readIdx][i]; - final double lj = readLikelihoods[readIdx][j]; - final int readCount = readCounts[readIdx]; - haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.approximateLog10SumLog10(li, lj) + MathUtils.LOG_ONE_HALF); - } - } - } - - final double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2]; - int k=0; - for (int j=0; j < numHaplotypes; j++) { - for (int i=0; i <= j; i++){ - genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j]; - } - } - - // renormalize so that max element is zero. - return MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java deleted file mode 100644 index 688f05934..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java +++ /dev/null @@ -1,94 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. 
NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.Arrays; - -class Haplotype extends BaseArray implements Cloneable { - public Haplotype(byte[] bases) { - super(bases); - } - - private Haplotype(Byte[] bases) { - super(bases); - } - - public Haplotype(Haplotype other) { - super(other); - } - - public Haplotype(BaseArray baseArr) { - super(baseArr.bases); - - if (baseArr.getNonNullIndices().length != baseArr.bases.length) - throw new ReviewedStingException("Should NEVER call Haplotype ctor with null bases!"); - } - - public void updateBase(int index, Byte base) { - if (base == null) { - throw new ReviewedStingException("Internal error: CANNOT have null for a missing Haplotype base!"); - } - super.updateBase(index, base); - } - - public Haplotype clone() { - try { - super.clone(); - } catch (CloneNotSupportedException e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - } - return new Haplotype(this); - } - - // Returns a new Haplotype containing the portion of this Haplotype between the specified fromIndex, inclusive, and toIndex, exclusive. - - public Haplotype subHaplotype(int fromIndex, int toIndex) { - return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size()))); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java deleted file mode 100644 index 2a31b5425..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ /dev/null @@ -1,989 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.samples.Sample; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.vcf.*; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.variant.variantcontext.*; - 
-import java.io.PrintStream; -import java.util.*; - -/** - * Computes the most likely genotype combination and phases trios and parent/child pairs - * - *

- * PhaseByTransmission is a GATK tool that 1) computes the most likely genotype combination and phases trios and parent/child pairs given their genotype likelihoods and a mutation prior and 2) phases - * all sites were parent/child transmission can be inferred unambiguously. It reports the genotype combination (and hence phasing) probability. - * Ambiguous sites are: - *

    - *
  • Sites where all individuals are heterozygous
  • - *
  • Sites where there is a Mendelian violation
  • - *
- * Missing genotypes are handled as follows: - *
    - *
  • In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.
  • - *
  • In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.
  • - *
  • In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous and a phasing probability is emitted.
  • - *
  • In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.
  • - *
- * - *

Input

- *

- *

    - *
  • A VCF variant set containing trio(s) and/or parent/child pair(s).
  • - *
  • A PED pedigree file containing the description of the individuals relationships.
  • - *
- *

- * - *

Options

- *

- *

    - *
  • MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that remain in mendelian violation after being assigned the most likely genotype - * combination will be reported there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission probability, - * and each individual genotype, depth, allelic depth and likelihoods.
  • - *
  • DeNovoPrior: Mutation prio; default is 1e-8
  • - *
- *

- * - *

Output

- *

- * An VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where non ambiguous.. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T PhaseByTransmission \
- *   -V input.vcf \
- *   -ped input.ped \
- *   -o output.vcf
- * 
- * - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) -public class PhaseByTransmission extends RodWalker, HashMap> { - - @ArgumentCollection - protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - - @Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.") - private PrintStream mvFile = null; - - @Argument(shortName = "prior",required = false,fullName = "DeNovoPrior", doc="Prior for de novo mutations. Default: 1e-8") - private double deNovoPrior=1e-8; - - @Argument(shortName = "fatherAlleleFirst",required = false,fullName = "FatherAlleleFirst", doc="Ouputs the father allele as the first allele in phased child genotype. i.e. father|mother rather than mother|father.") - private boolean fatherFAlleleFirst=false; - - @Output - protected VariantContextWriter vcfWriter = null; - - private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP"; - private final String SOURCE_NAME = "PhaseByTransmission"; - - public final double NO_TRANSMISSION_PROB = -1.0; - - private ArrayList trios = new ArrayList(); - - //Matrix of priors for all genotype combinations - private EnumMap>> mvCountMatrix; - - //Matrix of allele transmission - private EnumMap>> transmissionMatrix; - - //Metrics counters hash keys - private final Byte NUM_TRIO_GENOTYPES_CALLED = 0; - private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1; - private final Byte NUM_TRIO_GENOTYPES_PHASED = 2; - private final Byte NUM_TRIO_HET_HET_HET = 3; - private final Byte NUM_TRIO_VIOLATIONS = 4; - private final Byte NUM_TRIO_DOUBLE_VIOLATIONS = 10; - private final Byte NUM_PAIR_GENOTYPES_CALLED = 5; - private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6; - private final Byte NUM_PAIR_GENOTYPES_PHASED = 7; - private final Byte NUM_PAIR_HET_HET = 8; - private final Byte NUM_PAIR_VIOLATIONS = 9; - private final Byte 
NUM_GENOTYPES_MODIFIED = 11; - - //Random number generator - private Random rand = new Random(); - - private enum FamilyMember { - MOTHER, - FATHER, - CHILD - } - - //Stores a conceptual trio or parent/child pair genotype combination along with its phasing. - //This combination can then be "applied" to a given trio or pair using the getPhasedGenotypes method. - private class TrioPhase { - - //Create 2 fake alleles - //The actual bases will never be used but the Genotypes created using the alleles will be. - private final Allele REF = Allele.create("A",true); - private final Allele VAR = Allele.create("A",false); - private final Allele NO_CALL = Allele.create(".",false); - private final String DUMMY_NAME = "DummySample"; - - private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class); - - private ArrayList getAlleles(GenotypeType genotype){ - ArrayList alleles = new ArrayList(2); - if(genotype == GenotypeType.HOM_REF){ - alleles.add(REF); - alleles.add(REF); - } - else if(genotype == GenotypeType.HET){ - alleles.add(REF); - alleles.add(VAR); - } - else if(genotype == GenotypeType.HOM_VAR){ - alleles.add(VAR); - alleles.add(VAR); - } - else{ - return null; - } - return alleles; - } - - private boolean isPhasable(GenotypeType genotype){ - return genotype == GenotypeType.HOM_REF || genotype == GenotypeType.HET || genotype == GenotypeType.HOM_VAR; - } - - //Create a new Genotype based on information from a single individual - //Homozygous genotypes will be set as phased, heterozygous won't be - private void phaseSingleIndividualAlleles(GenotypeType genotype, FamilyMember familyMember){ - boolean phase = genotype == GenotypeType.HOM_REF || genotype == GenotypeType.HOM_VAR; - trioPhasedGenotypes.put(familyMember, makeGenotype(genotype, phase)); - } - - private Genotype makeGenotype(final GenotypeType type, boolean phase) { - return makeGenotype(getAlleles(type), phase); - } - - private Genotype makeGenotype(final List alleles, boolean phase) { - final 
GenotypeBuilder gb = new GenotypeBuilder(DUMMY_NAME, alleles); - gb.phased(phase); - return gb.make(); - } - - //Find the phase for a parent/child pair - private void phasePairAlleles(GenotypeType parentGenotype, GenotypeType childGenotype, FamilyMember parent){ - - //Special case for Het/Het as it is ambiguous - if(parentGenotype == GenotypeType.HET && childGenotype == GenotypeType.HET){ - trioPhasedGenotypes.put(parent, makeGenotype(parentGenotype, false)); - trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childGenotype, false)); - return; - } - - ArrayList parentAlleles = getAlleles(parentGenotype); - ArrayList childAlleles = getAlleles(childGenotype); - ArrayList parentPhasedAlleles = new ArrayList(2); - ArrayList childPhasedAlleles = new ArrayList(2); - - //If there is a possible phasing between the parent and child => phase - int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); - if(childTransmittedAlleleIndex > -1){ - trioPhasedGenotypes.put(parent, makeGenotype(parentAlleles, true)); - childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); - if(parent.equals(FamilyMember.MOTHER)) - childPhasedAlleles.add(childAlleles.get(0)); - else - childPhasedAlleles.add(0,childAlleles.get(0)); - trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAlleles, true)); - } - else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ - parentPhasedAlleles.add(parentAlleles.get(1)); - parentPhasedAlleles.add(parentAlleles.get(0)); - trioPhasedGenotypes.put(parent, makeGenotype(parentPhasedAlleles, true)); - childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); - if(parent.equals(FamilyMember.MOTHER)) - childPhasedAlleles.add(childAlleles.get(0)); - else - childPhasedAlleles.add(0,childAlleles.get(0)); - trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAlleles, true)); - } - //This is a Mendelian Violation => Do not phase - else{ - 
trioPhasedGenotypes.put(parent, makeGenotype(parentGenotype, false)); - trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childGenotype, false)); - } - } - - //Phases a family by transmission - private void phaseFamilyAlleles(GenotypeType mother, GenotypeType father, GenotypeType child){ - - Set> possiblePhasedChildGenotypes = new HashSet>(); - ArrayList motherAlleles = getAlleles(mother); - ArrayList fatherAlleles = getAlleles(father); - ArrayList childAlleles = getAlleles(child); - - //Build all possible child genotypes for the given parent's genotypes - for (Allele momAllele : motherAlleles) { - for (Allele fatherAllele : fatherAlleles) { - ArrayList possiblePhasedChildAlleles = new ArrayList(2); - possiblePhasedChildAlleles.add(momAllele); - possiblePhasedChildAlleles.add(fatherAllele); - possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles); - } - } - - for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) { - int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0)); - int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1)); - //If a possible combination has been found, create the genotypes - if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) { - //Create mother's genotype - ArrayList motherPhasedAlleles = new ArrayList(2); - motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0)); - if(motherAlleles.get(0) != motherPhasedAlleles.get(0)) - motherPhasedAlleles.add(motherAlleles.get(0)); - else - motherPhasedAlleles.add(motherAlleles.get(1)); - trioPhasedGenotypes.put(FamilyMember.MOTHER, makeGenotype(motherPhasedAlleles, true)); - - //Create father's genotype - ArrayList fatherPhasedAlleles = new ArrayList(2); - fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1)); - if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0)) - fatherPhasedAlleles.add(fatherAlleles.get(0)); - else - fatherPhasedAlleles.add(fatherAlleles.get(1)); - 
trioPhasedGenotypes.put(FamilyMember.FATHER, makeGenotype(fatherPhasedAlleles,true)); - - //Create child's genotype - trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAllelesAlleles,true)); - - //Once a phased combination is found; exit - return; - } - } - - //If this is reached then no phasing could be found - trioPhasedGenotypes.put(FamilyMember.MOTHER, makeGenotype(mother,false)); - trioPhasedGenotypes.put(FamilyMember.FATHER, makeGenotype(father,false)); - trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(child,false)); - } - - /* Constructor: Creates a conceptual trio genotype combination from the given genotypes. - If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair - or single individual. - */ - public TrioPhase(GenotypeType mother, GenotypeType father, GenotypeType child){ - - //Take care of cases where one or more family members are no call - if(!isPhasable(child)){ - phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); - phaseSingleIndividualAlleles(father, FamilyMember.FATHER); - phaseSingleIndividualAlleles(child, FamilyMember.CHILD); - } - else if(!isPhasable(mother)){ - phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); - if(!isPhasable(father)){ - phaseSingleIndividualAlleles(father, FamilyMember.FATHER); - phaseSingleIndividualAlleles(child, FamilyMember.CHILD); - } - else - phasePairAlleles(father, child, FamilyMember.FATHER); - } - else if(!isPhasable(father)){ - phasePairAlleles(mother, child, FamilyMember.MOTHER); - phaseSingleIndividualAlleles(father, FamilyMember.FATHER); - } - //Special case for Het/Het/Het as it is ambiguous - else if(mother == GenotypeType.HET && father == GenotypeType.HET && child == GenotypeType.HET){ - phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); - phaseSingleIndividualAlleles(father, FamilyMember.FATHER); - phaseSingleIndividualAlleles(child, FamilyMember.CHILD); - } - //All family members have genotypes and at least one of them 
is not Het - else{ - phaseFamilyAlleles(mother, father, child); - } - - //If child should phased genotype should be father first, then swap the alleles - if(fatherFAlleleFirst && trioPhasedGenotypes.get(FamilyMember.CHILD).isPhased()){ - ArrayList childAlleles = new ArrayList(trioPhasedGenotypes.get(FamilyMember.CHILD).getAlleles()); - childAlleles.add(childAlleles.remove(0)); - trioPhasedGenotypes.put(FamilyMember.CHILD,makeGenotype(childAlleles,true)); - } - - } - - /** - * Applies the trio genotype combination to the given trio. - * @param ref: Reference allele - * @param alt: Alternate allele - * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination - * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination - * @param childGenotype: Genotype of the child to phase using this trio genotype combination - * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable) - * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child - */ - public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ - phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); - phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); - phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); - } - - private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ - - int phredScoreTransmission = -1; - if(transmissionProb != NO_TRANSMISSION_PROB){ - double 
dphredScoreTransmission = QualityUtils.phredScaleLog10ErrorRate(Math.log10(1 - (transmissionProb))); - phredScoreTransmission = dphredScoreTransmission < Byte.MAX_VALUE ? (byte)dphredScoreTransmission : Byte.MAX_VALUE; - } - //Handle null, missing and unavailable genotypes - //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable - //genotype so it is safe to return the original genotype in this case. - //In addition, if the phasing confidence is 0, then return the unphased, original genotypes. - if(phredScoreTransmission ==0 || genotype == null || !isPhasable(genotype.getType())) - return genotype; - - //Add the transmission probability - Map genotypeAttributes = new HashMap(); - genotypeAttributes.putAll(genotype.getExtendedAttributes()); - if(transmissionProb>NO_TRANSMISSION_PROB) - genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission); - - ArrayList phasedAlleles = new ArrayList(2); - for(Allele allele : phasedGenotype.getAlleles()){ - if(allele.isReference()) - phasedAlleles.add(refAllele); - else if(allele.isNonReference()) - phasedAlleles.add(altAllele); - //At this point there should not be any other alleles left - else - throw new UserException(String.format("BUG: Unexpected allele: %s. 
Please report.",allele.toString())); - - } - - //Compute the new Log10Error if the genotype is different from the original genotype - double log10Error; - if(genotype.getType() == phasedGenotype.getType()) - log10Error = genotype.getLog10PError(); - else - log10Error = genotype.getLikelihoods().getLog10GQ(phasedGenotype.getType()); - - return new GenotypeBuilder(genotype).alleles(phasedAlleles) - .log10PError(log10Error) - .attributes(genotypeAttributes) - .phased(phasedGenotype.isPhased()).make(); - } - - - } - - /** - * Parse the familial relationship specification, build the transmission matrices and initialize VCF writer - */ - public void initialize() { - ArrayList rodNames = new ArrayList(); - rodNames.add(variantCollection.variants.getName()); - Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); - Set vcfSamples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); - - //Get the trios from the families passed as ped - setTrios(); - if(trios.size()<1) - throw new UserException.BadInput("No PED file passed or no trios found in PED file. 
Aborted."); - - - Set headerLines = new HashSet(); - headerLines.addAll(GATKVCFUtils.getHeaderFields(this.getToolkit())); - headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the genotype combination and phase given that the genotypes are correct")); - headerLines.add(new VCFHeaderLine("source", SOURCE_NAME)); - vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); - - buildMatrices(); - - if(mvFile != null) - mvFile.println("CHROM\tPOS\tAC\tFAMILY\tTP\tMOTHER_GT\tMOTHER_DP\tMOTHER_AD\tMOTHER_PL\tFATHER_GT\tFATHER_DP\tFATHER_AD\tFATHER_PL\tCHILD_GT\tCHILD_DP\tCHILD_AD\tCHILD_PL"); - - } - - /** - * Select trios and parent/child pairs only - */ - private void setTrios(){ - - Map> families = this.getSampleDB().getFamilies(); - Set family; - ArrayList parents; - for(Map.Entry> familyEntry : families.entrySet()){ - family = familyEntry.getValue(); - if(family.size()<2 || family.size()>3){ - logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyEntry.getKey(),family.size())); - } - else{ - for(Sample familyMember : family){ - parents = familyMember.getParents(); - if(parents.size()>0){ - if(family.containsAll(parents)) - this.trios.add(familyMember); - else - logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. 
Family skipped.",familyEntry.getKey())); - break; - } - } - } - - } - - - - } - - //Create the transmission matrices - private void buildMatrices(){ - mvCountMatrix = new EnumMap>>(GenotypeType.class); - transmissionMatrix = new EnumMap>>(GenotypeType.class); - for(GenotypeType mother : GenotypeType.values()){ - mvCountMatrix.put(mother,new EnumMap>(GenotypeType.class)); - transmissionMatrix.put(mother,new EnumMap>(GenotypeType.class)); - for(GenotypeType father : GenotypeType.values()){ - mvCountMatrix.get(mother).put(father,new EnumMap(GenotypeType.class)); - transmissionMatrix.get(mother).put(father,new EnumMap(GenotypeType.class)); - for(GenotypeType child : GenotypeType.values()){ - mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child)); - transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child)); - } - } - } - } - - //Returns the number of Mendelian Violations for a given genotype combination. - //If one of the parents genotype is missing, it will consider it as a parent/child pair - //If the child genotype or both parents genotypes are missing, 0 is returned. 
- private int getCombinationMVCount(GenotypeType mother, GenotypeType father, GenotypeType child){ - - //Child is no call => No MV - if(child == GenotypeType.NO_CALL || child == GenotypeType.UNAVAILABLE) - return 0; - //Add parents with genotypes for the evaluation - ArrayList parents = new ArrayList(); - if (!(mother == GenotypeType.NO_CALL || mother == GenotypeType.UNAVAILABLE)) - parents.add(mother); - if (!(father == GenotypeType.NO_CALL || father == GenotypeType.UNAVAILABLE)) - parents.add(father); - - //Both parents no calls => No MV - if (parents.isEmpty()) - return 0; - - //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed - int parentsNumRefAlleles = 0; - int parentsNumAltAlleles = 0; - - for(GenotypeType parent : parents){ - if(parent == GenotypeType.HOM_REF){ - parentsNumRefAlleles++; - } - else if(parent == GenotypeType.HET){ - parentsNumRefAlleles++; - parentsNumAltAlleles++; - } - else if(parent == GenotypeType.HOM_VAR){ - parentsNumAltAlleles++; - } - } - - //Case Child is HomRef - if(child == GenotypeType.HOM_REF){ - if(parentsNumRefAlleles == parents.size()) - return 0; - else return (parents.size()-parentsNumRefAlleles); - } - - //Case child is HomVar - if(child == GenotypeType.HOM_VAR){ - if(parentsNumAltAlleles == parents.size()) - return 0; - else return parents.size()-parentsNumAltAlleles; - } - - //Case child is Het - if(child == GenotypeType.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2)) - return 0; - - //MV - return 1; - } - - //Given two trio genotypes combinations, returns the number of different genotypes between the two combinations. 
- private int countFamilyGenotypeDiff(GenotypeType motherOriginal,GenotypeType fatherOriginal,GenotypeType childOriginal,GenotypeType motherNew,GenotypeType fatherNew,GenotypeType childNew){ - int count = 0; - if(motherOriginal!=motherNew) - count++; - if(fatherOriginal!=fatherNew) - count++; - if(childOriginal!=childNew) - count++; - return count; - } - - //Get a Map of genotype likelihoods. - //In case of null, unavailable or no call, all likelihoods are 1/3. - private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ - if(genotype == null || !genotype.isCalled() || genotype.getLikelihoods() == null){ - EnumMap likelihoods = new EnumMap(GenotypeType.class); - likelihoods.put(GenotypeType.HOM_REF,1.0/3.0); - likelihoods.put(GenotypeType.HET,1.0/3.0); - likelihoods.put(GenotypeType.HOM_VAR,1.0/3.0); - return likelihoods; - } - return genotype.getLikelihoods().getAsMap(true); - } - - //Returns the GenotypeType; returns UNVAILABLE if given null - private GenotypeType getTypeSafeNull(Genotype genotype){ - if(genotype == null) - return GenotypeType.UNAVAILABLE; - return genotype.getType(); - } - - - /** - * Phases the genotypes of the given trio. If one of the parents is null, it is considered a parent/child pair. 
- * @param ref: Reference allele - * @param alt: Alternative allele - * @param mother: Mother's genotype - * @param father: Father's genotype - * @param child: Child's genotype - * @param finalGenotypes: An ArrayList that will be added the genotypes phased by transmission in the following order: Mother, Father, Child - * @return - */ - private int phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { - - //Check whether it is a pair or trio - //Always assign the first parent as the parent having genotype information in pairs - //Always assign the mother as the first parent in trios - int parentsCalled = 0; - Map firstParentLikelihoods; - Map secondParentLikelihoods; - ArrayList bestFirstParentGenotype = new ArrayList(); - ArrayList bestSecondParentGenotype = new ArrayList(); - ArrayList bestChildGenotype = new ArrayList(); - GenotypeType pairSecondParentGenotype = null; - if(mother == null || !mother.isCalled()){ - firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father); - secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); - bestFirstParentGenotype.add(getTypeSafeNull(father)); - bestSecondParentGenotype.add(getTypeSafeNull(mother)); - pairSecondParentGenotype = mother == null ? GenotypeType.UNAVAILABLE : mother.getType(); - if(father != null && father.isCalled()) - parentsCalled = 1; - } - else{ - firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); - secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father); - bestFirstParentGenotype.add(getTypeSafeNull(mother)); - bestSecondParentGenotype.add(getTypeSafeNull(father)); - if(father == null || !father.isCalled()){ - parentsCalled = 1; - pairSecondParentGenotype = father == null ? 
GenotypeType.UNAVAILABLE : father.getType(); - }else{ - parentsCalled = 2; - } - } - Map childLikelihoods = getLikelihoodsAsMapSafeNull(child); - bestChildGenotype.add(getTypeSafeNull(child)); - - //Prior vars - double bestConfigurationLikelihood = 0.0; - double norm = 0.0; - int configuration_index =0; - ArrayList bestMVCount = new ArrayList(); - bestMVCount.add(0); - - //Get the most likely combination - //Only check for most likely combination if at least a parent and the child have genotypes - if(child.isCalled() && parentsCalled > 0){ - int mvCount; - int cumulativeMVCount = 0; - double configurationLikelihood = 0; - for(Map.Entry childGenotype : childLikelihoods.entrySet()){ - for(Map.Entry firstParentGenotype : firstParentLikelihoods.entrySet()){ - for(Map.Entry secondParentGenotype : secondParentLikelihoods.entrySet()){ - mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey()); - //For parent/child pairs, sum over the possible genotype configurations of the missing parent - if(parentsCalled<2){ - cumulativeMVCount += mvCount; - configurationLikelihood += mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); - } - //Evaluate configurations of trios - else{ - configurationLikelihood = mvCount>0 ? 
Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); - norm += configurationLikelihood; - //Keep this combination if - //It has a better likelihood - //Or it has the same likelihood but requires less changes from original genotypes - if (configurationLikelihood > bestConfigurationLikelihood){ - bestConfigurationLikelihood = configurationLikelihood; - bestMVCount.clear(); - bestMVCount.add(mvCount); - bestFirstParentGenotype.clear(); - bestFirstParentGenotype.add(firstParentGenotype.getKey()); - bestSecondParentGenotype.clear(); - bestSecondParentGenotype.add(secondParentGenotype.getKey()); - bestChildGenotype.clear(); - bestChildGenotype.add(childGenotype.getKey()); - } - else if(configurationLikelihood == bestConfigurationLikelihood) { - bestFirstParentGenotype.add(firstParentGenotype.getKey()); - bestSecondParentGenotype.add(secondParentGenotype.getKey()); - bestChildGenotype.add(childGenotype.getKey()); - bestMVCount.add(mvCount); - } - } - } - //Evaluate configurations of parent/child pairs - if(parentsCalled<2){ - norm += configurationLikelihood; - //Keep this combination if - //It has a better likelihood - //Or it has the same likelihood but requires less changes from original genotypes - if (configurationLikelihood > bestConfigurationLikelihood){ - bestConfigurationLikelihood = configurationLikelihood; - bestMVCount.clear(); - bestMVCount.add(cumulativeMVCount/3); - bestChildGenotype.clear(); - bestFirstParentGenotype.clear(); - bestSecondParentGenotype.clear(); - bestChildGenotype.add(childGenotype.getKey()); - bestFirstParentGenotype.add(firstParentGenotype.getKey()); - bestSecondParentGenotype.add(pairSecondParentGenotype); - } - else if(configurationLikelihood == bestConfigurationLikelihood) { - bestFirstParentGenotype.add(firstParentGenotype.getKey()); - 
bestSecondParentGenotype.add(pairSecondParentGenotype); - bestChildGenotype.add(childGenotype.getKey()); - bestMVCount.add(cumulativeMVCount/3); - } - configurationLikelihood = 0; - } - } - } - - //normalize the best configuration probability - bestConfigurationLikelihood = bestConfigurationLikelihood / norm; - - //In case of multiple equally likely combinations, take a random one - if(bestFirstParentGenotype.size()>1){ - configuration_index = rand.nextInt(bestFirstParentGenotype.size()-1); - } - - } - else{ - bestConfigurationLikelihood = NO_TRANSMISSION_PROB; - } - - TrioPhase phasedTrioGenotypes; - if(parentsCalled < 2 && mother == null || !mother.isCalled()) - phasedTrioGenotypes = transmissionMatrix.get(bestSecondParentGenotype.get(configuration_index)).get(bestFirstParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); - else - phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); - - //Return the phased genotypes - phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); - return bestMVCount.get(configuration_index); - - } - - - private void updatePairMetricsCounters(Genotype parent, Genotype child, int mvCount, HashMap counters){ - - //Increment metrics counters - if(parent.isCalled() && child.isCalled()){ - counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1); - if(parent.isPhased()) - counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1); - else{ - counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount); - if(parent.isHet() && child.isHet()) - counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1); - } - }else{ - counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1); - } - - } - - private void 
updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, int mvCount, HashMap counters){ - - //Increment metrics counters - if(mother.isCalled() && father.isCalled() && child.isCalled()){ - counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1); - if(mother.isPhased()) - counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1); - - else{ - if(mvCount > 0){ - if(mvCount >1) - counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1); - else - counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1); - } - else if(mother.isHet() && father.isHet() && child.isHet()) - counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1); - - } - }else{ - counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1); - } - } - - /** - * For each variant in the file, determine the phasing for the child and replace the child's genotype with the trio's genotype - * - * @param tracker the reference meta-data tracker - * @param ref the reference context - * @param context the alignment context - * @return null - */ - @Override - public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - HashMap metricsCounters = new HashMap(10); - metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); - metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); - metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); - metricsCounters.put(NUM_TRIO_HET_HET_HET,0); - metricsCounters.put(NUM_TRIO_VIOLATIONS,0); - metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); - metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); - metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); - metricsCounters.put(NUM_PAIR_HET_HET,0); - metricsCounters.put(NUM_PAIR_VIOLATIONS,0); - metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); - metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); - - String mvfLine; - - if (tracker == null) - return metricsCounters; - - final 
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); - if ( vc == null ) - return metricsCounters; - - if ( !vc.isBiallelic() ) { - vcfWriter.add(vc); - return metricsCounters; - } - - final VariantContextBuilder builder = new VariantContextBuilder(vc); - - final GenotypesContext genotypesContext = GenotypesContext.copy(vc.getGenotypes()); - for (Sample sample : trios) { - Genotype mother = vc.getGenotype(sample.getMaternalID()); - Genotype father = vc.getGenotype(sample.getPaternalID()); - Genotype child = vc.getGenotype(sample.getID()); - - //Keep only trios and parent/child pairs - if(mother == null && father == null || child == null) - continue; - - ArrayList trioGenotypes = new ArrayList(3); - final int mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes); - - Genotype phasedMother = trioGenotypes.get(0); - Genotype phasedFather = trioGenotypes.get(1); - Genotype phasedChild = trioGenotypes.get(2); - - //Fill the genotype map with the new genotypes and increment metrics counters - genotypesContext.replace(phasedChild); - if(mother != null){ - genotypesContext.replace(phasedMother); - if(father != null){ - genotypesContext.replace(phasedFather); - updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters); - mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s", - vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(), - phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),printAD(phasedMother.getAD()), - phasedMother.getLikelihoodsString(), phasedFather.getGenotypeString(),phasedFather.getDP(),printAD(phasedFather.getAD()),phasedFather.getLikelihoodsString(), - 
phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString()); - if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) - metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); - } - else{ - updatePairMetricsCounters(phasedMother,phasedChild,mvCount,metricsCounters); - if(!(phasedMother.getType()==mother.getType() && phasedChild.getType()==child.getType())) - metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); - mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s:%s:%s:%s\t.\t.\t.\t.\t%s\t%s\t%s\t%s", - vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(), - phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),printAD(phasedMother.getAD()),phasedMother.getLikelihoodsString(), - phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString()); - } - } - else{ - genotypesContext.replace(phasedFather); - updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters); - if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) - metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); - mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t.\t.\t.\t.\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s", - vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(), - phasedFather.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getDP(),printAD(phasedFather.getAD()),phasedFather.getLikelihoodsString(), - phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString()); - } - - //Report violation if set so - //TODO: ADAPT 
FOR PAIRS TOO!! - if(mvCount>0 && mvFile != null && !vc.isFiltered()) - mvFile.println(mvfLine); - } - - builder.genotypes(genotypesContext); - vcfWriter.add(builder.make()); - - return metricsCounters; - } - - private static String printAD(final int[] AD) { - if ( AD == null || AD.length == 0 ) - return "."; - final StringBuilder sb = new StringBuilder(); - sb.append(AD[0]); - for ( int i = 1; i < AD.length; i++) { - sb.append(","); - sb.append(AD[i]); - } - return sb.toString(); - } - - /** - * Initializes the reporting counters. - * - * @return All counters initialized to 0 - */ - @Override - public HashMap reduceInit() { - HashMap metricsCounters = new HashMap(10); - metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); - metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); - metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); - metricsCounters.put(NUM_TRIO_HET_HET_HET,0); - metricsCounters.put(NUM_TRIO_VIOLATIONS,0); - metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); - metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); - metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); - metricsCounters.put(NUM_PAIR_HET_HET,0); - metricsCounters.put(NUM_PAIR_VIOLATIONS,0); - metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); - metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); - - return metricsCounters; - } - - /** - * Adds the value of the site phased to the reporting counters. - * - * @param value Site values - * @param sum accumulator for the reporting counters - * @return accumulator with result of the map taken into account. 
- */ - @Override - public HashMap reduce(HashMap value, HashMap sum) { - sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED)); - sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL)); - sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED)); - sum.put(NUM_TRIO_HET_HET_HET,value.get(NUM_TRIO_HET_HET_HET)+sum.get(NUM_TRIO_HET_HET_HET)); - sum.put(NUM_TRIO_VIOLATIONS,value.get(NUM_TRIO_VIOLATIONS)+sum.get(NUM_TRIO_VIOLATIONS)); - sum.put(NUM_PAIR_GENOTYPES_CALLED,value.get(NUM_PAIR_GENOTYPES_CALLED)+sum.get(NUM_PAIR_GENOTYPES_CALLED)); - sum.put(NUM_PAIR_GENOTYPES_NOCALL,value.get(NUM_PAIR_GENOTYPES_NOCALL)+sum.get(NUM_PAIR_GENOTYPES_NOCALL)); - sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED)); - sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET)); - sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS)); - sum.put(NUM_TRIO_DOUBLE_VIOLATIONS,value.get(NUM_TRIO_DOUBLE_VIOLATIONS)+sum.get(NUM_TRIO_DOUBLE_VIOLATIONS)); - sum.put(NUM_GENOTYPES_MODIFIED,value.get(NUM_GENOTYPES_MODIFIED)+sum.get(NUM_GENOTYPES_MODIFIED)); - - return sum; - } - - - /** - * Reports statistics on the phasing by transmission process. - * @param result Accumulator with all counters. 
- */ - @Override - public void onTraversalDone(HashMap result) { - logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED)); - logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL)); - logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED)); - logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_TRIO_HET_HET_HET)); - logger.info("Number of remaining single mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS)); - logger.info("Number of remaining double mendelian violations in trios: " + result.get(NUM_TRIO_DOUBLE_VIOLATIONS)); - logger.info("Number of complete pair-genotypes: " + result.get(NUM_PAIR_GENOTYPES_CALLED)); - logger.info("Number of pair-genotypes containing no call(s): " + result.get(NUM_PAIR_GENOTYPES_NOCALL)); - logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED)); - logger.info("Number of resulting Het/Het pairs: " + result.get(NUM_PAIR_HET_HET)); - logger.info("Number of remaining mendelian violations in pairs: " + result.get(NUM_PAIR_VIOLATIONS)); - logger.info("Number of genotypes updated: " + result.get(NUM_GENOTYPES_MODIFIED)); - - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java deleted file mode 100644 index a297b38cf..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java +++ /dev/null @@ -1,1781 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.HasGenomeLocation; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.variant.vcf.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import 
org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; - -import java.io.*; -import java.util.*; - -import static org.broadinstitute.sting.utils.variant.GATKVCFUtils.getVCFHeadersFromRods; - -/** - * Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads). - * - *

- * Performs physical phasing of SNP calls, based on sequencing reads. - *

- * - *

Input

- *

- * VCF file of SNP calls, BAM file of sequence reads. - *

- * - *

Output

- *

- * Phased VCF file. - *

- * - *

Examples

- *
- *    java
- *      -jar GenomeAnalysisTK.jar
- *      -T ReadBackedPhasing
- *      -R reference.fasta
- *      -I reads.bam
- *      --variant SNPs.vcf
- *      -L SNPs.vcf
- *      -o phased_SNPs.vcf
- *      --phaseQualityThresh 20.0
- * 
- * - * @author Menachem Fromer - * @since July 2010 - */ -@Allows(value = {DataSource.READS, DataSource.REFERENCE}) -@Requires(value = {DataSource.READS, DataSource.REFERENCE}) -@By(DataSource.READS) - -// Filter out all reads with zero mapping quality -@ReadFilters({MappingQualityZeroFilter.class}) - -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) -public class ReadBackedPhasing extends RodWalker { - @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information (if -l DEBUG is also specified)", required = false) - protected boolean DEBUG = false; - /** - * The VCF file we are phasing variants from. - * - * All heterozygous variants found in this VCF file will be phased, where possible - */ - @ArgumentCollection - protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - - @Output(doc = "File to which variants should be written") - protected VariantContextWriter writer = null; - - @Argument(fullName = "cacheWindowSize", shortName = "cacheWindow", doc = "The window size (in bases) to cache variant sites and their reads for the phasing procedure", required = false) - protected Integer cacheWindow = 20000; - - @Argument(fullName = "maxPhaseSites", shortName = "maxSites", doc = "The maximum number of successive heterozygous sites permitted to be used by the phasing algorithm", required = false) - protected Integer maxPhaseSites = 10; // 2^10 == 10^3 diploid haplotypes - - @Argument(fullName = "phaseQualityThresh", shortName = "phaseThresh", doc = "The minimum phasing quality score required to output phasing", required = false) - protected Double phaseQualityThresh = 10.0; // PQ = 10.0 <=> P(error) = 10^(-10/10) = 0.1, P(correct) = 0.9 - - @Hidden - @Argument(fullName = "variantStatsFilePrefix", shortName = "variantStats", doc = "The prefix of the VCF/phasing statistics files [For DEBUGGING 
purposes only - DO NOT USE!]", required = false) - protected String variantStatsFilePrefix = null; - private PhasingQualityStatsWriter statsWriter = null; - - @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for phasing", required = false) - public int MIN_BASE_QUALITY_SCORE = 17; - - @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for phasing", required = false) - public int MIN_MAPPING_QUALITY_SCORE = 20; - - @Argument(fullName = "sampleToPhase", shortName = "sampleToPhase", doc = "Only include these samples when phasing", required = false) - protected Set samplesToPhase = null; - - @Hidden - @Argument(fullName = "permitNoSampleOverlap", shortName = "permitNoSampleOverlap", doc = "Don't exit (just WARN) when the VCF and BAMs do not overlap in samples", required = false) - private boolean permitNoSampleOverlap = false; - - /** - * Important note: do not use this argument if your input data set is not already phased or it will cause the tool to skip over all heterozygous sites. 
- */ - @Argument(fullName = "respectPhaseInInput", shortName = "respectPhaseInInput", doc = "Will only phase genotypes in cases where the resulting output will necessarily be consistent with any existing phase (for example, from trios)", required = false) - private boolean respectPhaseInInput = false; - - private GenomeLoc mostDownstreamLocusReached = null; - - private LinkedList unphasedSiteQueue = null; - private CloneableIteratorLinkedList partiallyPhasedSites = null; // the phased VCs to be emitted, and the alignment bases at these positions - - private static PreciseNonNegativeDouble ZERO = new PreciseNonNegativeDouble(0.0); - - public static final String PQ_KEY = "PQ"; - - // In order to detect phase inconsistencies: - private static final double FRACTION_OF_MEAN_PQ_CHANGES = 0.1; // If the PQ decreases by this fraction of the mean PQ changes (thus far), then this read is inconsistent with previous reads - private static final double MAX_FRACTION_OF_INCONSISTENT_READS = 0.1; // If there are more than this fraction of inconsistent reads, then flag this site - - public static final String PHASING_INCONSISTENT_KEY = "PhasingInconsistent"; - - @Argument(fullName = "enableMergePhasedSegregatingPolymorphismsToMNP", shortName = "enableMergeToMNP", doc = "Merge consecutive phased sites into MNP records", required = false) - protected boolean enableMergePhasedSegregatingPolymorphismsToMNP = false; - - @Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record", required = false) - protected int maxGenomicDistanceForMNP = 1; - - @Hidden - @Argument(fullName = "outputMultipleBaseCountsFile", shortName = "outputMultipleBaseCountsFile", doc = "File to output cases where a single read has multiple bases at the same position [For DEBUGGING purposes only - DO NOT USE!]", required = false) - protected File 
outputMultipleBaseCountsFile = null; - private MultipleBaseCountsWriter outputMultipleBaseCountsWriter = null; - - public void initialize() { - if (maxPhaseSites <= 2) - maxPhaseSites = 2; // by definition, must phase a site relative to previous site [thus, 2 in total] - - /* - Since we cap each base quality (BQ) by its read's mapping quality (MQ) [in Read.updateBaseAndQuality()], then: - if minBQ > minMQ, then we require that MQ be >= minBQ as well. - [Otherwise, we end up capping BQ by MQ only AFTER we tried removing bases with BQ < minBQ, which is WRONG!] - - To do this properly, we set: minMQ = max(minMQ, minBQ) - */ - MIN_MAPPING_QUALITY_SCORE = Math.max(MIN_MAPPING_QUALITY_SCORE, MIN_BASE_QUALITY_SCORE); - - unphasedSiteQueue = new LinkedList(); - partiallyPhasedSites = new CloneableIteratorLinkedList(); - - initializeVcfWriter(); - - if (variantStatsFilePrefix != null) - statsWriter = new PhasingQualityStatsWriter(variantStatsFilePrefix); - - if (outputMultipleBaseCountsFile != null) - outputMultipleBaseCountsWriter = new MultipleBaseCountsWriter(outputMultipleBaseCountsFile); - } - - private void initializeVcfWriter() { - // Wrapper VCFWriters will take ownership of inner writers iff: inner writer != origWriter [which wasn't created here] - VariantContextWriter origWriter = writer; - - if (enableMergePhasedSegregatingPolymorphismsToMNP) - writer = new MergeSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, writer != origWriter); - - /* Due to discardIrrelevantPhasedSites(), the startDistance spanned by [partiallyPhasedSites.peek(), unphasedSiteQueue.peek()] is <= cacheWindow - Due to processQueue(), the startDistance spanned by [unphasedSiteQueue.peek(), mostDownstreamLocusReached] is <= cacheWindow - Hence, the startDistance between: partiallyPhasedSites.peek() --> mostDownstreamLocusReached is <= 2 * cacheWindow - - Therefore, can write the filtered 
records located at mostDownstreamLocusReached (if any) to SortingVCFWriter, even though partiallyPhasedSites.peek() has not yet been written. - - But, NOTE that map() is careful to pass out a list of records to be written that FIRST includes any records discarded due to having reached mostDownstreamLocusReached, - and only THEN records located at mostDownstreamLocusReached. The opposite order in map() would violate the startDistance limits imposed when contracting SortingVCFWriter with (2 * cacheWindow). - */ - writer = VariantContextWriterFactory.sortOnTheFly(writer, 2 * cacheWindow, writer != origWriter); - - // setup the header fields: - Set hInfo = new HashSet(); - hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - - // Phasing-specific INFO fields: - hInfo.add(new VCFFormatHeaderLine(PQ_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); - hInfo.add(new VCFInfoHeaderLine(PHASING_INCONSISTENT_KEY, 0, VCFHeaderLineType.Flag, "Are the reads significantly haplotype-inconsistent?")); - - // todo -- fix samplesToPhase - String trackName = variantCollection.variants.getName(); - Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); - Set vcfSamples = new TreeSet(samplesToPhase == null ? rodNameToHeader.get(trackName).getGenotypeSamples() : samplesToPhase); - writer.writeHeader(new VCFHeader(hInfo, vcfSamples)); - - Set readSamples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - readSamples.retainAll(vcfSamples); - if (readSamples.isEmpty()) { - String noPhaseString = "No common samples in VCF and BAM headers" + (samplesToPhase == null ? 
"" : " (limited to sampleToPhase parameters)") + ", so nothing could possibly be phased!"; - if (permitNoSampleOverlap) - logger.warn(noPhaseString); - else - throw new UserException(noPhaseString); - } - } - - public PhasingStats reduceInit() { - return new PhasingStats(); - } - - /** - * For each site of interest, cache the current site and then use the cache to phase all sites - * for which "sufficient" information has already been observed. - * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range. - */ - public PhasingStatsAndOutput map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - mostDownstreamLocusReached = ref.getLocus(); - if (DEBUG) logger.debug("map() at: " + mostDownstreamLocusReached); - - PhasingStats phaseStats = new PhasingStats(); - List unprocessedList = new LinkedList(); - - for (VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) { - if (samplesToPhase != null) vc = reduceVCToSamples(vc, samplesToPhase); - - if (ReadBackedPhasing.processVariantInPhasing(vc)) { - VariantAndReads vr = new VariantAndReads(vc, context); - unphasedSiteQueue.add(vr); - - if (DEBUG) - logger.debug("Added variant to queue = " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant)); - } - else { - unprocessedList.add(vc); // Finished with the unprocessed variant, and writer can enforce sorting on-the-fly - - if (DEBUG) - logger.debug("Unprocessed variant = " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); - } - - int numReads = context.getBasePileup().getNumberOfElements(); - PhasingStats addInPhaseStats = new PhasingStats(numReads, 1); - phaseStats.addIn(addInPhaseStats); - } - - List completedList = 
processQueue(phaseStats, false); - completedList.addAll(unprocessedList); // add unprocessedList on to the END of completedList so that the processQueue() results, which are necessarily more upstream, are first! - - return new PhasingStatsAndOutput(phaseStats, completedList); - } - - private static final Set KEYS_TO_KEEP_IN_REDUCED_VCF = new HashSet(Arrays.asList(PQ_KEY)); - - private VariantContext reduceVCToSamples(VariantContext vc, Set samplesToPhase) { -// for ( String sample : samplesToPhase ) -// logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() )); - VariantContext subvc = vc.subContextFromSamples(samplesToPhase); -// logger.debug("original VC = " + vc); -// logger.debug("sub VC = " + subvc); - return GATKVariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF); - } - - private List processQueue(PhasingStats phaseStats, boolean processAll) { - List oldPhasedList = new LinkedList(); - - while (!unphasedSiteQueue.isEmpty()) { - if (!processAll) { // otherwise, phase until the end of unphasedSiteQueue - VariantContext nextToPhaseVc = unphasedSiteQueue.peek().variant; - if (startDistancesAreInWindowRange(mostDownstreamLocusReached, GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextToPhaseVc))) { - /* mostDownstreamLocusReached is still not far enough ahead of nextToPhaseVc to have all phasing information for nextToPhaseVc - (note that we ASSUME that the VCF is ordered by ). - Note that this will always leave at least one entry (the last one), since mostDownstreamLocusReached is in range of itself. 
- */ - break; - } - // Already saw all variant positions within cacheWindow startDistance ahead of vc (on its contig) - } - // Update partiallyPhasedSites before it's used in phaseSite: - oldPhasedList.addAll(discardIrrelevantPhasedSites()); - if (DEBUG) logger.debug("oldPhasedList(1st) = " + toStringVCL(oldPhasedList)); - - VariantAndReads vr = unphasedSiteQueue.remove(); - if (DEBUG) - logger.debug("Performing phasing for " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant)); - phaseSite(vr, phaseStats); - } - - // Update partiallyPhasedSites after phaseSite is done: - oldPhasedList.addAll(discardIrrelevantPhasedSites()); - if (DEBUG) logger.debug("oldPhasedList(2nd) = " + toStringVCL(oldPhasedList)); - - if (outputMultipleBaseCountsWriter != null) - outputMultipleBaseCountsWriter.outputMultipleBaseCounts(); - - return oldPhasedList; - } - - private List discardIrrelevantPhasedSites() { - List vcList = new LinkedList(); - - GenomeLoc nextToPhaseLoc = null; - if (!unphasedSiteQueue.isEmpty()) - nextToPhaseLoc = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), unphasedSiteQueue.peek().variant); - - while (!partiallyPhasedSites.isEmpty()) { - if (nextToPhaseLoc != null) { // otherwise, unphasedSiteQueue.isEmpty(), and therefore no need to keep any of the "past" - UnfinishedVariantAndReads partPhasedVr = partiallyPhasedSites.peek(); - - if (startDistancesAreInWindowRange(partPhasedVr.unfinishedVariant.getLocation(), nextToPhaseLoc)) - // nextToPhaseLoc is still not far enough ahead of partPhasedVr to exclude partPhasedVr from calculations - break; - } - UnfinishedVariantAndReads uvr = partiallyPhasedSites.remove(); - vcList.add(uvr.unfinishedVariant.toVariantContext()); - } - - return vcList; - } - - /* Phase vc (removed head of unphasedSiteQueue) using all VariantContext objects in - partiallyPhasedSites, and all in unphasedSiteQueue that are within cacheWindow startDistance ahead of vc (on its contig). 
- - ASSUMES: All VariantContexts in unphasedSiteQueue are in positions downstream of vc (head of queue). - */ - - private void phaseSite(VariantAndReads vr, PhasingStats phaseStats) { - VariantContext vc = vr.variant; - logger.debug("Will phase vc = " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); - - UnfinishedVariantAndReads uvr = new UnfinishedVariantAndReads(vr); - UnfinishedVariantContext uvc = uvr.unfinishedVariant; - - // Perform per-sample phasing: - GenotypesContext sampGenotypes = vc.getGenotypes(); - Map samplePhaseStats = new TreeMap(); - for (final Genotype gt : sampGenotypes) { - String samp = gt.getSampleName(); - - if (DEBUG) logger.debug("sample = " + samp); - if (isUnfilteredCalledDiploidGenotype(gt)) { - if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site: - // true <-> can trivially phase a hom site relative to ANY previous site: - Genotype phasedGt = new GenotypeBuilder(gt).phased(true).make(); - uvc.setGenotype(samp, phasedGt); - } - else if (gt.isHet()) { // Attempt to phase this het genotype relative to the previous het genotype - PhasingWindow phaseWindow = new PhasingWindow(vr, samp); - if (phaseWindow.hasPreviousHets()) { // Otherwise, nothing to phase this against - SNPallelePair allelePair = new SNPallelePair(gt); - if (DEBUG) logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + allelePair); - - CloneableIteratorLinkedList.CloneableIterator prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt; - /* Notes: - 1. Call to next() advances iterator to next position in partiallyPhasedSites. - 2. prevHetGenotype != null, since otherwise prevHetAndInteriorIt would not have been chosen to point to its UnfinishedVariantAndReads. 
- */ - UnfinishedVariantContext prevUvc = prevHetAndInteriorIt.next().unfinishedVariant; - Genotype prevHetGenotype = prevUvc.getGenotype(samp); - - PhaseResult pr = phaseSampleAtSite(phaseWindow); - boolean genotypesArePhased = passesPhasingThreshold(pr.phaseQuality); - - if (pr.phasingContainsInconsistencies) { - if (DEBUG) - logger.debug("MORE than " + (MAX_FRACTION_OF_INCONSISTENT_READS * 100) + "% of the reads are inconsistent for phasing of " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); - uvc.setPhasingInconsistent(); - } - - if (genotypesArePhased) { - SNPallelePair prevAllelePair = new SNPallelePair(prevHetGenotype); - - if (DEBUG) - logger.debug("THE PHASE PREVIOUSLY CHOSEN FOR PREVIOUS:\n" + prevAllelePair + "\n"); - if (DEBUG) logger.debug("THE PHASE CHOSEN HERE:\n" + allelePair + "\n\n"); - - ensurePhasing(allelePair, prevAllelePair, pr.haplotype); - Genotype phasedGt = new GenotypeBuilder(gt) - .alleles(allelePair.getAllelesAsList()) - .attribute(PQ_KEY, pr.phaseQuality) - .phased(genotypesArePhased).make(); - uvc.setGenotype(samp, phasedGt); - } - - // Now, update the 0 or more "interior" hom sites in between the previous het site and this het site: - while (prevHetAndInteriorIt.hasNext()) { - UnfinishedVariantContext interiorUvc = prevHetAndInteriorIt.next().unfinishedVariant; - Genotype handledGt = interiorUvc.getGenotype(samp); - if (handledGt == null || !isUnfilteredCalledDiploidGenotype(handledGt)) - throw new ReviewedStingException("LOGICAL error: should not have breaks WITHIN haplotype"); - if (!handledGt.isHom()) - throw new ReviewedStingException("LOGICAL error: should not have anything besides hom sites IN BETWEEN two het sites"); - - // Use the same phasing consistency and PQ for each hom site in the "interior" as for the het-het phase: - if (pr.phasingContainsInconsistencies) - interiorUvc.setPhasingInconsistent(); - - if (genotypesArePhased) { - Genotype phasedHomGt = new GenotypeBuilder(handledGt) - 
.attribute(PQ_KEY, pr.phaseQuality) - .phased(genotypesArePhased).make(); - interiorUvc.setGenotype(samp, phasedHomGt); - } - } - - if (statsWriter != null) - statsWriter.addStat(samp, GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc), startDistance(prevUvc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length); - - PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp); - if (sampPhaseCounts == null) { - sampPhaseCounts = new PhaseCounts(); - samplePhaseStats.put(samp, sampPhaseCounts); - } - sampPhaseCounts.numTestedSites++; - - if (pr.phasingContainsInconsistencies) { - if (genotypesArePhased) - sampPhaseCounts.numInconsistentSitesPhased++; - else - sampPhaseCounts.numInconsistentSitesNotPhased++; - } - - if (genotypesArePhased) - sampPhaseCounts.numPhased++; - } - } - } - } - - partiallyPhasedSites.add(uvr); // only add it in now, since don't want it to be there during phasing - phaseStats.addIn(new PhasingStats(samplePhaseStats)); - } - - public boolean passesPhasingThreshold(double PQ) { - return PQ >= phaseQualityThresh; - } - - private static class GenotypeAndReadBases { - public Genotype genotype; - public ReadBasesAtPosition readBases; - public GenomeLoc loc; - - public GenotypeAndReadBases(Genotype genotype, ReadBasesAtPosition readBases, GenomeLoc loc) { - this.genotype = genotype; - this.readBases = readBases; - this.loc = loc; - } - } - - private class PhasingWindow { - private Genotype[] hetGenotypes = null; - private CloneableIteratorLinkedList.CloneableIterator prevHetAndInteriorIt = null; - private int phasingSiteIndex = -1; - private Map readsAtHetSites = null; - - private void clearFields() { - hetGenotypes = null; - prevHetAndInteriorIt = null; - phasingSiteIndex = -1; - readsAtHetSites = null; - } - - public boolean hasPreviousHets() { - return phasingSiteIndex > 0; - } - - // ASSUMES that: isUnfilteredCalledDiploidGenotype(vrGt) && vrGt.isHet() [vrGt = 
vr.variant.getGenotype(sample)] - - public PhasingWindow(VariantAndReads vr, String sample) { - List listHetGenotypes = new LinkedList(); - - // Include previously phased sites in the phasing computation: - CloneableIteratorLinkedList.CloneableIterator phasedIt = partiallyPhasedSites.iterator(); - while (phasedIt.hasNext()) { - UnfinishedVariantAndReads phasedVr = phasedIt.next(); - Genotype gt = phasedVr.unfinishedVariant.getGenotype(sample); - if (gt == null || !isUnfilteredCalledDiploidGenotype(gt)) { // constructed haplotype must start AFTER this "break" - listHetGenotypes.clear(); // clear out any history - } - else if (gt.isHet()) { - GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, phasedVr.sampleReadBases.get(sample), phasedVr.unfinishedVariant.getLocation()); - listHetGenotypes.add(grb); - if (DEBUG) logger.debug("Using UPSTREAM het site = " + grb.loc); - prevHetAndInteriorIt = phasedIt.clone(); - } - } - phasingSiteIndex = listHetGenotypes.size(); - if (phasingSiteIndex == 0) { // no previous sites against which to phase - clearFields(); - return; - } - prevHetAndInteriorIt.previous(); // so that it points to the previous het site [and NOT one after it, due to the last call to next()] - - if (respectPhaseInInput) { - Genotype prevHetGenotype = prevHetAndInteriorIt.clone().next().unfinishedVariant.getGenotype(sample); - if (!prevHetGenotype.isPhased()) { - // Make this genotype unphaseable, since its previous het is not already phased [as required by respectPhaseInInput]: - clearFields(); - return; - } - } - - // Add the (het) position to be phased: - GenomeLoc phaseLocus = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant); - GenotypeAndReadBases grbPhase = new GenotypeAndReadBases(vr.variant.getGenotype(sample), vr.sampleReadBases.get(sample), phaseLocus); - listHetGenotypes.add(grbPhase); - if (DEBUG) - logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]"); - - // 
Include as-of-yet unphased sites in the phasing computation: - for (VariantAndReads nextVr : unphasedSiteQueue) { - if (!startDistancesAreInWindowRange(vr.variant, nextVr.variant)) //nextVr too far ahead of the range used for phasing vc - break; - Genotype gt = nextVr.variant.getGenotype(sample); - if (gt == null || !isUnfilteredCalledDiploidGenotype(gt)) { // constructed haplotype must end BEFORE this "break" - break; - } - else if (gt.isHet()) { - GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, nextVr.sampleReadBases.get(sample), GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextVr.variant)); - listHetGenotypes.add(grb); - if (DEBUG) logger.debug("Using DOWNSTREAM het site = " + grb.loc); - } - } - - // First, assemble the "sub-reads" from the COMPLETE WINDOW-BASED SET of heterozygous positions for this sample: - buildReadsAtHetSites(listHetGenotypes, sample, grbPhase.loc); - - // Remove extraneous reads (those that do not "connect" the two core phasing sites): - Set onlyKeepReads = removeExtraneousReads(listHetGenotypes.size()); - - // Dynamically modify the window to only include sites which have a non-empty set of reads: - listHetGenotypes = removeExtraneousSites(listHetGenotypes); - - // In any case, must still trim the window size to be "feasible" - // [**NOTE**: May want to do this to try maximize the preservation of paths from (phasingSiteIndex - 1) to phasingSiteIndex]: - if (listHetGenotypes.size() > maxPhaseSites) { - listHetGenotypes = trimWindow(listHetGenotypes, sample, phaseLocus); - - // Can now remove any extra reads (and then sites): - buildReadsAtHetSites(listHetGenotypes, onlyKeepReads); - onlyKeepReads = removeExtraneousReads(listHetGenotypes.size()); - listHetGenotypes = removeExtraneousSites(listHetGenotypes); - } - - // Lastly, assemble the "sub-reads" from the FINAL SET of heterozygous positions for this sample: - buildReadsAtHetSites(listHetGenotypes, onlyKeepReads); - - // Copy to a fixed-size array: - if 
(DEBUG) - logger.debug("FINAL phasing window of " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes)); - hetGenotypes = new Genotype[listHetGenotypes.size()]; - int index = 0; - for (GenotypeAndReadBases copyGrb : listHetGenotypes) - hetGenotypes[index++] = copyGrb.genotype; - } - - private void buildReadsAtHetSites(List listHetGenotypes, String sample, GenomeLoc phasingLoc) { - buildReadsAtHetSites(listHetGenotypes, sample, phasingLoc, null); - } - - private void buildReadsAtHetSites(List listHetGenotypes, Set onlyKeepReads) { - buildReadsAtHetSites(listHetGenotypes, null, null, onlyKeepReads); - } - - private void buildReadsAtHetSites(List listHetGenotypes, String sample, GenomeLoc phasingLoc, Set onlyKeepReads) { - readsAtHetSites = new HashMap(); - - int index = 0; - for (GenotypeAndReadBases grb : listHetGenotypes) { - ReadBasesAtPosition readBases = grb.readBases; - if (readBases != null) { - for (ReadBase rb : readBases) { - String readName = rb.readName; - if (onlyKeepReads != null && !onlyKeepReads.contains(readName)) // if onlyKeepReads exists, ignore reads not in onlyKeepReads - continue; - - PhasingRead rd = readsAtHetSites.get(readName); - if (rd == null) { - rd = new PhasingRead(listHetGenotypes.size(), rb.mappingQual); - readsAtHetSites.put(readName, rd); - } - else if (outputMultipleBaseCountsWriter != null && rd.getBase(index) != null // rd already has a base at index - && sample != null && phasingLoc != null) { - outputMultipleBaseCountsWriter.setMultipleBases(new SampleReadLocus(sample, readName, grb.loc), phasingLoc, rd.getBase(index), rb.base); - } - - // Arbitrarily updates to the last base observed for this sample and read (rb.base): - rd.updateBaseAndQuality(index, rb.base, rb.baseQual); - } - } - index++; - } - if (DEBUG) logger.debug("Number of sites in window = " + index); - - if (DEBUG && logger.isDebugEnabled()) { - logger.debug("ALL READS [phasingSiteIndex = " + phasingSiteIndex + "]:"); - for (Map.Entry 
nameToReads : readsAtHetSites.entrySet()) { - String rdName = nameToReads.getKey(); - PhasingRead rd = nameToReads.getValue(); - logger.debug(rd + "\t" + rdName); - } - } - } - - private class EdgeToReads { - private TreeMap> edgeReads; - - public EdgeToReads() { - this.edgeReads = new TreeMap>(); // implemented GraphEdge.compareTo() - } - - public void addRead(PhasingGraphEdge e, String readName) { - List reads = edgeReads.get(e); - if (reads == null) { - reads = new LinkedList(); - edgeReads.put(e, reads); - } - reads.add(readName); - } - - public List getReads(PhasingGraphEdge e) { - return edgeReads.get(e); - } - } - - private class IntegerSet implements Iterable { - private Set list; - - public IntegerSet(Set list) { - this.list = list; - } - - public boolean contains(int i) { - return list.contains(i); - } - - public Iterator iterator() { - return list.iterator(); - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - for (int i : this) { - sb.append(i + ", "); - } - return sb.toString(); - } - } - - public Set removeExtraneousReads(int numHetSites) { - PhasingGraph readGraph = new PhasingGraph(numHetSites); - EdgeToReads edgeToReads = new EdgeToReads(); - Set sitesWithEdges = new TreeSet(); - - for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { - String rdName = nameToReads.getKey(); - PhasingRead rd = nameToReads.getValue(); - - int[] siteInds = rd.getNonNullIndices(); - // Connect each pair of non-null sites in rd: - for (int i = 0; i < siteInds.length; i++) { - for (int j = i + 1; j < siteInds.length; j++) { - PhasingGraphEdge e = new PhasingGraphEdge(siteInds[i], siteInds[j]); - if (DEBUG) logger.debug("Read = " + rdName + " is adding edge: " + e); - readGraph.addEdge(e); - - edgeToReads.addRead(e, rdName); - - sitesWithEdges.add(e.getV1()); - sitesWithEdges.add(e.getV2()); - } - } - } - if (DEBUG) logger.debug("Read graph:\n" + readGraph); - Set keepReads = new HashSet(); - - /* Check which Reads are involved in acyclic 
paths from (phasingSiteIndex - 1) to (phasingSiteIndex): - - In detail: - Every Read links EACH pair of sites for which it contains bases. Then, each such edge is added to a "site connectivity graph". - A read provides non-trivial bias toward the final haplotype decision if it participates in a path from prev ---> cur. This is tested by - considering each edge that the read contributes. For edge e=(v1,v2), if there exists a path from prev ---> v1 [that doesn't include v2] and - cur ---> v2 [that doesn't include v1], then there is a path from prev ---> cur that uses e, hence making the read significant. - By excluding each vertex's edges and then calculating connected components, we are able to make the determination, for example, - if a path exists from prev ---> v1 that excludes v2. - - Furthermore, if the path DOES use other edges that exist solely due to the read, then that's fine, since adding in the read will give those edges as well. - And, if the path uses edges from other reads, then keeping all other reads that contribute those edges - [which will happen since those edges are also in paths from prev ---> cur] is sufficient for this path to exist. - - NOTE: - If we would use NON-UNIFORM priors for the various haplotypes consistent with a margnialized haplotype, then this calculation would not be correct, since the equivalence of: - 1. The read affects the final marginal haplotype posterior probability (for general mapping and base quality values). - 2. The read has edges involved in a path from prev ---> cur. - DEPENDS STRONGLY on the fact that all haplotypes have the same EXACT prior. 
- - This is due to the following: - [We denote: - R = set of all reads - r = a single read - "AA + CC" = AA on top chromosome, CC on bottom chromosome] - - Note that since there are only two haplotype possibilities: - P(AA + CC | R) + P(AC + CA | R) = 1 - - Now, if we assume that all haplotypes consistent with AA + CC have the same prior probability [P(AA + CC | R)], then: - P(AA + CC | R) - = P(AAAA + CCCC | R) + ... + P(AACC + CCAA | R) - = [P(AAAA + CCCC , R) + ... + P(AACC + CCAA , R)] / P(R) - \propto P(AAAA + CCCC , R) + ... + P(AACC + CCAA , R) - = P(R | AAAA + CCCC)*P(AAAA + CCCC) + ... + P(R | AACC + CCAA)*P(AACC + CCAA) - = P(AA + CC | R) * [P(R | AAAA + CCCC) + ... + P(R | AACC + CCAA)] - - Since we assume independence between reads given a particular haplotype [P(R | AAAA + CCCC) = \prod_r P(r | AAAA + CCCC)], - a new read r affects P(AA + CC | R) by multiplying each of the terms in the sum by, e.g., P(r | AAAA + CCCC). - Therefore, if these values do not affect the ratio of: - (I) [P(R | AAAA + CCCC) + ... + P(R | AACC + CCAA)] / [P(R | ACAA + CACC) + ... + P(R | ACCC + CAAA)] - then they do not affect the value of: - (II) P(AA + CC | R) / P(AC + CA | R) [which uniquely defines their values, since they sum to 1] - - And, the P(r | AAAA + CCCC), ..., P(r | ACCC + CAAA) do not affect ratio (I) iff r's edges do not take part in a path from prev to cur in combination with the other reads in R. 
- */ - int prev = phasingSiteIndex - 1; - int cur = phasingSiteIndex; - - if (!readGraph.getConnectedComponents().inSameSet(prev, cur)) { // There is NO path between cur and prev - if (DEBUG) - logger.debug("NO READ PATH between PHASE site [" + cur + "] and UPSTREAM site [" + prev + "]"); - readsAtHetSites.clear(); - return keepReads; - } - - /* Check the connected components of prev and cur when removing each individual vertex's edges: - [Total run-time: for each vertex, calculate connected components after removing it's edges: O(V * E)] - */ - IntegerSet[] removedSiteSameCCAsPrev = new IntegerSet[numHetSites]; - IntegerSet[] removedSiteSameCCAsCur = new IntegerSet[numHetSites]; - for (int i : sitesWithEdges) { - if (DEBUG) logger.debug("Calculating CC after removing edges of site: " + i); - - // Remove all edges incident to i and see which positions have paths to prev and cur: - Collection removedEdges = readGraph.removeAllIncidentEdges(i); - - // Run-time for efficiently calculating connected components using DisjointSet: O(E) - DisjointSet ccAfterRemove = readGraph.getConnectedComponents(); - removedSiteSameCCAsPrev[i] = new IntegerSet(ccAfterRemove.inSameSetAs(prev, sitesWithEdges)); - removedSiteSameCCAsCur[i] = new IntegerSet(ccAfterRemove.inSameSetAs(cur, sitesWithEdges)); - - if (DEBUG) logger.debug("Same CC as previous [" + prev + "]: " + removedSiteSameCCAsPrev[i]); - if (DEBUG) logger.debug("Same CC as current [" + cur + "]: " + removedSiteSameCCAsCur[i]); - - // Add the removed edges back in: - readGraph.addEdges(removedEdges); - } - - for (PhasingGraphEdge e : readGraph) { - if (DEBUG) logger.debug("Testing the path-connectivity of Edge: " + e); - - /* Edge e={v1,v2} contributes a path between prev and cur for testRead iff: - testRead[v1] != null, testRead[v2] != null, and there is a path from prev ---> v1 -> v2 ---> cur [or vice versa]. 
- Note that the path from prev ---> v1 will NOT contain v2, since we removed all of v2's edges, - and the path from v2 ---> cur will NOT contain v1. - */ - boolean prevTo2and1ToCur = removedSiteSameCCAsPrev[e.getV1()].contains(e.getV2()) && removedSiteSameCCAsCur[e.getV2()].contains(e.getV1()); - boolean prevTo1and2ToCur = removedSiteSameCCAsPrev[e.getV2()].contains(e.getV1()) && removedSiteSameCCAsCur[e.getV1()].contains(e.getV2()); - - if (prevTo2and1ToCur || prevTo1and2ToCur) { - for (String readName : edgeToReads.getReads(e)) { - keepReads.add(readName); - - if (DEBUG && logger.isDebugEnabled()) { - if (prevTo2and1ToCur) - logger.debug("Keep read " + readName + " due to path: " + prev + " ---> " + e.getV2() + " -> " + e.getV1() + " ---> " + cur); - else - logger.debug("Keep read " + readName + " due to path: " + prev + " ---> " + e.getV1() + " -> " + e.getV2() + " ---> " + cur); - } - } - } - } - - // Retain only the reads that contain an edge in a path connecting prev and cur: - Iterator> readIt = readsAtHetSites.entrySet().iterator(); - while (readIt.hasNext()) { - Map.Entry nameToReads = readIt.next(); - String rdName = nameToReads.getKey(); - if (!keepReads.contains(rdName)) { - readIt.remove(); - if (DEBUG) logger.debug("Removing extraneous read: " + rdName); - } - } - - return keepReads; - } - - private List removeExtraneousSites(List listHetGenotypes) { - Set sitesWithReads = new HashSet(); - for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { - PhasingRead rd = nameToReads.getValue(); - for (int i : rd.getNonNullIndices()) - sitesWithReads.add(i); - } - - // Remove all sites that have no read bases: - List keepHetSites = new LinkedList(); - int index = 0; - int numPrecedingRemoved = 0; - for (GenotypeAndReadBases grb : listHetGenotypes) { - boolean keepSite = sitesWithReads.contains(index); - if (DEBUG && logger.isDebugEnabled() && !keepSite) - logger.debug("Removing read-less site " + grb.loc); - - if (keepSite || index == phasingSiteIndex || 
index == phasingSiteIndex - 1) { - keepHetSites.add(grb); - if (!keepSite) - if (DEBUG) - logger.debug("Although current or previous sites have no relevant reads, continuing empty attempt to phase them [for sake of program flow]..."); - } - else if (index <= phasingSiteIndex) - numPrecedingRemoved++; - - index++; - } - - phasingSiteIndex -= numPrecedingRemoved; - return keepHetSites; - } - - private List trimWindow(List listHetGenotypes, String sample, GenomeLoc phaseLocus) { - if (DEBUG) - logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes)); - - int prevSiteIndex = phasingSiteIndex - 1; // index of previous in listHetGenotypes - int numToUse = maxPhaseSites - 2; // since always keep previous and current het sites! - - int numOnLeft = prevSiteIndex; - int numOnRight = listHetGenotypes.size() - (phasingSiteIndex + 1); - - int useOnLeft, useOnRight; - if (numOnLeft <= numOnRight) { - int halfToUse = numToUse / 2; // skimp on the left [floor], and be generous with the right side - useOnLeft = Math.min(halfToUse, numOnLeft); - useOnRight = Math.min(numToUse - useOnLeft, numOnRight); - } - else { // numOnRight < numOnLeft - int halfToUse = new Double(Math.ceil(numToUse / 2.0)).intValue(); // be generous with the right side [ceil] - useOnRight = Math.min(halfToUse, numOnRight); - useOnLeft = Math.min(numToUse - useOnRight, numOnLeft); - } - int startIndex = prevSiteIndex - useOnLeft; - int stopIndex = phasingSiteIndex + useOnRight + 1; // put the index 1 past the desired index to keep - phasingSiteIndex -= startIndex; - listHetGenotypes = listHetGenotypes.subList(startIndex, stopIndex); - if (DEBUG) - logger.warn("NAIVELY REDUCED to " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes)); - - return listHetGenotypes; - } - } - - private PhaseResult phaseSampleAtSite(PhasingWindow 
phaseWindow) { - /* Will map a phase and its "complement" to a single representative phase, - and marginalizeAsNewTable() marginalizes to 2 positions [starting at the previous position, and then the current position]: - */ - HaplotypeTableCreator tabCreator = new TableCreatorOfHaplotypeAndComplementForDiploidAlleles(phaseWindow.hetGenotypes, phaseWindow.phasingSiteIndex - 1, 2); - PhasingTable sampleHaps = tabCreator.getNewTable(); - - if (DEBUG && logger.isDebugEnabled()) { - logger.debug("Number of USED reads [connecting the two positions to be phased] at sites: " + phaseWindow.readsAtHetSites.size()); - logger.debug("USED READS:"); - for (Map.Entry nameToReads : phaseWindow.readsAtHetSites.entrySet()) { - String rdName = nameToReads.getKey(); - PhasingRead rd = nameToReads.getValue(); - logger.debug(rd + "\t" + rdName); - } - } - - // Update the phasing table based on each of the sub-reads for this sample: - MaxHaplotypeAndQuality prevMaxHapAndQual = null; - - int numHighQualityIterations = 0; - int numInconsistentIterations = 0; - - double totalAbsPQchange = 0; - int numPQchangesObserved = 0; - - for (Map.Entry nameToReads : phaseWindow.readsAtHetSites.entrySet()) { - PhasingRead rd = nameToReads.getValue(); - if (DEBUG) logger.debug("\nrd = " + rd + "\tname = " + nameToReads.getKey()); - - for (PhasingTable.PhasingTableEntry pte : sampleHaps) { - PhasingScore score = rd.matchHaplotypeClassScore(pte.getHaplotypeClass()); - pte.getScore().integrateReadScore(score); - if (DEBUG) logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score); - } - - // Check the current best haplotype assignment and compare it to the previous one: - MaxHaplotypeAndQuality curMaxHapAndQual = new MaxHaplotypeAndQuality(sampleHaps, false); - if (DEBUG) - logger.debug("CUR MAX hap:\t" + curMaxHapAndQual.maxEntry.getHaplotypeClass() + "\tcurPhaseQuality:\t" + curMaxHapAndQual.phaseQuality); - if (prevMaxHapAndQual != null) { - double changeInPQ = 
prevMaxHapAndQual.phaseQuality - curMaxHapAndQual.phaseQuality; - - if (passesPhasingThreshold(prevMaxHapAndQual.phaseQuality)) { - numHighQualityIterations++; - if (!curMaxHapAndQual.hasSameRepresentativeHaplotype(prevMaxHapAndQual) || // switched phase - (numPQchangesObserved > 0 && changeInPQ > FRACTION_OF_MEAN_PQ_CHANGES * (totalAbsPQchange / numPQchangesObserved))) { // a "significant" decrease in PQ - if (DEBUG) logger.debug("Inconsistent read found!"); - numInconsistentIterations++; - } - } - - totalAbsPQchange += Math.abs(changeInPQ); - numPQchangesObserved++; - } - prevMaxHapAndQual = curMaxHapAndQual; - } - - if (DEBUG) logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n"); - MaxHaplotypeAndQuality maxHapQual = new MaxHaplotypeAndQuality(sampleHaps, DEBUG); - double posteriorProb = maxHapQual.maxEntry.getScore().getValue(); - - if (DEBUG) - logger.debug("MAX hap:\t" + maxHapQual.maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + maxHapQual.phaseQuality); - if (DEBUG) - logger.debug("Number of used reads " + phaseWindow.readsAtHetSites.size() + "; number of high PQ iterations " + numHighQualityIterations + "; number of inconsistencies " + numInconsistentIterations); - - boolean phasingContainsInconsistencies = false; - if (numInconsistentIterations / (double) numHighQualityIterations > MAX_FRACTION_OF_INCONSISTENT_READS) - phasingContainsInconsistencies = true; - - return new PhaseResult(maxHapQual.getRepresentative(), maxHapQual.phaseQuality, phasingContainsInconsistencies); - } - - private static class MaxHaplotypeAndQuality { - public PhasingTable.PhasingTableEntry maxEntry; - public double phaseQuality; - - public MaxHaplotypeAndQuality(PhasingTable hapTable, boolean printDebug) { - // Marginalize each haplotype to its first 2 positions: - hapTable = HaplotypeTableCreator.marginalizeAsNewTable(hapTable); - if (printDebug) - logger.debug("\nPhasing table [AFTER MAPPING]:\n" + hapTable + 
"\n"); - - calculateMaxHapAndPhasingQuality(hapTable, printDebug); - } - - // Calculates maxEntry and its PQ (within table hapTable): - - private void calculateMaxHapAndPhasingQuality(PhasingTable hapTable, boolean printDebug) { - hapTable.normalizeScores(); - if (printDebug) - logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + hapTable + "\n"); - - // Determine the phase at this position: - this.maxEntry = hapTable.maxEntry(); - - // convert posteriorProb to PHRED scale, but do NOT cap the quality as in QualityUtils.trueProbToQual(posteriorProb): - PreciseNonNegativeDouble sumErrorProbs = new PreciseNonNegativeDouble(ZERO); - for (PhasingTable.PhasingTableEntry pte : hapTable) { - if (pte != maxEntry) - sumErrorProbs.plusEqual(pte.getScore()); - } - this.phaseQuality = -10.0 * (sumErrorProbs.getLog10Value()); - } - - public boolean hasSameRepresentativeHaplotype(MaxHaplotypeAndQuality that) { - return this.getRepresentative().equals(that.getRepresentative()); - } - - private Haplotype getRepresentative() { - return maxEntry.getHaplotypeClass().getRepresentative(); - } - } - - /* - Ensure that curAllelePair is phased relative to prevAllelePair as specified by hap. 
- */ - - public static void ensurePhasing(SNPallelePair curAllelePair, SNPallelePair prevAllelePair, Haplotype hap) { - if (hap.size() < 2) - throw new ReviewedStingException("LOGICAL ERROR: Only considering haplotypes of length > 2!"); - - byte prevBase = hap.getBase(0); // The 1st base in the haplotype - byte curBase = hap.getBase(1); // The 2nd base in the haplotype - - boolean chosePrevTopChrom = prevAllelePair.matchesTopBase(prevBase); - boolean choseCurTopChrom = curAllelePair.matchesTopBase(curBase); - if (chosePrevTopChrom != choseCurTopChrom) - curAllelePair.swapAlleles(); - } - - private boolean startDistancesAreInWindowRange(VariantContext vc1, VariantContext vc2) { - return startDistancesAreInWindowRange(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc1), GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc2)); - } - - private boolean startDistancesAreInWindowRange(GenomeLoc loc1, GenomeLoc loc2) { - return loc1.distance(loc2) <= cacheWindow; // distance() checks: loc1.onSameContig(loc2) - } - - private int startDistance(UnfinishedVariantContext uvc1, VariantContext vc2) { - return uvc1.getLocation().distance(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc2)); - } - - public PhasingStats reduce(PhasingStatsAndOutput statsAndList, PhasingStats stats) { - if (statsAndList != null) { - writeVcList(statsAndList.output); - stats.addIn(statsAndList.ps); - } - return stats; - } - - /** - * Phase anything left in the cached unphasedSiteQueue, and report the number of reads and VariantContexts processed. - * - * @param result the number of reads and VariantContexts seen. 
- */ - public void onTraversalDone(PhasingStats result) { - List finalList = processQueue(result, true); // process all remaining data - writeVcList(finalList); - writer.close(); - - if (statsWriter != null) - statsWriter.close(); - - if (outputMultipleBaseCountsWriter != null) - outputMultipleBaseCountsWriter.close(); - - System.out.println("Coverage over ALL samples:"); - System.out.println("Number of reads observed: " + result.getNumReads()); - System.out.println("Number of variant sites observed: " + result.getNumVarSites()); - System.out.println("Average coverage: " + ((double) result.getNumReads() / result.getNumVarSites())); - - System.out.println("\n--- Phasing summary [minimal haplotype quality (PQ): " + phaseQualityThresh + ", maxPhaseSites: " + maxPhaseSites + ", cacheWindow: " + cacheWindow + "] ---"); - for (Map.Entry sampPhaseCountEntry : result.getPhaseCounts()) { - PhaseCounts pc = sampPhaseCountEntry.getValue(); - System.out.print("Sample: " + sampPhaseCountEntry.getKey() + "\tSites tested: " + pc.numTestedSites + "\tSites phased: " + pc.numPhased); - System.out.println("\tPhase-inconsistent sites: " + (pc.numInconsistentSitesPhased + pc.numInconsistentSitesNotPhased) + " [phased: " + pc.numInconsistentSitesPhased + ", unphased:" + pc.numInconsistentSitesNotPhased + "]"); - } - System.out.println(""); - } - - private void writeVcList(List varContList) { - for (VariantContext vc : varContList) - writeVCF(vc); - } - - private void writeVCF(VariantContext vc) { - if (samplesToPhase == null || vc.isNotFiltered()) - //if ( samplesToPhase == null || (vc.isVariant() && vc.isNotFiltered())) // if we are only operating on specific samples, don't write out all sites, just those where the VC is variant - writer.add(vc); - } - - public static boolean processVariantInPhasing(VariantContext vc) { - return vc.isNotFiltered() && ((vc.isSNP() && vc.isBiallelic()) || !vc.isVariant()); // we can handle the non-variant case as well - //return 
isUnfilteredBiallelicSNP(vc); - } - - - /* - Inner classes: - */ - - private class VariantAndReads { - public VariantContext variant; - public HashMap sampleReadBases; - - public VariantAndReads(VariantContext variant, HashMap sampleReadBases) { - this.variant = variant; - this.sampleReadBases = sampleReadBases; - } - - public VariantAndReads(VariantContext variant, AlignmentContext alignment) { - this.variant = variant; - this.sampleReadBases = new HashMap(); - - if (alignment != null) { - ReadBackedPileup pileup = alignment.getBasePileup(); - if (pileup != null) { - // filter the read-base pileup based on min base and mapping qualities: - pileup = pileup.getBaseAndMappingFilteredPileup(MIN_BASE_QUALITY_SCORE, MIN_MAPPING_QUALITY_SCORE); - if (pileup != null) { - for (final String sample : pileup.getSamples()) { - ReadBackedPileup samplePileup = pileup.getPileupForSample(sample); - ReadBasesAtPosition readBases = new ReadBasesAtPosition(); - for (PileupElement p : samplePileup) { - if (!p.isDeletion()) // IGNORE deletions for now - readBases.putReadBase(p); - } - sampleReadBases.put(sample, readBases); - } - } - } - } - } - } - - private class UnfinishedVariantAndReads { - public UnfinishedVariantContext unfinishedVariant; - public HashMap sampleReadBases; - - public UnfinishedVariantAndReads(VariantAndReads vr) { - this.unfinishedVariant = new UnfinishedVariantContext(vr.variant); - this.sampleReadBases = vr.sampleReadBases; - } - } - - // COULD replace with MutableVariantContext if it worked [didn't throw exceptions when trying to call its set() methods]... 
- - private class UnfinishedVariantContext implements HasGenomeLocation { - private String name; - private String contig; - private int start; - private int stop; - private Collection alleles; - private Map genotypes; - private double log10PError; - private Set filters; - private Map attributes; - private String id; - - public UnfinishedVariantContext(VariantContext vc) { - this.name = vc.getSource(); - this.id = vc.getID(); - this.contig = vc.getChr(); - this.start = vc.getStart(); - this.stop = vc.getEnd(); - this.alleles = vc.getAlleles(); - - this.genotypes = new HashMap(); - for ( final Genotype g : vc.getGenotypes() ) { - this.genotypes.put(g.getSampleName(), g); - } - - this.log10PError = vc.getLog10PError(); - this.filters = vc.filtersWereApplied() ? vc.getFilters() : null; - this.attributes = new HashMap(vc.getAttributes()); - } - - public VariantContext toVariantContext() { - GenotypesContext gc = GenotypesContext.copy(this.genotypes.values()); - return new VariantContextBuilder(name, contig, start, stop, alleles).id(id) - .genotypes(gc).log10PError(log10PError).filters(filters).attributes(attributes).make(); - } - - public GenomeLoc getLocation() { - return getToolkit().getGenomeLocParser().createGenomeLoc(contig, start, stop); - } - - public Genotype getGenotype(String sample) { - return genotypes.get(sample); - } - - public void setGenotype(String sample, Genotype newGt) { - this.genotypes.put(sample, newGt); - } - - public void setPhasingInconsistent() { - attributes.put(PHASING_INCONSISTENT_KEY, true); - } - } - - private static String toStringGRL(List grbList) { - boolean first = true; - StringBuilder sb = new StringBuilder(); - for (GenotypeAndReadBases grb : grbList) { - if (first) - first = false; - else - sb.append(" -- "); - - sb.append(grb.loc); - } - return sb.toString(); - } - - private String toStringVCL(List vcList) { - boolean first = true; - StringBuilder sb = new StringBuilder(); - for (VariantContext vc : vcList) { - if (first) - first 
= false; - else - sb.append(" -- "); - - sb.append(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); - } - return sb.toString(); - } - -// -// THIS IMPLEMENTATION WILL FAIL WHEN NOT DEALING WITH SNP Alleles (e.g., MNP or INDEL), SINCE THEN THE Allele.getBases() -// FUNCTION WILL RETURN VARIABLE-LENGTH Byte ARRAYS. IN THAT CASE, BaseArray/Haplotype/Read WILL NEED TO BE REPLACED WITH -// AN ArrayList OF Allele [OR SIMILAR OBJECT], and WON'T USE: getSingleBase(alleleI) -// - - private static abstract class HaplotypeTableCreator { - protected Genotype[] genotypes; - - public HaplotypeTableCreator(Genotype[] hetGenotypes) { - this.genotypes = hetGenotypes; - } - - abstract public PhasingTable getNewTable(); - - protected List getAllHaplotypes() { - int numSites = genotypes.length; - int[] genotypeCards = new int[numSites]; - for (int i = 0; i < numSites; i++) - genotypeCards[i] = genotypes[i].getPloidy(); - - LinkedList allHaps = new LinkedList(); - CardinalityCounter alleleCounter = new CardinalityCounter(genotypeCards); - for (int[] alleleInds : alleleCounter) { - byte[] hapBases = new byte[numSites]; - for (int i = 0; i < numSites; i++) { - Allele alleleI = genotypes[i].getAllele(alleleInds[i]); - hapBases[i] = SNPallelePair.getSingleBase(alleleI); - } - allHaps.add(new Haplotype(hapBases)); - } - return allHaps; - } - - public static PhasingTable marginalizeAsNewTable(PhasingTable table) { - TreeMap hapMap = new TreeMap(); - for (PhasingTable.PhasingTableEntry pte : table) { - Haplotype rep = pte.getHaplotypeClass().getRepresentative(); - PreciseNonNegativeDouble score = hapMap.get(rep); - if (score == null) { - score = new PreciseNonNegativeDouble(ZERO); - hapMap.put(rep, score); - } - score.plusEqual(pte.getScore()); - } - - PhasingTable margTable = new PhasingTable(); - for (Map.Entry hapClassAndScore : hapMap.entrySet()) { - Haplotype rep = hapClassAndScore.getKey(); - ArrayList hapList = new ArrayList(); - hapList.add(rep); - - 
HaplotypeClass hc = new HaplotypeClass(hapList, rep); - margTable.addEntry(hc, hapClassAndScore.getValue()); - } - return margTable; - } - } - - private static class TableCreatorOfHaplotypeAndComplementForDiploidAlleles extends HaplotypeTableCreator { - private SNPallelePair[] SNPallelePairs; - private int startIndex; - private int marginalizeLength; - - public TableCreatorOfHaplotypeAndComplementForDiploidAlleles(Genotype[] hetGenotypes, int startIndex, int marginalizeLength) { - super(hetGenotypes); - - this.SNPallelePairs = new SNPallelePair[genotypes.length]; - for (int i = 0; i < genotypes.length; i++) - SNPallelePairs[i] = new SNPallelePair(genotypes[i]); - - this.startIndex = startIndex; - this.marginalizeLength = marginalizeLength; - } - - public PhasingTable getNewTable() { - PhasingTable table = new PhasingTable(); - for (Haplotype hap : getAllHaplotypes()) { - if (SNPallelePairs[startIndex].matchesTopBase(hap.getBase(startIndex))) { - /* hap is the "representative" haplotype [DEFINED here to be - the one with the top base at the startIndex position. - NOTE that it is CRITICAL that this definition be consistent with the representative sub-haplotypes defined below!] 
- */ - ArrayList hapList = new ArrayList(); - hapList.add(hap); - hapList.add(complement(hap)); - - // want marginalizeLength positions starting at startIndex: - Haplotype rep = hap.subHaplotype(startIndex, startIndex + marginalizeLength); - double hapClassPrior = getHaplotypeRepresentativePrior(rep); // Note that prior is ONLY a function of the representative haplotype - - HaplotypeClass hapClass = new HaplotypeClass(hapList, rep); - table.addEntry(hapClass, hapClassPrior); - } - } - return table; - } - - // Can change later to weight the representative Haplotypes differently: - - private double getHaplotypeRepresentativePrior(Haplotype rep) { - return 1.0; - } - - private Haplotype complement(Haplotype hap) { - int numSites = SNPallelePairs.length; - if (hap.size() != numSites) - throw new ReviewedStingException("INTERNAL ERROR: hap.size() != numSites"); - - // Take the other base at EACH position of the Haplotype: - byte[] complementBases = new byte[numSites]; - for (int i = 0; i < numSites; i++) - complementBases[i] = SNPallelePairs[i].getOtherBase(hap.getBase(i)); - - return new Haplotype(complementBases); - } - } - - private static class PhasingTable implements Iterable { - private LinkedList table; - - public PhasingTable() { - this.table = new LinkedList(); - } - - public PhasingTableEntry addEntry(HaplotypeClass haplotypeClass, PreciseNonNegativeDouble initialScore) { - PhasingTableEntry pte = new PhasingTableEntry(haplotypeClass, new PhasingScore(initialScore)); - table.add(pte); - return pte; - } - - public PhasingTableEntry addEntry(HaplotypeClass haplotypeClass, double initialScore) { - return addEntry(haplotypeClass, new PreciseNonNegativeDouble(initialScore)); - } - - public Iterator iterator() { - return table.iterator(); - } - - public boolean isEmpty() { - return table.isEmpty(); - } - - public PhasingTableEntry maxEntry() { - if (table.isEmpty()) - return null; - - PhasingTableEntry maxPte = null; - for (PhasingTableEntry pte : table) { - if 
(maxPte == null || pte.getScore().gt(maxPte.getScore())) { - maxPte = pte; - } - } - return maxPte; - } - - public void normalizeScores() { - PreciseNonNegativeDouble normalizeBy = new PreciseNonNegativeDouble(ZERO); - for (PhasingTableEntry pte : table) - normalizeBy.plusEqual(pte.getScore()); - - if (!normalizeBy.equals(ZERO)) { // prevent precision problems - for (PhasingTableEntry pte : table) - pte.getScore().divEqual(normalizeBy); - } - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("-------------------\n"); - for (PhasingTableEntry pte : this) { - sb.append("Haplotypes:\t" + pte.getHaplotypeClass() + "\tScore:\t" + pte.getScore() + "\n"); - } - sb.append("-------------------\n"); - return sb.toString(); - } - - public static class PhasingTableEntry implements Comparable { - private HaplotypeClass haplotypeClass; - private PhasingScore score; - - public PhasingTableEntry(HaplotypeClass haplotypeClass, PhasingScore score) { - this.haplotypeClass = haplotypeClass; - this.score = score; - } - - public HaplotypeClass getHaplotypeClass() { - return haplotypeClass; - } - - public PhasingScore getScore() { - return score; - } - - public int compareTo(PhasingTableEntry that) { - return this.getScore().compareTo(that.getScore()); - } - } - } - - private static class PhaseResult { - public Haplotype haplotype; - public double phaseQuality; - public boolean phasingContainsInconsistencies; - - public PhaseResult(Haplotype haplotype, double phaseQuality, boolean phasingContainsInconsistencies) { - this.haplotype = haplotype; - this.phaseQuality = phaseQuality; - this.phasingContainsInconsistencies = phasingContainsInconsistencies; - } - } - - public static boolean isUnfilteredBiallelicSNP(VariantContext vc) { - return (vc.isNotFiltered() && vc.isSNP() && vc.isBiallelic()); - } - - public static boolean isUnfilteredCalledDiploidGenotype(Genotype gt) { - return (! 
gt.isFiltered() && gt.isCalled() && gt.getPloidy() == 2); - } - - private class MultipleBaseCountsWriter { - private BufferedWriter writer = null; - private TreeMap multipleBaseCounts = null; - - public MultipleBaseCountsWriter(File outputMultipleBaseCountsFile) { - FileOutputStream output; - try { - output = new FileOutputStream(outputMultipleBaseCountsFile); - } catch (FileNotFoundException e) { - throw new RuntimeException("Unable to create multiple base count file at location: " + outputMultipleBaseCountsFile); - } - this.writer = new BufferedWriter(new OutputStreamWriter(output)); - - this.multipleBaseCounts = new TreeMap(); // implemented SampleReadLocus.compareTo() - } - - public void setMultipleBases(SampleReadLocus srl, GenomeLoc phasingLoc, byte prevBase, byte newBase) { - MultipleBaseCounts mbc = multipleBaseCounts.get(srl); - if (mbc == null) { - mbc = new MultipleBaseCounts(phasingLoc); - mbc.incrementBaseCount(prevBase); // only now, do we know to note this - multipleBaseCounts.put(srl, mbc); - } - if (mbc.samePhasingLocAs(phasingLoc)) // otherwise, don't want to count these multiple base counts again - mbc.incrementBaseCount(newBase); - - } - - public void outputMultipleBaseCounts() { - GenomeLoc nextToPhaseLoc = null; - if (!unphasedSiteQueue.isEmpty()) - nextToPhaseLoc = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), unphasedSiteQueue.peek().variant); - - outputMultipleBaseCounts(nextToPhaseLoc); - } - - private void outputMultipleBaseCounts(GenomeLoc nextToPhaseLoc) { - try { - Iterator> multBaseCountIt = multipleBaseCounts.entrySet().iterator(); - while (multBaseCountIt.hasNext()) { - Map.Entry sampleReadLocBaseCountsEntry = multBaseCountIt.next(); - SampleReadLocus srl = sampleReadLocBaseCountsEntry.getKey(); - if (nextToPhaseLoc == null || !startDistancesAreInWindowRange(srl.getLocus(), nextToPhaseLoc)) { - // Done with entry, so print it and remove it from map: - writer.write(srl + "\t" + 
sampleReadLocBaseCountsEntry.getValue() + "\n"); - multBaseCountIt.remove(); - } - } - writer.flush(); - } catch (IOException e) { - throw new RuntimeException("Unable to write to outputMultipleBaseCountsFile", e); - } - } - - public void close() { - outputMultipleBaseCounts(null); - - try { - writer.flush(); - writer.close(); - } catch (IOException e) { - throw new RuntimeException("Unable to close outputMultipleBaseCountsFile"); - } - } - } -} - - -class PhasingScore extends PreciseNonNegativeDouble { - public PhasingScore(double score) { - super(score); - } - - public PhasingScore(PreciseNonNegativeDouble val) { - super(val); - } - - public PhasingScore integrateReadScore(PhasingScore score) { - timesEqual(score); - return this; - } -} - -class HaplotypeClass implements Iterable { - private ArrayList haps; - private Haplotype rep; - - public HaplotypeClass(ArrayList haps, Haplotype rep) { - this.haps = haps; - this.rep = rep; - } - - public Iterator iterator() { - return haps.iterator(); - } - - public Haplotype getRepresentative() { - return rep; - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - boolean isFirst = true; - for (Haplotype h : haps) { - if (isFirst) - isFirst = false; - else - sb.append(" + "); - - sb.append(h); - } - sb.append(" [").append(rep).append("]"); - return sb.toString(); - } -} - -class PhasingStats { - private int numReads; - private int numVarSites; - - // Map of: sample -> PhaseCounts: - private Map samplePhaseStats; - - public PhasingStats() { - this(new TreeMap()); - } - - public PhasingStats(int numReads, int numVarSites) { - this.numReads = numReads; - this.numVarSites = numVarSites; - this.samplePhaseStats = new TreeMap(); - } - - public PhasingStats(Map samplePhaseStats) { - this.numReads = 0; - this.numVarSites = 0; - this.samplePhaseStats = samplePhaseStats; - } - - public void addIn(PhasingStats other) { - this.numReads += other.numReads; - this.numVarSites += other.numVarSites; - - for (Map.Entry 
sampPhaseEntry : other.samplePhaseStats.entrySet()) { - String sample = sampPhaseEntry.getKey(); - PhaseCounts otherCounts = sampPhaseEntry.getValue(); - PhaseCounts thisCounts = this.samplePhaseStats.get(sample); - if (thisCounts == null) { - thisCounts = new PhaseCounts(); - this.samplePhaseStats.put(sample, thisCounts); - } - thisCounts.addIn(otherCounts); - } - } - - public int getNumReads() { - return numReads; - } - - public int getNumVarSites() { - return numVarSites; - } - - public Collection> getPhaseCounts() { - return samplePhaseStats.entrySet(); - } -} - -class PhaseCounts { - public int numTestedSites; // number of het sites directly succeeding het sites - public int numInconsistentSitesPhased; - public int numInconsistentSitesNotPhased; - public int numPhased; - - public PhaseCounts() { - this.numTestedSites = 0; - this.numInconsistentSitesPhased = 0; - this.numInconsistentSitesNotPhased = 0; - this.numPhased = 0; - } - - public void addIn(PhaseCounts other) { - this.numTestedSites += other.numTestedSites; - this.numInconsistentSitesPhased += other.numInconsistentSitesPhased; - this.numInconsistentSitesNotPhased += other.numInconsistentSitesNotPhased; - this.numPhased += other.numPhased; - } -} - -class PhasingStatsAndOutput { - public PhasingStats ps; - public List output; - - public PhasingStatsAndOutput(PhasingStats ps, List output) { - this.ps = ps; - this.output = output; - } -} - -class PhasingQualityStatsWriter { - private String variantStatsFilePrefix; - private HashMap sampleToStatsWriter = new HashMap(); - - public PhasingQualityStatsWriter(String variantStatsFilePrefix) { - this.variantStatsFilePrefix = variantStatsFilePrefix; - } - - public void addStat(String sample, GenomeLoc locus, int startDistanceFromPrevious, double phasingQuality, int numReads, int windowSize) { - BufferedWriter sampWriter = sampleToStatsWriter.get(sample); - if (sampWriter == null) { - String fileName = variantStatsFilePrefix + "." 
+ sample + ".locus_distance_PQ_numReads_windowSize.txt"; - - FileOutputStream output; - try { - output = new FileOutputStream(fileName); - } catch (FileNotFoundException e) { - throw new RuntimeException("Unable to create phasing quality stats file at location: " + fileName); - } - sampWriter = new BufferedWriter(new OutputStreamWriter(output)); - sampleToStatsWriter.put(sample, sampWriter); - } - try { - sampWriter.write(locus + "\t" + startDistanceFromPrevious + "\t" + phasingQuality + "\t" + numReads + "\t" + windowSize + "\n"); - sampWriter.flush(); - } catch (IOException e) { - throw new RuntimeException("Unable to write to per-sample phasing quality stats file", e); - } - } - - public void close() { - for (Map.Entry sampWriterEntry : sampleToStatsWriter.entrySet()) { - BufferedWriter sampWriter = sampWriterEntry.getValue(); - try { - sampWriter.flush(); - sampWriter.close(); - } catch (IOException e) { - throw new RuntimeException("Unable to close per-sample phasing quality stats file"); - } - } - } -} - -class SampleReadLocus implements Comparable { - private String sample; - private String read; - private GenomeLoc locus; - - public SampleReadLocus(String sample, String read, GenomeLoc locus) { - this.sample = sample; - this.read = read; - this.locus = locus; - } - - public GenomeLoc getLocus() { - return locus; - } - - public int compareTo(SampleReadLocus that) { - int comp = this.sample.compareTo(that.sample); - if (comp != 0) - return comp; - - comp = this.read.compareTo(that.read); - if (comp != 0) - return comp; - - return this.locus.compareTo(that.locus); - } - - public String toString() { - return "Sample " + sample + ", read " + read + ", locus " + locus; - } -} - -class MultipleBaseCounts { - private Map baseCounts; - private GenomeLoc phasingLocus; - - public MultipleBaseCounts(GenomeLoc phasingLoc) { - this.baseCounts = new HashMap(); - this.phasingLocus = phasingLoc; - } - - public boolean samePhasingLocAs(GenomeLoc loc) { - return 
phasingLocus.equals(loc); - } - - public void incrementBaseCount(byte base) { - int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); - Integer cnt = baseCounts.get(baseIndex); - if (cnt == null) - cnt = 0; - - baseCounts.put(baseIndex, cnt + 1); - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - - sb.append("Base counts"); - for (Map.Entry baseCountEntry : baseCounts.entrySet()) { - byte base = BaseUtils.baseIndexToSimpleBase(baseCountEntry.getKey()); - int cnt = baseCountEntry.getValue(); - sb.append("\t" + (char) base + ": " + cnt); - } - - return sb.toString(); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java deleted file mode 100644 index 2f8295008..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java +++ /dev/null @@ -1,175 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.ReadFilters; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.io.PrintStream; -import java.util.HashSet; -import java.util.Set; - -/** - * Emits intervals present in either the original or reduced bam but not the other. - * - *

Input

- *

- * The original and reduced BAM files. - *

- * - *

Output

- *

- * A list of intervals present in one bam but not the other. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -I:original original.bam \
- *   -I:reduced reduced.bam \
- *   -R ref.fasta \
- *   -T AssessReducedCoverage \
- *   -o output.intervals
- * 
- * - * @author ebanks - */ -@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) -@Hidden -public class AssessReducedCoverage extends LocusWalker implements TreeReducible { - - private static final String original = "original"; - private static final String reduced = "reduced"; - - @Output - protected PrintStream out; - - @Override - public boolean includeReadsWithDeletionAtLoci() { return true; } - - @Argument(fullName = "output_reduced_only_coverage", shortName = "output_reduced_only_coverage", doc = "Output an interval if the reduced bam has coverage where the original does not", required = false) - public boolean OUTPUT_REDUCED_ONLY_INTERVALS = false; - - public void initialize() {} - - public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - if ( tracker == null ) - return null; - - final Set tags = getAllTags(context.getBasePileup()); - return (tags.contains(original) && !tags.contains(reduced)) || - (OUTPUT_REDUCED_ONLY_INTERVALS && tags.contains(reduced) && !tags.contains(original)) ? 
ref.getLocus() : null; - } - - private Set getAllTags(final ReadBackedPileup pileup) { - - final Set tags = new HashSet(10); - - for ( final PileupElement p : pileup ) { - if ( (int)p.getQual() > 2 && p.getMappingQual() > 0 && !p.isDeletion() ) - tags.addAll(getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags()); - } - - return tags; - } - - public void onTraversalDone(GenomeLoc sum) { - if ( sum != null ) - out.println(sum); - } - - public GenomeLoc reduceInit() { - return null; - } - - public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { - if ( lhs == null ) - return rhs; - - if ( rhs == null ) - return lhs; - - // if contiguous, just merge them - if ( lhs.contiguousP(rhs) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); - - // otherwise, print the lhs and start over with the rhs - out.println(lhs); - return rhs; - } - - public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { - if ( value == null ) - return sum; - - if ( sum == null ) - return value; - - // if contiguous, just merge them - if ( sum.contiguousP(value) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); - - // otherwise, print the sum and start over with the value - out.println(sum); - return value; - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java deleted file mode 100644 index 25f6f874d..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java +++ /dev/null @@ -1,208 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.io.PrintStream; -import java.util.List; - -/** - * Emits intervals in which the differences between the original and reduced bam quals are bigger epsilon (unless the quals of - * the reduced bam are above sufficient threshold) - * - *

Input

- *

- * The original and reduced BAM files. - *

- * - *

Output

- *

- * A list of intervals in which the differences between the original and reduced bam quals are bigger epsilon. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -I:original original.bam \
- *   -I:reduced reduced.bam \
- *   -R ref.fasta \
- *   -T AssessReducedQuals \
- *   -o output.intervals
- * 
- * - * @author ami - */ -@Hidden -public class AssessReducedQuals extends LocusWalker implements TreeReducible { - - private static final String reduced = "reduced"; - private static final int originalQualsIndex = 0; - private static final int reducedQualsIndex = 1; - - @Argument(fullName = "sufficientQualSum", shortName = "sufficientQualSum", doc = "When a reduced bam qual sum is above this threshold, it passes even without comparing to the non-reduced bam ", required = false) - public int sufficientQualSum = 600; - - @Argument(fullName = "qual_epsilon", shortName = "epsilon", doc = "when |Quals_reduced_bam - Quals_original_bam| > (epsilon * Quals_original_bam) we output this interval", required = false) - public double qual_epsilon = 0.10; - - @Argument(fullName = "exclude_low_mq", shortName = "excludeMQ", doc = "ignore reads with mapping quality below this number", required = false) - public int excludeMQ = 0; - - @Output - protected PrintStream out; - - public void initialize() { - if ( qual_epsilon < 0.0 || qual_epsilon > 1.0 ) - throw new UserException.BadArgumentValue("qual_epsilon", "must be a number between 0 and 1"); - } - - @Override - public boolean includeReadsWithDeletionAtLoci() { return true; } - - @Override - public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return null; - - boolean reportLocus; - final int[] quals = getPileupQuals(context.getBasePileup()); - final int epsilon = MathUtils.fastRound(quals[originalQualsIndex] * qual_epsilon); - final int calcOriginalQuals = Math.min(quals[originalQualsIndex], sufficientQualSum); - final int calcReducedQuals = Math.min(quals[reducedQualsIndex], sufficientQualSum); - final int originalReducedQualDiff = calcOriginalQuals - calcReducedQuals; - reportLocus = originalReducedQualDiff > epsilon || originalReducedQualDiff < -1 * epsilon; - - return reportLocus ? 
ref.getLocus() : null; - } - - /** - * Get the quals separated by version and strand - * @param readPileup the pileup - * @return 2x2 array with sum of quals separated by version in 1st dimension and strand in the 2nd - */ - private int[] getPileupQuals(final ReadBackedPileup readPileup) { - - final int[] quals = new int[2]; - - for ( final PileupElement p : readPileup ) { - final List tags = getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags(); - if ( isGoodRead(p) ) { - final int tempQual = (int)(p.getQual()) * p.getRepresentativeCount(); - final int tagIndex = getTagIndex(tags); - quals[tagIndex] += tempQual; - } - } - - return quals; - } - - private boolean isGoodRead(final PileupElement p) { - return !p.isDeletion() && (int)p.getQual() >= 15 && p.getMappingQual() >= excludeMQ; - } - - private int getTagIndex(final List tags) { - return tags.contains(reduced) ? 1 : 0; - } - - @Override - public void onTraversalDone(GenomeLoc sum) { - if ( sum != null ) - out.println(sum); - } - - @Override - public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { - if ( lhs == null ) - return rhs; - - if ( rhs == null ) - return lhs; - - // if contiguous, just merge them - if ( lhs.contiguousP(rhs) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); - - // otherwise, print the lhs and start over with the rhs - out.println(lhs); - return rhs; - } - - @Override - public GenomeLoc reduceInit() { - return null; - } - - @Override - public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { - if ( value == null ) - return sum; - - if ( sum == null ) - return value; - - // if contiguous, just merge them - if ( sum.contiguousP(value) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); - - // otherwise, print the sum and start over with the value - out.println(sum); - return value; - } -} diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java deleted file mode 100644 index 63bd5f14d..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java +++ /dev/null @@ -1,214 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.variantrecalibration; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.*; -import java.util.*; - -/** - * Created by IntelliJ IDEA. 
- * User: rpoplin - * Date: Mar 10, 2011 - */ - -public class Tranche { - private static final int CURRENT_VERSION = 5; - - public double ts, minVQSLod, knownTiTv, novelTiTv; - public int numKnown,numNovel; - public String name; - public VariantRecalibratorArgumentCollection.Mode model; - - int accessibleTruthSites = 0; - int callsAtTruthSites = 0; - - public Tranche(double ts, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites, VariantRecalibratorArgumentCollection.Mode model) { - this(ts, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, model, "anonymous"); - } - - public Tranche(double ts, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites, VariantRecalibratorArgumentCollection.Mode model, String name ) { - this.ts = ts; - this.minVQSLod = minVQSLod; - this.novelTiTv = novelTiTv; - this.numNovel = numNovel; - this.knownTiTv = knownTiTv; - this.numKnown = numKnown; - this.model = model; - this.name = name; - - this.accessibleTruthSites = accessibleTruthSites; - this.callsAtTruthSites = callsAtTruthSites; - - if ( ts < 0.0 || ts > 100.0) - throw new UserException("Target FDR is unreasonable " + ts); - - if ( numKnown < 0 || numNovel < 0) - throw new ReviewedStingException("Invalid tranche - no. variants is < 0 : known " + numKnown + " novel " + numNovel); - - if ( name == null ) - throw new ReviewedStingException("BUG -- name cannot be null"); - } - - private double getTruthSensitivity() { - return accessibleTruthSites > 0 ? 
callsAtTruthSites / (1.0*accessibleTruthSites) : 0.0; - } - - public static class TrancheTruthSensitivityComparator implements Comparator, Serializable { - @Override - public int compare(final Tranche tranche1, final Tranche tranche2) { - return Double.compare(tranche1.ts, tranche2.ts); - } - } - - @Override - public String toString() { - return String.format("Tranche ts=%.2f minVQSLod=%.4f known=(%d @ %.4f) novel=(%d @ %.4f) truthSites(%d accessible, %d called), name=%s]", - ts, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, name); - } - - /** - * Returns an appropriately formatted string representing the raw tranches file on disk. - * - * @param tranches - * @return - */ - public static String tranchesString( final List tranches ) { - final ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - final PrintStream stream = new PrintStream(bytes); - - Collections.sort( tranches, new TrancheTruthSensitivityComparator() ); - - stream.println("# Variant quality score tranches file"); - stream.println("# Version number " + CURRENT_VERSION); - stream.println("targetTruthSensitivity,numKnown,numNovel,knownTiTv,novelTiTv,minVQSLod,filterName,model,accessibleTruthSites,callsAtTruthSites,truthSensitivity"); - - Tranche prev = null; - for ( Tranche t : tranches ) { - stream.printf("%.2f,%d,%d,%.4f,%.4f,%.4f,VQSRTranche%s%.2fto%.2f,%s,%d,%d,%.4f%n", - t.ts, t.numKnown, t.numNovel, t.knownTiTv, t.novelTiTv, t.minVQSLod, t.model.toString(), - (prev == null ? 0.0 : prev.ts), t.ts, t.model.toString(), t.accessibleTruthSites, t.callsAtTruthSites, t.getTruthSensitivity()); - prev = t; - } - - return bytes.toString(); - } - - private static double getDouble(Map bindings, String key, boolean required) { - if ( bindings.containsKey(key) ) { - String val = bindings.get(key); - return Double.valueOf(val); - } - else if ( required ) { - throw new UserException.MalformedFile("Malformed tranches file. 
Missing required key " + key); - } - else - return -1; - } - - private static int getInteger(Map bindings, String key, boolean required) { - if ( bindings.containsKey(key) ) - return Integer.valueOf(bindings.get(key)); - else if ( required ) { - throw new UserException.MalformedFile("Malformed tranches file. Missing required key " + key); - } - else - return -1; - } - - /** - * Returns a list of tranches, sorted from most to least specific, read in from file f - * - * @param f - * @return - */ - public static List readTranches(File f) { - String[] header = null; - List tranches = new ArrayList(); - - try { - for( final String line : new XReadLines(f) ) { - if ( line.startsWith("#") ) - continue; - - final String[] vals = line.split(","); - if( header == null ) { - header = vals; - if ( header.length == 5 || header.length == 8 || header.length == 10 ) - // old style tranches file, throw an error - throw new UserException.MalformedFile(f, "Unfortunately your tranches file is from a previous version of this tool and cannot be used with the latest code. Please rerun VariantRecalibrator"); - if ( header.length != 11 ) - throw new UserException.MalformedFile(f, "Expected 11 elements in header line " + line); - } else { - if ( header.length != vals.length ) - throw new UserException.MalformedFile(f, "Line had too few/many fields. Header = " + header.length + " vals " + vals.length + ". 
The line was: " + line); - - Map bindings = new HashMap(); - for ( int i = 0; i < vals.length; i++ ) bindings.put(header[i], vals[i]); - tranches.add(new Tranche(getDouble(bindings,"targetTruthSensitivity", true), - getDouble(bindings,"minVQSLod", true), - getInteger(bindings,"numKnown", false), - getDouble(bindings,"knownTiTv", false), - getInteger(bindings,"numNovel", true), - getDouble(bindings,"novelTiTv", true), - getInteger(bindings,"accessibleTruthSites", false), - getInteger(bindings,"callsAtTruthSites", false), - VariantRecalibratorArgumentCollection.parseString(bindings.get("model")), - bindings.get("filterName"))); - } - } - - Collections.sort( tranches, new TrancheTruthSensitivityComparator() ); - return tranches; - } catch( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(f, e); - } - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java deleted file mode 100644 index ac4654f73..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ /dev/null @@ -1,419 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.variantrecalibration; - -import org.apache.commons.lang.ArrayUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.collections.ExpandingArrayList; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. 
- * User: rpoplin - * Date: Mar 4, 2011 - */ - -public class VariantDataManager { - private List data; - private double[] meanVector; - private double[] varianceVector; // this is really the standard deviation - public List annotationKeys; - private final VariantRecalibratorArgumentCollection VRAC; - protected final static Logger logger = Logger.getLogger(VariantDataManager.class); - protected final List trainingSets; - - public VariantDataManager( final List annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) { - this.data = null; - this.annotationKeys = new ArrayList<>( annotationKeys ); - this.VRAC = VRAC; - meanVector = new double[this.annotationKeys.size()]; - varianceVector = new double[this.annotationKeys.size()]; - trainingSets = new ArrayList<>(); - } - - public void setData( final List data ) { - this.data = data; - } - - public List getData() { - return data; - } - - public void normalizeData() { - boolean foundZeroVarianceAnnotation = false; - for( int iii = 0; iii < meanVector.length; iii++ ) { - final double theMean = mean(iii, true); - final double theSTD = standardDeviation(theMean, iii, true); - logger.info( annotationKeys.get(iii) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) ); - if( Double.isNaN(theMean) ) { - throw new UserException.BadInput("Values for " + annotationKeys.get(iii) + " annotation not detected for ANY training variant in the input callset. VariantAnnotator may be used to add these annotations. See " + HelpConstants.forumPost("discussion/49/using-variant-annotator")); - } - - foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-5); - meanVector[iii] = theMean; - varianceVector[iii] = theSTD; - for( final VariantDatum datum : data ) { - // Transform each data point via: (x - mean) / standard deviation - datum.annotations[iii] = ( datum.isNull[iii] ? 
0.1 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD ); - } - } - if( foundZeroVarianceAnnotation ) { - throw new UserException.BadInput( "Found annotations with zero variance. They must be excluded before proceeding." ); - } - - // trim data by standard deviation threshold and mark failing data for exclusion later - for( final VariantDatum datum : data ) { - boolean remove = false; - for( final double val : datum.annotations ) { - remove = remove || (Math.abs(val) > VRAC.STD_THRESHOLD); - } - datum.failingSTDThreshold = remove; - } - - // re-order the data by increasing standard deviation so that the results don't depend on the order things were specified on the command line - // standard deviation over the training points is used as a simple proxy for information content, perhaps there is a better thing to use here - final List theOrder = calculateSortOrder(meanVector); - annotationKeys = reorderList(annotationKeys, theOrder); - varianceVector = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(varianceVector), theOrder)); - meanVector = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(meanVector), theOrder)); - for( final VariantDatum datum : data ) { - datum.annotations = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(datum.annotations), theOrder)); - datum.isNull = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(datum.isNull), theOrder)); - } - logger.info("Annotations are now ordered by their information content: " + annotationKeys.toString()); - } - - /** - * Get a list of indices which give the ascending sort order of the data array - * @param inputVector the data to consider - * @return a non-null list of integers with length matching the length of the input array - */ - protected List calculateSortOrder(final double[] inputVector) { - final List theOrder = new ArrayList<>(inputVector.length); - final List toBeSorted = new ArrayList<>(inputVector.length); - int count = 
0; - for( int iii = 0; iii < inputVector.length; iii++ ) { - toBeSorted.add(new MyDoubleForSorting(-1.0 * Math.abs(inputVector[iii] - mean(iii, false)), count++)); - } - Collections.sort(toBeSorted); - for( final MyDoubleForSorting d : toBeSorted ) { - theOrder.add(d.originalIndex); // read off the sort order by looking at the index field - } - return theOrder; - } - - // small private class to assist in reading off the new ordering of the annotation array - private class MyDoubleForSorting implements Comparable { - final Double myData; - final int originalIndex; - - public MyDoubleForSorting(final double myData, final int originalIndex) { - this.myData = myData; - this.originalIndex = originalIndex; - } - - @Override - public int compareTo(final MyDoubleForSorting other) { - return myData.compareTo(other.myData); - } - } - - /** - * Convenience connector method to work with arrays instead of lists. See ##reorderList## - */ - private T[] reorderArray(final T[] data, final List order) { - return reorderList(Arrays.asList(data), order).toArray(data); - } - - /** - * Reorder the given data list to be in the specified order - * @param data the data to reorder - * @param order the new order to use - * @return a reordered list of data - */ - private List reorderList(final List data, final List order) { - final List returnList = new ArrayList<>(data.size()); - for( final int index : order ) { - returnList.add( data.get(index) ); - } - return returnList; - } - - /** - * Convert a normalized point to it's original annotation value - * - * norm = (orig - mu) / sigma - * orig = norm * sigma + mu - * - * @param normalizedValue the normalized value of the ith annotation - * @param annI the index of the annotation value - * @return the denormalized value for the annotation - */ - public double denormalizeDatum(final double normalizedValue, final int annI) { - final double mu = meanVector[annI]; - final double sigma = varianceVector[annI]; - return normalizedValue * sigma + mu; - 
} - - public void addTrainingSet( final TrainingSet trainingSet ) { - trainingSets.add( trainingSet ); - } - - public List getAnnotationKeys() { - return annotationKeys; - } - - public boolean checkHasTrainingSet() { - for( final TrainingSet trainingSet : trainingSets ) { - if( trainingSet.isTraining ) { return true; } - } - return false; - } - - public boolean checkHasTruthSet() { - for( final TrainingSet trainingSet : trainingSets ) { - if( trainingSet.isTruth ) { return true; } - } - return false; - } - - public List getTrainingData() { - final List trainingData = new ExpandingArrayList<>(); - for( final VariantDatum datum : data ) { - if( datum.atTrainingSite && !datum.failingSTDThreshold ) { - trainingData.add( datum ); - } - } - logger.info( "Training with " + trainingData.size() + " variants after standard deviation thresholding." ); - if( trainingData.size() < VRAC.MIN_NUM_BAD_VARIANTS ) { - logger.warn( "WARNING: Training with very few variant sites! Please check the model reporting PDF to ensure the quality of the model is reliable." ); - } else if( trainingData.size() > VRAC.MAX_NUM_TRAINING_DATA ) { - logger.warn( "WARNING: Very large training set detected. Downsampling to " + VRAC.MAX_NUM_TRAINING_DATA + " training variants." ); - Collections.shuffle(trainingData); - return trainingData.subList(0, VRAC.MAX_NUM_TRAINING_DATA); - } - return trainingData; - } - - public List selectWorstVariants() { - final List trainingData = new ExpandingArrayList<>(); - - for( final VariantDatum datum : data ) { - if( datum != null && !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) && datum.lod < VRAC.BAD_LOD_CUTOFF ) { - datum.atAntiTrainingSite = true; - trainingData.add( datum ); - } - } - - logger.info( "Training with worst " + trainingData.size() + " scoring variants --> variants with LOD <= " + String.format("%.4f", VRAC.BAD_LOD_CUTOFF) + "." 
); - - return trainingData; - } - - public List getEvaluationData() { - final List evaluationData = new ExpandingArrayList<>(); - - for( final VariantDatum datum : data ) { - if( datum != null && !datum.failingSTDThreshold && !datum.atTrainingSite && !datum.atAntiTrainingSite ) { - evaluationData.add( datum ); - } - } - - return evaluationData; - } - - public List getRandomDataForPlotting( final int numToAdd, final List trainingData, final List antiTrainingData, final List evaluationData ) { - final List returnData = new ExpandingArrayList<>(); - Collections.shuffle(trainingData); - Collections.shuffle(antiTrainingData); - Collections.shuffle(evaluationData); - returnData.addAll(trainingData.subList(0, Math.min(numToAdd, trainingData.size()))); - returnData.addAll(antiTrainingData.subList(0, Math.min(numToAdd, antiTrainingData.size()))); - returnData.addAll(evaluationData.subList(0, Math.min(numToAdd, evaluationData.size()))); - Collections.shuffle(returnData); - return returnData; - } - - protected double mean( final int index, final boolean trainingData ) { - double sum = 0.0; - int numNonNull = 0; - for( final VariantDatum datum : data ) { - if( (trainingData == datum.atTrainingSite) && !datum.isNull[index] ) { sum += datum.annotations[index]; numNonNull++; } - } - return sum / ((double) numNonNull); - } - - protected double standardDeviation( final double mean, final int index, final boolean trainingData ) { - double sum = 0.0; - int numNonNull = 0; - for( final VariantDatum datum : data ) { - if( (trainingData == datum.atTrainingSite) && !datum.isNull[index] ) { sum += ((datum.annotations[index] - mean)*(datum.annotations[index] - mean)); numNonNull++; } - } - return Math.sqrt( sum / ((double) numNonNull) ); - } - - public void decodeAnnotations( final VariantDatum datum, final VariantContext vc, final boolean jitter ) { - final double[] annotations = new double[annotationKeys.size()]; - final boolean[] isNull = new boolean[annotationKeys.size()]; - int iii = 
0; - for( final String key : annotationKeys ) { - isNull[iii] = false; - annotations[iii] = decodeAnnotation( key, vc, jitter ); - if( Double.isNaN(annotations[iii]) ) { isNull[iii] = true; } - iii++; - } - datum.annotations = annotations; - datum.isNull = isNull; - } - - private static double decodeAnnotation( final String annotationKey, final VariantContext vc, final boolean jitter ) { - double value; - - try { - value = vc.getAttributeAsDouble( annotationKey, Double.NaN ); - if( Double.isInfinite(value) ) { value = Double.NaN; } - if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } - if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } - if( jitter && annotationKey.equalsIgnoreCase("InbreedingCoeff") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } - } catch( Exception e ) { - value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model - } - - return value; - } - - public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC ) { - datum.isKnown = false; - datum.atTruthSite = false; - datum.atTrainingSite = false; - datum.atAntiTrainingSite = false; - datum.prior = 2.0; - - for( final TrainingSet trainingSet : trainingSets ) { - for( final VariantContext trainVC : tracker.getValues(trainingSet.rodBinding, genomeLoc) ) { - if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.isKnown = datum.isKnown || trainingSet.isKnown; - datum.atTruthSite = datum.atTruthSite || trainingSet.isTruth; - 
datum.atTrainingSite = datum.atTrainingSite || trainingSet.isTraining; - datum.prior = Math.max( datum.prior, trainingSet.prior ); - datum.consensusCount += ( trainingSet.isConsensus ? 1 : 0 ); - } - if( trainVC != null ) { - datum.atAntiTrainingSite = datum.atAntiTrainingSite || trainingSet.isAntiTraining; - } - } - } - } - - private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) { - return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && checkVariationClass( evalVC, trainVC ) && - (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples()); - } - - protected static boolean checkVariationClass( final VariantContext evalVC, final VariantContext trainVC ) { - switch( trainVC.getType() ) { - case SNP: - case MNP: - return checkVariationClass( evalVC, VariantRecalibratorArgumentCollection.Mode.SNP ); - case INDEL: - case MIXED: - case SYMBOLIC: - return checkVariationClass( evalVC, VariantRecalibratorArgumentCollection.Mode.INDEL ); - default: - return false; - } - } - - protected static boolean checkVariationClass( final VariantContext evalVC, final VariantRecalibratorArgumentCollection.Mode mode ) { - switch( mode ) { - case SNP: - return evalVC.isSNP() || evalVC.isMNP(); - case INDEL: - return evalVC.isStructuralIndel() || evalVC.isIndel() || evalVC.isMixed() || evalVC.isSymbolic(); - case BOTH: - return true; - default: - throw new ReviewedStingException( "Encountered unknown recal mode: " + mode ); - } - } - - public void writeOutRecalibrationTable( final VariantContextWriter recalWriter ) { - // we need to sort in coordinate order in order to produce a valid VCF - Collections.sort( data, new Comparator() { - public int compare(VariantDatum vd1, VariantDatum vd2) { - return vd1.loc.compareTo(vd2.loc); - }} ); - - // create dummy alleles to be used - final List alleles = Arrays.asList(Allele.create("N", true), Allele.create("", false)); - - for( 
final VariantDatum datum : data ) { - VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles); - builder.attribute(VCFConstants.END_KEY, datum.loc.getStop()); - builder.attribute(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod)); - builder.attribute(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")); - - if ( datum.atTrainingSite ) builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true); - if ( datum.atAntiTrainingSite ) builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true); - - recalWriter.add(builder.make()); - } - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java deleted file mode 100644 index 905c97df4..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java +++ /dev/null @@ -1,85 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. 
NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.variantrecalibration; - -import org.broadinstitute.sting.utils.GenomeLoc; - -import java.io.Serializable; -import java.util.Comparator; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Mar 4, 2011 - */ - -public class VariantDatum { - - public double[] annotations; - public boolean[] isNull; - public boolean isKnown; - public double lod; - public boolean atTruthSite; - public boolean atTrainingSite; - public boolean atAntiTrainingSite; - public boolean isTransition; - public boolean isSNP; - public boolean failingSTDThreshold; - public double originalQual; - public double prior; - public int consensusCount; - public GenomeLoc loc; - public int worstAnnotation; - public MultivariateGaussian assignment; // used in K-means implementation - - public static class VariantDatumLODComparator implements Comparator, Serializable { - @Override - public int compare(final VariantDatum datum1, final VariantDatum datum2) { - return Double.compare(datum1.lod, datum2.lod); - } - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java deleted file mode 100644 index 1c32b852b..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ /dev/null @@ -1,533 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.variantrecalibration; - -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.PartitionBy; -import org.broadinstitute.sting.gatk.walkers.PartitionType; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.R.RScriptExecutor; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.variant.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.collections.ExpandingArrayList; -import org.broadinstitute.sting.utils.exceptions.UserException; -import 
org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.io.Resource; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; -import java.util.*; - -/** - * Create a Gaussian mixture model by looking at the annotations values over a high quality subset of the input call set and then evaluate all input variants. - * - *

- * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with the ApplyRecalibration walker. - *

- * - *

- * The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set. - * You can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call. - * The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship - * between SNP call annotations (QD, MQ, HaplotypeScore, and ReadPosRankSum, for example) and the probability that a SNP is a true genetic - * variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided - * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive - * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the - * probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is - * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. - *

- * - *

Inputs

- *

- * The input raw variants to be recalibrated. - *

- * Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below. - * - *

Output

- *

- * A recalibration table file in VCF format that is used by the ApplyRecalibration walker. - *

- * A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. - * - *

Example

- *
- * java -Xmx4g -jar GenomeAnalysisTK.jar \
- *   -T VariantRecalibrator \
- *   -R reference/human_g1k_v37.fasta \
- *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.subset.b37.vcf \
- *   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
- *   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
- *   -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_135.b37.vcf \
- *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an InbreedingCoeff \
- *   -mode SNP \
- *   -recalFile path/to/output.recal \
- *   -tranchesFile path/to/output.tranches \
- *   -rscriptFile path/to/output.plots.R
- * 
- * - *

Caveat

- * - *
    - *
  • The values used in the example above are only meant to show how the command lines are composed. - * They are not meant to be taken as specific recommendations of values to use in your own work, and they may be - * different from the values cited elsewhere in our documentation. For the latest and greatest recommendations on - * how to set parameter values for you own analyses, please read the Best Practices section of the documentation.
  • - * - *
  • In order to create the model reporting plots Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version). - * See http://www.r-project.org for more info on how to download and install R.
  • - *
- */ - -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) -@PartitionBy(PartitionType.NONE) -public class VariantRecalibrator extends RodWalker, ExpandingArrayList> implements TreeReducible> { - - public static final String VQS_LOD_KEY = "VQSLOD"; // Log odds ratio of being a true variant versus being false under the trained gaussian mixture model - public static final String CULPRIT_KEY = "culprit"; // The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out - public static final String NEGATIVE_LABEL_KEY = "NEGATIVE_TRAIN_SITE"; // this variant was used in the negative training set - public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE"; // this variant was used in the positive training set - private static final String PLOT_TRANCHES_RSCRIPT = "plot_Tranches.R"; - - @ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); - - ///////////////////////////// - // Inputs - ///////////////////////////// - /** - * These calls should be unfiltered and annotated with the error covariates that are intended to use for modeling. - */ - @Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true) - public List> input; - - /** - * Any set of VCF files to use as lists of training, truth, or known sites. - * Training - Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model. - * Truth - When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used. - * Known - The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes. 
- * Bad - In addition to using the set of worst ranked variants as compared to the Gaussian mixture model (see -numBad argument), we can also supplement the list with a database of known bad variants. - */ - @Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm (training and truth sets are required to run)", required=true) - public List> resource = Collections.emptyList(); - - ///////////////////////////// - // Outputs - ///////////////////////////// - @Output(fullName="recal_file", shortName="recalFile", doc="The output recal file used by ApplyRecalibration", required=true) - protected VariantContextWriter recalWriter = null; - - @Output(fullName="tranches_file", shortName="tranchesFile", doc="The output tranches file used by ApplyRecalibration", required=true) - protected File TRANCHES_FILE; - - ///////////////////////////// - // Additional Command Line Arguments - ///////////////////////////// - /** - * The expected transition / transversion ratio of true novel variants in your targeted region (whole genome, exome, specific - * genes), which varies greatly by the CpG and GC content of the region. See expected Ti/Tv ratios section of the GATK best - * practices documentation (http://www.broadinstitute.org/gatk/guide/best-practices) for more information. - * Normal values are 2.15 for human whole genome values and 3.2 for human whole exomes. Note - * that this parameter is used for display purposes only and isn't used anywhere in the algorithm! - */ - @Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on the optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false) - protected double TARGET_TITV = 2.15; - - /** - * See the input VCF file's INFO field for a list of all available annotations. 
- */ - @Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true) - private String[] USE_ANNOTATIONS = null; - - /** - * Add truth sensitivity slices through the call set at the given values. The default values are 100.0, 99.9, 99.0, and 90.0 - * which will result in 4 estimated tranches in the final call set: the full set of calls (100% sensitivity at the accessible - * sites in the truth set), a 99.9% truth sensitivity tranche, along with progressively smaller tranches at 99% and 90%. - */ - @Argument(fullName="TStranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false) - private double[] TS_TRANCHES = new double[] {100.0, 99.9, 99.0, 90.0}; - /** - * For this to work properly, the -ignoreFilter argument should also be applied to the ApplyRecalibration command. - */ - @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified, the variant recalibrator will also use variants marked as filtered by the specified filter name in the input VCF file", required=false) - private String[] IGNORE_INPUT_FILTERS = null; - @Output(fullName="rscript_file", shortName="rscriptFile", doc="The output rscript file generated by the VQSR to aid in visualization of the input data and learned model", required=false, defaultToStdout=false) - private File RSCRIPT_FILE = null; - - @Hidden - @Argument(fullName="replicate", shortName="replicate", doc="Used to debug the random number generation inside the VQSR. 
Do not use.", required=false) - protected int REPLICATE = 200; - private ArrayList replicate = new ArrayList<>(); - - ///////////////////////////// - // Debug Arguments - ///////////////////////////// - @Advanced - @Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false) - protected Boolean TRUST_ALL_POLYMORPHIC = false; - - ///////////////////////////// - // Private Member Variables - ///////////////////////////// - private VariantDataManager dataManager; - private PrintStream tranchesStream; - private final Set ignoreInputFilterSet = new TreeSet<>(); - private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC ); - - //--------------------------------------------------------------------------------------------------------------- - // - // initialize - // - //--------------------------------------------------------------------------------------------------------------- - - @Override - public void initialize() { - dataManager = new VariantDataManager( new ArrayList<>(Arrays.asList(USE_ANNOTATIONS)), VRAC ); - - if (RSCRIPT_FILE != null && !RScriptExecutor.RSCRIPT_EXISTS) - Utils.warnUser(logger, String.format( - "Rscript not found in environment path. %s will be generated but PDF plots will not.", - RSCRIPT_FILE)); - - if( IGNORE_INPUT_FILTERS != null ) { - ignoreInputFilterSet.addAll( Arrays.asList(IGNORE_INPUT_FILTERS) ); - } - - try { - tranchesStream = new PrintStream(TRANCHES_FILE); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(TRANCHES_FILE, e); - } - - for( RodBinding rod : resource ) { - dataManager.addTrainingSet( new TrainingSet( rod ) ); - } - - if( !dataManager.checkHasTrainingSet() ) { - throw new UserException.CommandLineException( "No training set found! 
Please provide sets of known polymorphic loci marked with the training=true ROD binding tag. For example, -resource:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" ); - } - if( !dataManager.checkHasTruthSet() ) { - throw new UserException.CommandLineException( "No truth set found! Please provide sets of known polymorphic loci marked with the truth=true ROD binding tag. For example, -resource:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" ); - } - - - final Set hInfo = new HashSet<>(); - ApplyRecalibration.addVQSRStandardHeaderLines(hInfo); - recalWriter.writeHeader( new VCFHeader(hInfo) ); - - for( int iii = 0; iii < REPLICATE * 2; iii++ ) { - replicate.add(GenomeAnalysisEngine.getRandomGenerator().nextDouble()); - } - } - - //--------------------------------------------------------------------------------------------------------------- - // - // map - // - //--------------------------------------------------------------------------------------------------------------- - - @Override - public ExpandingArrayList map( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { - final ExpandingArrayList mapList = new ExpandingArrayList<>(); - - if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers - return mapList; - } - - for( final VariantContext vc : tracker.getValues(input, context.getLocation()) ) { - if( vc != null && ( vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters()) ) ) { - if( VariantDataManager.checkVariationClass( vc, VRAC.MODE ) ) { - final VariantDatum datum = new VariantDatum(); - - // Populate the datum with lots of fields from the VariantContext, unfortunately the VC is too big so we just pull in only the things we absolutely need. 
- dataManager.decodeAnnotations( datum, vc, true ); //BUGBUG: when run with HierarchicalMicroScheduler this is non-deterministic because order of calls depends on load of machine - datum.loc = getToolkit().getGenomeLocParser().createGenomeLoc(vc); - datum.originalQual = vc.getPhredScaledQual(); - datum.isSNP = vc.isSNP() && vc.isBiallelic(); - datum.isTransition = datum.isSNP && GATKVariantContextUtils.isTransition(vc); - - // Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately - dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC ); - final double priorFactor = QualityUtils.qualToProb( datum.prior ); - datum.prior = Math.log10( priorFactor ) - Math.log10( 1.0 - priorFactor ); - - mapList.add( datum ); - } - } - } - - return mapList; - } - - //--------------------------------------------------------------------------------------------------------------- - // - // reduce - // - //--------------------------------------------------------------------------------------------------------------- - - @Override - public ExpandingArrayList reduceInit() { - return new ExpandingArrayList<>(); - } - - @Override - public ExpandingArrayList reduce( final ExpandingArrayList mapValue, final ExpandingArrayList reduceSum ) { - reduceSum.addAll( mapValue ); - return reduceSum; - } - - @Override - public ExpandingArrayList treeReduce( final ExpandingArrayList lhs, final ExpandingArrayList rhs ) { - rhs.addAll( lhs ); - return rhs; - } - - //--------------------------------------------------------------------------------------------------------------- - // - // on traversal done - // - //--------------------------------------------------------------------------------------------------------------- - - @Override - public void onTraversalDone( final ExpandingArrayList reduceSum ) { - dataManager.setData( reduceSum ); - dataManager.normalizeData(); // Each data point is now (x 
- mean) / standard deviation - - // Generate the positive model using the training data and evaluate each variant - final List positiveTrainingData = dataManager.getTrainingData(); - final GaussianMixtureModel goodModel = engine.generateModel( positiveTrainingData, VRAC.MAX_GAUSSIANS ); - engine.evaluateData( dataManager.getData(), goodModel, false ); - - // Generate the negative model using the worst performing data and evaluate each variant contrastively - final List negativeTrainingData = dataManager.selectWorstVariants(); - final GaussianMixtureModel badModel = engine.generateModel( negativeTrainingData, Math.min(VRAC.MAX_GAUSSIANS_FOR_NEGATIVE_MODEL, VRAC.MAX_GAUSSIANS)); - engine.evaluateData( dataManager.getData(), badModel, true ); - - if( badModel.failedToConverge || goodModel.failedToConverge ) { - throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider " + (badModel.failedToConverge ? "raising the number of variants used to train the negative model (via --minNumBadVariants 5000, for example)." : "lowering the maximum number of Gaussians allowed for use in the model (via --maxGaussians 4, for example).") ); - } - - engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel ); - - // Find the VQSLOD cutoff values which correspond to the various tranches of calls requested by the user - final int nCallsAtTruth = TrancheManager.countCallsAtTruth( dataManager.getData(), Double.NEGATIVE_INFINITY ); - final TrancheManager.SelectionMetric metric = new TrancheManager.TruthSensitivityMetric( nCallsAtTruth ); - final List tranches = TrancheManager.findTranches( dataManager.getData(), TS_TRANCHES, metric, VRAC.MODE ); - tranchesStream.print(Tranche.tranchesString( tranches )); - - logger.info( "Writing out recalibration table..." 
); - dataManager.writeOutRecalibrationTable( recalWriter ); - if( RSCRIPT_FILE != null ) { - logger.info( "Writing out visualization Rscript file..."); - createVisualizationScript( dataManager.getRandomDataForPlotting( 1000, positiveTrainingData, negativeTrainingData, dataManager.getEvaluationData() ), goodModel, badModel, 0.0, dataManager.getAnnotationKeys().toArray(new String[USE_ANNOTATIONS.length]) ); - } - - if(VRAC.MODE == VariantRecalibratorArgumentCollection.Mode.INDEL) { - // Print out an info message to make it clear why the tranches plot is not generated - logger.info("Tranches plot will not be generated since we are running in INDEL mode"); - } else { - // Execute the RScript command to plot the table of truth values - RScriptExecutor executor = new RScriptExecutor(); - executor.addScript(new Resource(PLOT_TRANCHES_RSCRIPT, VariantRecalibrator.class)); - executor.addArgs(TRANCHES_FILE.getAbsoluteFile(), TARGET_TITV); - // Print out the command line to make it clear to the user what is being executed and how one might modify it - logger.info("Executing: " + executor.getApproximateCommandLine()); - executor.exec(); - } - } - - private void createVisualizationScript( final List randomData, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel, final double lodCutoff, final String[] annotationKeys ) { - PrintStream stream; - try { - stream = new PrintStream(RSCRIPT_FILE); - } catch( FileNotFoundException e ) { - throw new UserException.CouldNotCreateOutputFile(RSCRIPT_FILE, e); - } - - // We make extensive use of the ggplot2 R library: http://had.co.nz/ggplot2/ - stream.println("library(ggplot2)"); - // For compactPDF in R 2.13+ - stream.println("library(tools)"); - // For graphical functions R 2.14.2+ - stream.println("library(grid)"); - - createArrangeFunction( stream ); - - stream.println("outputPDF <- \"" + RSCRIPT_FILE + ".pdf\""); - stream.println("pdf(outputPDF)"); // Unfortunately this is a huge pdf file, BUGBUG: need to work on 
reducing the file size - - for(int iii = 0; iii < annotationKeys.length; iii++) { - for( int jjj = iii + 1; jjj < annotationKeys.length; jjj++) { - logger.info( "Building " + annotationKeys[iii] + " x " + annotationKeys[jjj] + " plot..."); - - final List fakeData = new ExpandingArrayList<>(); - double minAnn1 = 100.0, maxAnn1 = -100.0, minAnn2 = 100.0, maxAnn2 = -100.0; - for( final VariantDatum datum : randomData ) { - minAnn1 = Math.min(minAnn1, datum.annotations[iii]); - maxAnn1 = Math.max(maxAnn1, datum.annotations[iii]); - minAnn2 = Math.min(minAnn2, datum.annotations[jjj]); - maxAnn2 = Math.max(maxAnn2, datum.annotations[jjj]); - } - // Create a fake set of data which spans the full extent of these two annotation dimensions in order to calculate the model PDF projected to 2D - final double NUM_STEPS = 60.0; - for(double ann1 = minAnn1; ann1 <= maxAnn1; ann1+= (maxAnn1 - minAnn1) / NUM_STEPS) { - for(double ann2 = minAnn2; ann2 <= maxAnn2; ann2+= (maxAnn2 - minAnn2) / NUM_STEPS) { - final VariantDatum datum = new VariantDatum(); - datum.prior = 0.0; - datum.annotations = new double[randomData.get(0).annotations.length]; - datum.isNull = new boolean[randomData.get(0).annotations.length]; - for(int ann=0; ann< datum.annotations.length; ann++) { - datum.annotations[ann] = 0.0; - datum.isNull[ann] = true; - } - datum.annotations[iii] = ann1; - datum.annotations[jjj] = ann2; - datum.isNull[iii] = false; - datum.isNull[jjj] = false; - fakeData.add(datum); - } - } - - engine.evaluateData( fakeData, goodModel, false ); - engine.evaluateData( fakeData, badModel, true ); - - stream.print("surface <- c("); - for( final VariantDatum datum : fakeData ) { - stream.print(String.format("%.4f, %.4f, %.4f, ", - dataManager.denormalizeDatum(datum.annotations[iii], iii), - dataManager.denormalizeDatum(datum.annotations[jjj], jjj), - Math.min(4.0, Math.max(-4.0, datum.lod)))); - } - stream.println("NA,NA,NA)"); - stream.println("s <- matrix(surface,ncol=3,byrow=T)"); - - 
stream.print("data <- c("); - for( final VariantDatum datum : randomData ) { - stream.print(String.format("%.4f, %.4f, %.4f, %d, %d,", - dataManager.denormalizeDatum(datum.annotations[iii], iii), - dataManager.denormalizeDatum(datum.annotations[jjj], jjj), - (datum.lod < lodCutoff ? -1.0 : 1.0), - (datum.atAntiTrainingSite ? -1 : (datum.atTrainingSite ? 1 : 0)), (datum.isKnown ? 1 : -1))); - } - stream.println("NA,NA,NA,NA,1)"); - stream.println("d <- matrix(data,ncol=5,byrow=T)"); - - final String surfaceFrame = "sf." + annotationKeys[iii] + "." + annotationKeys[jjj]; - final String dataFrame = "df." + annotationKeys[iii] + "." + annotationKeys[jjj]; - - stream.println(surfaceFrame + " <- data.frame(x=s[,1], y=s[,2], lod=s[,3])"); - stream.println(dataFrame + " <- data.frame(x=d[,1], y=d[,2], retained=d[,3], training=d[,4], novelty=d[,5])"); - stream.println("dummyData <- " + dataFrame + "[1,]"); - stream.println("dummyData$x <- NaN"); - stream.println("dummyData$y <- NaN"); - stream.println("p <- ggplot(data=" + surfaceFrame + ", aes(x=x, y=y)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); - stream.println("p1 = p + opts(title=\"model PDF\") + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + geom_tile(aes(fill = lod)) + scale_fill_gradient(high=\"green\", low=\"red\")"); - stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=retained, alpha=I(1/7),legend=FALSE) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); - stream.println("q <- geom_point(aes(x=x,y=y,color=retained),data=dummyData, alpha=1.0, na.rm=TRUE)"); - stream.println("p2 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(name=\"outcome\", high=\"black\", low=\"red\",breaks=c(-1,1),labels=c(\"filtered\",\"retained\"))"); - 
stream.println("p <- qplot(x,y,data="+ dataFrame + "["+dataFrame+"$training != 0,], color=training, alpha=I(1/7)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); - stream.println("q <- geom_point(aes(x=x,y=y,color=training),data=dummyData, alpha=1.0, na.rm=TRUE)"); - stream.println("p3 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(high=\"green\", low=\"purple\",breaks=c(-1,1), labels=c(\"neg\", \"pos\"))"); - stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=novelty, alpha=I(1/7)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); - stream.println("q <- geom_point(aes(x=x,y=y,color=novelty),data=dummyData, alpha=1.0, na.rm=TRUE)"); - stream.println("p4 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(name=\"novelty\", high=\"blue\", low=\"red\",breaks=c(-1,1), labels=c(\"novel\",\"known\"))"); - stream.println("arrange(p1, p2, p3, p4, ncol=2)"); - } - } - stream.println("dev.off()"); - - stream.println("if (exists(\"compactPDF\")) {"); - stream.println("compactPDF(outputPDF)"); - stream.println("}"); - - stream.close(); - - // Execute Rscript command to generate the clustering plots - RScriptExecutor executor = new RScriptExecutor(); - executor.addScript(RSCRIPT_FILE); - logger.info("Executing: " + executor.getApproximateCommandLine()); - executor.exec(); - } - - // The Arrange function is how we place the 4 model plots on one page - // from http://gettinggeneticsdone.blogspot.com/2010/03/arrange-multiple-ggplot2-plots-in-same.html - private void createArrangeFunction( final PrintStream stream ) { - stream.println("vp.layout <- function(x, y) viewport(layout.pos.row=x, layout.pos.col=y)"); - stream.println("arrange <- function(..., nrow=NULL, ncol=NULL, 
as.table=FALSE) {"); - stream.println("dots <- list(...)"); - stream.println("n <- length(dots)"); - stream.println("if(is.null(nrow) & is.null(ncol)) { nrow = floor(n/2) ; ncol = ceiling(n/nrow)}"); - stream.println("if(is.null(nrow)) { nrow = ceiling(n/ncol)}"); - stream.println("if(is.null(ncol)) { ncol = ceiling(n/nrow)}"); - stream.println("grid.newpage()"); - stream.println("pushViewport(viewport(layout=grid.layout(nrow,ncol) ) )"); - stream.println("ii.p <- 1"); - stream.println("for(ii.row in seq(1, nrow)){"); - stream.println("ii.table.row <- ii.row "); - stream.println("if(as.table) {ii.table.row <- nrow - ii.table.row + 1}"); - stream.println("for(ii.col in seq(1, ncol)){"); - stream.println("ii.table <- ii.p"); - stream.println("if(ii.p > n) break"); - stream.println("print(dots[[ii.table]], vp=vp.layout(ii.table.row, ii.col))"); - stream.println("ii.p <- ii.p + 1"); - stream.println("}"); - stream.println("}"); - stream.println("}"); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java deleted file mode 100644 index b501655f8..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java +++ /dev/null @@ -1,120 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.variantrecalibration; - -import org.broadinstitute.sting.commandline.Advanced; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Mar 4, 2011 - */ - -public class VariantRecalibratorArgumentCollection { - - public enum Mode { - SNP, - INDEL, - BOTH - } - - static Mode parseString(final String input) { - if( input.equals("SNP") ) { return Mode.SNP; } - if( input.equals("INDEL") ) { return Mode.INDEL; } - if( input.equals("BOTH") ) { return Mode.BOTH; } - throw new ReviewedStingException("VariantRecalibrator mode string is unrecognized, input = " + input); - } - - @Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels (emitting SNPs untouched in the output VCF); and 3.) 
BOTH for recalibrating both SNPs and indels simultaneously (for testing purposes only, not recommended for general use).", required = false) - public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP; - - @Advanced - @Argument(fullName="maxGaussians", shortName="mG", doc="The maximum number of Gaussians for the positive model to try during variational Bayes algorithm.", required=false) - public int MAX_GAUSSIANS = 8; - - @Advanced - @Argument(fullName="maxNegativeGaussians", shortName="mNG", doc="The maximum number of Gaussians for the negative model to try during variational Bayes algorithm. The actual maximum used is the min of the mG and mNG arguments. Note that this number should be small (like 4) to achieve the best results", required=false) - public int MAX_GAUSSIANS_FOR_NEGATIVE_MODEL = 2; - - @Advanced - @Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of VBEM iterations to be performed in variational Bayes algorithm. 
Procedure will normally end when convergence is detected.", required=false) - public int MAX_ITERATIONS = 150; - - @Advanced - @Argument(fullName="numKMeans", shortName="nKM", doc="The number of k-means iterations to perform in order to initialize the means of the Gaussians in the Gaussian mixture model.", required=false) - public int NUM_KMEANS_ITERATIONS = 100; - - @Advanced - @Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for building the Gaussian mixture model.", required=false) - public double STD_THRESHOLD = 10.0; - - @Advanced - @Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in the variational Bayes algorithm.", required=false) - public double SHRINKAGE = 1.0; - - @Advanced - @Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in the variational Bayes algorithm.", required=false) - public double DIRICHLET_PARAMETER = 0.001; - - @Advanced - @Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in the variational Bayes algorithm.", required=false) - public double PRIOR_COUNTS = 20.0; - - @Advanced - @Argument(fullName="maxNumTrainingData", shortName="maxNumTrainingData", doc="Maximum number of training data to be used in building the Gaussian mixture model. 
Training sets large than this will be randomly downsampled.", required=false) - protected int MAX_NUM_TRAINING_DATA = 2500000; - - @Advanced - @Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum number of worst scoring variants to use when building the Gaussian mixture model of bad variants.", required=false) - public int MIN_NUM_BAD_VARIANTS = 1000; - - @Advanced - @Argument(fullName="badLodCutoff", shortName="badLodCutoff", doc="The LOD score below which to be used when building the Gaussian mixture model of bad variants.", required=false) - public double BAD_LOD_CUTOFF = -5.0; -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java deleted file mode 100644 index 3828e6e20..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java +++ /dev/null @@ -1,169 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. 
NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.variantrecalibration; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; - -import java.util.List; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Mar 4, 2011 - */ - -public class VariantRecalibratorEngine { - - ///////////////////////////// - // Private Member Variables - ///////////////////////////// - - protected final static Logger logger = Logger.getLogger(VariantRecalibratorEngine.class); - public final static double MIN_ACCEPTABLE_LOD_SCORE = -20000.0; - - // the unified argument collection - final private VariantRecalibratorArgumentCollection VRAC; - - private final static double MIN_PROB_CONVERGENCE = 2E-3; - - ///////////////////////////// - // Public Methods to interface with the Engine - ///////////////////////////// - - public VariantRecalibratorEngine( final VariantRecalibratorArgumentCollection VRAC ) { - this.VRAC = VRAC; - } - - public GaussianMixtureModel generateModel( final List data, final int maxGaussians ) { - final GaussianMixtureModel model = new GaussianMixtureModel( maxGaussians, data.get(0).annotations.length, VRAC.SHRINKAGE, VRAC.DIRICHLET_PARAMETER, VRAC.PRIOR_COUNTS ); - variationalBayesExpectationMaximization( model, data ); - return model; - } - - public void evaluateData( final List data, final GaussianMixtureModel model, final boolean evaluateContrastively ) { - if( !model.isModelReadyForEvaluation ) { - try { - model.precomputeDenominatorForEvaluation(); - } catch( Exception e ) { - model.failedToConverge = true; - return; - } - } - - logger.info("Evaluating full set of " + data.size() + " variants..."); - for( final VariantDatum datum : data ) { - final double thisLod = evaluateDatum( datum, model ); - if( Double.isNaN(thisLod) ) { - model.failedToConverge = true; - return; - } - - datum.lod = ( evaluateContrastively ? - ( Double.isInfinite(datum.lod) ? 
// positive model said negative infinity - ( MIN_ACCEPTABLE_LOD_SCORE + GenomeAnalysisEngine.getRandomGenerator().nextDouble() * MIN_ACCEPTABLE_LOD_SCORE ) // Negative infinity lod values are possible when covariates are extremely far away from their tight Gaussians - : datum.prior + datum.lod - thisLod) // contrastive evaluation: (prior + positive model - negative model) - : thisLod ); // positive model only so set the lod and return - } - } - - public void calculateWorstPerformingAnnotation( final List data, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel ) { - for( final VariantDatum datum : data ) { - int worstAnnotation = -1; - double minProb = Double.MAX_VALUE; - for( int iii = 0; iii < datum.annotations.length; iii++ ) { - final Double goodProbLog10 = goodModel.evaluateDatumInOneDimension(datum, iii); - final Double badProbLog10 = badModel.evaluateDatumInOneDimension(datum, iii); - if( goodProbLog10 != null && badProbLog10 != null ) { - final double prob = goodProbLog10 - badProbLog10; - if(prob < minProb) { minProb = prob; worstAnnotation = iii; } - } - } - datum.worstAnnotation = worstAnnotation; - } - } - - - ///////////////////////////// - // Private Methods used for generating a GaussianMixtureModel - ///////////////////////////// - - private void variationalBayesExpectationMaximization( final GaussianMixtureModel model, final List data ) { - - model.initializeRandomModel( data, VRAC.NUM_KMEANS_ITERATIONS ); - - // The VBEM loop - model.normalizePMixtureLog10(); - model.expectationStep( data ); - double currentChangeInMixtureCoefficients; - int iteration = 0; - logger.info("Finished iteration " + iteration + "."); - while( iteration < VRAC.MAX_ITERATIONS ) { - iteration++; - model.maximizationStep( data ); - currentChangeInMixtureCoefficients = model.normalizePMixtureLog10(); - model.expectationStep( data ); - if( iteration % 5 == 0 ) { // cut down on the number of output lines so that users can read the warning messages - 
logger.info("Finished iteration " + iteration + ". \tCurrent change in mixture coefficients = " + String.format("%.5f", currentChangeInMixtureCoefficients)); - } - if( iteration > 2 && currentChangeInMixtureCoefficients < MIN_PROB_CONVERGENCE ) { - logger.info("Convergence after " + iteration + " iterations!"); - break; - } - } - - model.evaluateFinalModelParameters( data ); - } - - ///////////////////////////// - // Private Methods used for evaluating data given a GaussianMixtureModel - ///////////////////////////// - - private double evaluateDatum( final VariantDatum datum, final GaussianMixtureModel model ) { - return model.evaluateDatum( datum ); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/utils/collections/CountSet.java b/protected/java/src/org/broadinstitute/sting/utils/collections/CountSet.java deleted file mode 100644 index e1de32bf6..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/collections/CountSet.java +++ /dev/null @@ -1,521 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ -package org.broadinstitute.sting.utils.collections; - -import com.google.java.contract.Requires; -import com.sun.istack.internal.NotNull; - -import java.lang.reflect.Array; -import java.util.*; - -/** - * Efficient implementation for a small set of integer primitive values. - *

- * It includes a increment operation incAll which is convenient when analyzing the read-threading graphs. Nevertheless - * it can be also be used in general purpose. - *

- *

- * It does not provide a O(1) look-up of its elements though. These are kept in a sorted array so look up is implemented - * using a binary search O(log n). Therefore it might not be optimal for problems that require large integer sets. - *

- *

- * Also note that addition can be costly for large sets unless done in order: O(n). - *

- * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class CountSet implements Cloneable, Set { - - /** - * The size of the set. - */ - private int size; - - /** - * Holds the element of the set within the subrange [0 .. size - 1] in ascending order. - */ - private int[] elements; - - /** - * Creates a copy of an existing int-set. - * @param template the intset to copy values from. - */ - public CountSet(final CountSet template) { - elements = template.elements.clone(); - size = template.size; - } - - /** - * Creates a new set indicating the expected maximum number of elements it will contain. - * @param initialCapacity the desired initial capacity of the set. - * @throws IllegalArgumentException if initialCapacity is negative. - */ - public CountSet(int initialCapacity) { - if (initialCapacity < 0) - throw new IllegalArgumentException(); - elements = new int[initialCapacity]; - size = 0; - } - - /** - * Set the set contents to a single integer value. - * @param value the integer value to set the set to. - */ - public void setTo(int value) { - ensureCapacity(1); - size = 1; - elements[0] = value; - } - - /** - * Set the content of this set to a collection of integers. - * @param values the new values to be included in the set. - * @throws NullPointerException if value is null. - */ - public void setTo(int ... values) { - ensureCapacity(values.length); - size = values.length; - System.arraycopy(values, 0, elements, 0, size); - Arrays.sort(elements,0,size); - } - - /** - * Increase (or decrease) all elements in the set by a number. - * @param delta the number of add (or substract if negative) to all elements. - * - * @return true if the set changed as a result of this invocation, false otherwise. - */ - public boolean incAll(final int delta) { - if (size == 0 || delta == 0) - return false; - for (int i = 0; i < size; i++) - elements[i] += delta; - return true; - } - - /** - * Returns the smallest integer value in the set. 
- * - * @throws NoSuchElementException if the set is empty (thus there is no minimum). - * @return the smallest integer value in the set. - */ - public int min() { - if (size == 0) - throw new NoSuchElementException("cannot have a min from an empty set"); - return elements[0]; - } - - /** - * Returns the largest integer value in the set. - * - * @throws NoSuchElementException if the set is empty (thus there is no maximum). - * @return the largest integer value in the set. - */ - public int max() { - if (size == 0) - throw new NoSuchElementException("cannot have a max from an empty set"); - return elements[size - 1]; - } - - /** - * Adds a range of integer values to the collection. - * - * This method avoid the need to explicity indicate all values in that range. Notice that the range is fully inclusive. - * You can indicate a decrease range (fromValue > toValue). - * - * @param fromValue the first value to add in the set (inclusive). - * @param toValue the last value to add to the set (inclusive). - * @return true if the set changed as a result of this invocation, false otherwise. - */ - public boolean addRange(final int fromValue, final int toValue) { - final int lowEnd; - final int highEnd; - - if (fromValue <= toValue) { - lowEnd = fromValue; highEnd = toValue; - } else { - highEnd = fromValue; lowEnd = toValue; - } - - //TODO to be optimized to add missing sub-ranges in one go: - boolean result = false; - for (int i = lowEnd; i <= highEnd; i++) - result = add(i) | result; - return result; - } - - /** - * Add an integer value to the set. - * @param value to add to the set. - * @return true if the set changed as a result of this invocation, false otherwise. 
- */ - public boolean add(final int value) { - int pos = Arrays.binarySearch(elements,0,size,value); - if (pos >= 0) return false; - int insertPos = - pos - 1; - ensureCapacity(size + 1); - System.arraycopy(elements, insertPos, elements, insertPos + 1, size - insertPos); - elements[insertPos] = value; - size++; - return true; - } - - /** - * Add a arbitrary number of integers to the set. - * - * @param values integer to add to the set. - * @return true if the set changed as a result of this invocation, false otherwise. - */ - public boolean addAll(final int ... values) { - ensureCapacity(size + values.length); - boolean result = false; - for (final int v : values) - result = add(v) | result; - return result; - } - - @Override - public boolean addAll(final Collection numbers) { - ensureCapacity(size + numbers.size()); - boolean result = false; - for (final Number n : numbers) - result = add(n.intValue()) | result; - return result; - } - - /** - * Add all values within a range in an integer array. - * - * @param source array where the values to add are found. - * @param fromIndex first position from source to add (inclusive). - * @param toIndex index after the last position in source to add (thus exclusive). - * @throws NullPointerException if source is null. - * @throws NegativeArraySizeException if fromIndex or toIndex are negative. - * @throws ArrayIndexOutOfBoundsException if fromIndex or toIndex are beyond bounds - * allowed [0 .. source.length]. - * @return true if the set changed as a result of this invocation, false otherwise. - */ - public boolean addAll(final int[] source, final int fromIndex, final int toIndex) { - ensureCapacity(size + source.length); - boolean result = false; - for (int i = fromIndex; i < toIndex; i++) - result = add(source[i]) | result; - return result; - } - - - /** - * Add all elements present in a int-set. - * - * @param other the other inset. - * - * @throws NullPointerException if other is null. 
- * @return true if this set changed due to this operation, false otherwise. - */ - public boolean addAll(final CountSet other) { - return addAll(other.elements,0,other.size); - } - - /** - * Checks whether a integer value is included in the set. - * @param value the value to check. - * @return true if value is inside the set, false otherwise. - */ - public boolean contains(final int value) { - return Arrays.binarySearch(elements,0,size,value) >= 0; - } - - /** - * Make sure that this int-set has capacity to handle a number of elements. - *

- * If the set has already that or greater capacity nothing would be changed. - * - * @param capacity the requested capacity. - */ - private void ensureCapacity(final int capacity) { - if (elements.length >= capacity) return; - int newLength = Math.max(elements.length << 1, capacity); - elements = Arrays.copyOf(elements,newLength); - } - - - @Override - public int size() { - return size; - } - - @Override - public boolean isEmpty() { - return size() == 0; - } - - @Override - public boolean contains(final Object o) { - if (o instanceof Integer) { - final int i = (Integer)o; - return contains(i); - } else - return false; //To change body of implemented methods use File | Settings | File Templates. - } - - @Override - @NotNull - public Iterator iterator() { - return new MyIterator(); - } - - @Override - @NotNull - public Object[] toArray() { - final Integer[] result = new Integer[size]; - for (int i = 0; i < size; i++) - result[i] = elements[i]; - return result; - } - - @Override - @NotNull - @SuppressWarnings("unchecked") - public T[] toArray(final T[] a) { - if (a == null) - throw new NullPointerException(); - - @SuppressWarnings("unchecked") - final Class componentClass = (Class) a.getClass().getComponentType(); - if (!componentClass.isAssignableFrom(Integer.class)) - throw new ArrayStoreException(); - - @SuppressWarnings("unchecked") - final T[] dest = (a.length < size) ? (T[]) (Object[]) Array.newInstance(componentClass, size) : a; - - for (int i = 0; i < size; i++) - dest[i] = (T) (Integer) elements[i]; - return dest; - } - - /** - * Copies the content of the set into an integer array. The result can be freely modified by the invoker. - * @return never null but a zero-length array if the set is empty. - */ - @NotNull - public int[] toIntArray() { - return Arrays.copyOfRange(elements,0,size); - } - - /** - * Copy the content of the set into an array. - * @param dest the destination array. - * @param offset where to store the first element of the set. 
- * @throws NullPointerException if dest is null. - * @throws ArrayIndexOutOfBoundsException if offset is out of range of there is not enough - * space after offset in the destination array to hold all values in the set. - */ - public void copyTo(final int[] dest, int offset) { - if (dest == null) - throw new NullPointerException(); - if (dest.length < (size + offset)) - throw new ArrayIndexOutOfBoundsException("destination is to short"); - System.arraycopy(elements,0,dest,offset,size); - } - - /** - * Copy the content of the set into an array. - * @param dest the destination array. - * @throws NullPointerException if dest is null. - * @throws ArrayIndexOutOfBoundsException if there is not enough - * space after offset in the destination array to hold all values in the set. - */ - public void copyTo(final int[] dest) { - copyTo(dest,0); - } - - - @Override - public boolean add(final Integer integer) { - return add((int) integer); - } - - @Override - public boolean remove(final Object o) { - return o instanceof Integer && remove((int)o); - } - - /** - * Removes a single integer value for the set. - * @param i the value to remove. - * @return true if the set has changed as a result of this invocation, false otherwise. 
- */ - public boolean remove(final int i) { - final int pos = Arrays.binarySearch(elements,0,size,i); - if (pos < 0) - return false; - else { - removeIndex(pos); - return true; - } - } - - @Override - public boolean containsAll(final Collection c) { - for (final Object o : c) - if (!contains(o)) - return false; - return true; - } - - - @Override - public boolean retainAll(final Collection c) { - if (size == 0) - return false; - @SuppressWarnings("all") - final CountSet retainIndices = new CountSet(c.size() + 2); - retainIndices.add(-1); - retainIndices.add(size); - for (final Object o : c) { - if (!(o instanceof Integer)) - continue; - final int pos = Arrays.binarySearch(elements,0,size,(int) o); - if (pos < 0) - continue; - retainIndices.add(pos); - } - if (retainIndices.size == 2) { - size = 0; - return true; - } else if (retainIndices.size == size + 2) { - return false; - } else { - for (int idx = retainIndices.size - 1; idx > 0; idx--) { - final int toIdx = retainIndices.elements[idx]; - final int fromIdx = retainIndices.elements[idx - 1] + 1; - removeIndices(toIdx,fromIdx); - } - return true; - } - } - - /** - * Removes the values found in a range of indexes in {@link #elements}. - * @param fromIdx first index to remove (inclusive). - * @param toIdx right after last index to remove (exclusive). 
- */ - @Requires("fromIdx >= toIdx & fromIdx >= 0 & toIdx <= size") - private void removeIndices(final int fromIdx, final int toIdx) { - System.arraycopy(elements,toIdx,elements,fromIdx,size - toIdx); - size -= toIdx - fromIdx; - } - - @Override - public boolean removeAll(final Collection c) { - boolean result = false; - for (final Object o : c) - result = remove(o) | result; - return result; - } - - @Requires("idx >= 0 && idx < size") - private void removeIndex(int idx) { - System.arraycopy(elements,idx+1,elements,idx,size - idx - 1); - } - - @Override - public void clear() { - size = 0; - } - - /** - * Returns a copy of this set which can be changed without modifying the original one. - * @return never {@code null}. - */ - @NotNull - @SuppressWarnings("all") - public CountSet clone() { - return new CountSet(this); - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder(2 + size() * 10); - sb.append('{'); - for (int i = 0; i < size; i++) - sb.append(elements[i]).append(','); - sb.replace(sb.length()-1,sb.length(),"}"); - return sb.toString(); - - } - - - /** - * Custom iterator class for {@link CountSet IntSets} - */ - private class MyIterator implements Iterator { - /** What position I am in. 
*/ - private int next = 0; - - @Override - public boolean hasNext() { - return next < size; - } - - @Override - public Integer next() { - if (next >= size) - throw new NoSuchElementException(); - return elements[next]; - } - - @Override - public void remove() { - if (next == 0) - throw new IllegalStateException(); - if (next >= size) - throw new NoSuchElementException(); - removeIndex(next - 1); - } - } - - -} diff --git a/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java b/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java deleted file mode 100644 index 98aedf786..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java +++ /dev/null @@ -1,302 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.utils.gvcf; - -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.ReferenceConfidenceModel; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypeBuilder; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.vcf.*; - -import java.util.*; - -/** - * Genome-wide VCF writer - * - * User: depristo - * Date: 6/24/13 - * Time: 2:51 PM - */ -public class GVCFWriter implements VariantContextWriter { - // - // static VCF field names - // - protected final static String BLOCK_SIZE_INFO_FIELD = "BLOCK_SIZE"; - protected final static String MIN_DP_FORMAT_FIELD = "MIN_DP"; - protected final static String MIN_GQ_FORMAT_FIELD = "MIN_GQ"; - - // - // Final fields initialized in constructor - // - /** Where we'll ultimately write our VCF records */ - final private VariantContextWriter underlyingWriter; - - final private List GQPartitions; - - /** fields updated on the fly during GVCFWriter operation */ - int nextAvailableStart = -1; - private String sampleName = null; - private HomRefBlock currentBlock = null; - - /** - * Is the proposed GQ partitions well-formed? 
- * - * @param GQPartitions proposed GQ partitions - * @return a non-null string if something is wrong (string explains issue) - */ - protected static List parsePartitions(final List GQPartitions) { - if ( GQPartitions == null ) throw new IllegalArgumentException("GQpartitions cannot be null"); - if ( GQPartitions.isEmpty() ) throw new IllegalArgumentException("GQpartitions cannot be empty"); - - final List result = new LinkedList<>(); - int lastThreshold = 0; - for ( final Integer value : GQPartitions ) { - if ( value == null ) throw new IllegalArgumentException("GQPartitions contains a null integer"); - if ( value < lastThreshold ) throw new IllegalArgumentException("GQPartitions is out of order. Last is " + lastThreshold + " but next is " + value); - if ( value == lastThreshold ) throw new IllegalArgumentException("GQPartitions is equal elements: Last is " + lastThreshold + " but next is " + value); - result.add(new HomRefBlock(lastThreshold, value)); - lastThreshold = value; - } - result.add(new HomRefBlock(lastThreshold, Integer.MAX_VALUE)); - - return result; - } - - /** - * Create a new GVCF writer - * - * Should be a non-empty list of boundaries. 
For example, suppose this variable is - * - * [A, B, C] - * - * We would partition our hom-ref sites into the following bands: - * - * X < A - * A <= X < B - * B <= X < C - * X >= C - * - * @param underlyingWriter the ultimate destination of the GVCF records - * @param GQPartitions a well-formed list of GQ partitions - */ - public GVCFWriter(final VariantContextWriter underlyingWriter, final List GQPartitions) { - if ( underlyingWriter == null ) throw new IllegalArgumentException("underlyingWriter cannot be null"); - this.underlyingWriter = underlyingWriter; - this.GQPartitions = parsePartitions(GQPartitions); - } - - /** - * Write the VCF header - * - * Adds standard GVCF fields to the header - * - * @param header a non-null header - */ - @Override - public void writeHeader(VCFHeader header) { - if ( header == null ) throw new IllegalArgumentException("header cannot be null"); - header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); - header.addMetaDataLine(new VCFInfoHeaderLine(BLOCK_SIZE_INFO_FIELD, 1, VCFHeaderLineType.Integer, "Size of the homozygous reference GVCF block")); - header.addMetaDataLine(new VCFFormatHeaderLine(MIN_DP_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block")); - header.addMetaDataLine(new VCFFormatHeaderLine(MIN_GQ_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum GQ observed within the GVCF block")); - - for ( final HomRefBlock partition : GQPartitions ) { - header.addMetaDataLine(partition.toVCFHeaderLine()); - } - - underlyingWriter.writeHeader(header); - } - - /** - * Close this GVCF writer. 
Finalizes any pending hom-ref blocks and emits those to the underlyingWriter as well - */ - @Override - public void close() { - close(true); - } - - /** - * Horrible work around because there's no clean way to get our VCFWriter closed by the GATK - * - * If closeUnderlyingWriter is true, then we'll close the underlying writer, otherwise we'll leave it open - * so the GATK closes it later - * - * @param closeUnderlyingWriter should we leave the underlying writer open or closed? - */ - public void close(final boolean closeUnderlyingWriter) { - emitCurrentBlock(); - if ( closeUnderlyingWriter ) underlyingWriter.close(); - } - - /** - * Add hom-ref site from vc to this gVCF hom-ref state tracking, emitting any pending states if appropriate - * - * @param vc a non-null VariantContext - * @param g a non-null genotype from VariantContext - * @return a VariantContext to be emitted, or null if non is appropriate - */ - protected VariantContext addHomRefSite(final VariantContext vc, final Genotype g) { - if ( nextAvailableStart != -1 && vc.getStart() <= nextAvailableStart ) { - // don't create blocks while the hom-ref site falls before nextAvailableStart (for deletions) - return null; - } else if ( currentBlock == null ) { - currentBlock = createNewBlock(vc, g); - return null; - } else if ( currentBlock.withinBounds(g.getGQ()) ) { - currentBlock.add(vc.getStart(), g); - return null; - } else { - final VariantContext result = blockToVCF(currentBlock); - currentBlock = createNewBlock(vc, g); - return result; - } - } - - /** - * Flush the current hom-ref block, if necessary, to the underlying writer, and reset the currentBlock to null - */ - private void emitCurrentBlock() { - if ( currentBlock != null ) { - // there's actually some work to do - underlyingWriter.add(blockToVCF(currentBlock)); - currentBlock = null; - } - } - - /** - * Convert a HomRefBlock into a VariantContext - * - * @param block the block to convert - * @return a VariantContext representing the gVCF encoding 
for this block - */ - private VariantContext blockToVCF(final HomRefBlock block) { - if ( block == null ) throw new IllegalArgumentException("block cannot be null"); - - final VariantContextBuilder vcb = new VariantContextBuilder(block.getStartingVC()); - vcb.attributes(new HashMap(2)); // clear the attributes - vcb.stop(block.getStop()); - vcb.attribute(VCFConstants.END_KEY, block.getStop()); - vcb.attribute(BLOCK_SIZE_INFO_FIELD, block.getSize()); - - // create the single Genotype with GQ and DP annotations - final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Collections.nCopies(2, block.getRef())); - gb.noAD().noPL().noAttributes(); // clear all attributes - gb.GQ(block.getMedianGQ()); - gb.DP(block.getMedianDP()); - gb.attribute(MIN_DP_FORMAT_FIELD, block.getMinDP()); - gb.attribute(MIN_GQ_FORMAT_FIELD, block.getMinGQ()); - gb.PL(block.getMinPLs()); - - return vcb.genotypes(gb.make()).make(); - } - - /** - * Helper function to create a new HomRefBlock from a variant context and current genotype - * - * @param vc the VariantContext at the site where want to start the band - * @param g the genotype of the sample from vc that should be used to initialize the block - * @return a newly allocated and initialized block containing g already - */ - private HomRefBlock createNewBlock(final VariantContext vc, final Genotype g) { - // figure out the GQ limits to use based on the GQ of g - HomRefBlock partition = null; - for ( final HomRefBlock maybePartition : GQPartitions ) { - if ( maybePartition.withinBounds(g.getGQ()) ) { - partition = maybePartition; - break; - } - } - if ( partition == null ) throw new IllegalStateException("GQ " + g + " from " + vc + " didn't fit into any partition " + partition); - - // create the block, add g to it, and return it for use - final HomRefBlock block = new HomRefBlock(vc, partition.getGQLowerBound(), partition.getGQUpperBound()); - block.add(vc.getStart(), g); - return block; - } - - /** - * Add a VariantContext to this writer 
for emission - * - * Requires that the VC have exactly one genotype - * - * @param vc a non-null VariantContext - */ - @Override - public void add(VariantContext vc) { - if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); - - if ( sampleName == null ) - sampleName = vc.getGenotype(0).getSampleName(); - - if ( ! vc.hasGenotypes() ) { - throw new IllegalArgumentException("GVCF assumes that the VariantContext has genotypes"); - } else if ( vc.getGenotypes().size() != 1 ) { - throw new IllegalArgumentException("GVCF assumes that the VariantContext has exactly one genotype but saw " + vc.getGenotypes().size()); - } else { - if ( currentBlock != null && ! currentBlock.isContiguous(vc) ) { - // we've made a non-contiguous step (across interval, onto another chr), so finalize - emitCurrentBlock(); - } - - final Genotype g = vc.getGenotype(0); - if ( g.isHomRef() && vc.hasAlternateAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) ) { - // create bands - final VariantContext maybeCompletedBand = addHomRefSite(vc, g); - if ( maybeCompletedBand != null ) underlyingWriter.add(maybeCompletedBand); - } else { - // g is variant, so flush the bands and emit vc - emitCurrentBlock(); - nextAvailableStart = vc.getEnd(); - underlyingWriter.add(vc); - } - } - } -} diff --git a/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java b/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java deleted file mode 100644 index ebd167a31..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java +++ /dev/null @@ -1,186 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.gvcf; - -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFHeaderLine; - -import java.util.ArrayList; -import java.util.List; - -/** - * Helper class for calculating a GQ band in the GVCF writer - * - * A band contains GQ and DP values for a contiguous stretch of hom-ref genotypes, - * and provides summary information about the entire block of genotypes. 
- * - * Genotypes within the HomRefBlock are restricted to hom-ref genotypes within a band of GQ scores - * - * User: depristo - * Date: 6/25/13 - * Time: 9:41 AM - */ -final class HomRefBlock { - private final VariantContext startingVC; - private int stop; - private final int minGQ, maxGQ; - private int[] minPLs = null; - final private List GQs = new ArrayList<>(100); - final private List DPs = new ArrayList<>(100); - private final Allele ref; - - /** - * Create a new HomRefBlock - * - * @param startingVC the VariantContext that starts this band (for starting position information) - * @param minGQ the minGQ (inclusive) to use in this band - * @param maxGQ the maxGQ (exclusive) to use in this band - */ - public HomRefBlock(final VariantContext startingVC, int minGQ, int maxGQ) { - if ( startingVC == null ) throw new IllegalArgumentException("startingVC cannot be null"); - if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ); - - this.startingVC = startingVC; - this.stop = getStart() - 1; - this.ref = startingVC.getReference(); - this.minGQ = minGQ; - this.maxGQ = maxGQ; - } - - /** - * Create a new HomRefBlock only for doing bounds checking - * - * @param minGQ the minGQ (inclusive) to use in this band - * @param maxGQ the maxGQ (exclusive) to use in this band - */ - public HomRefBlock(int minGQ, int maxGQ) { - if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ); - - this.startingVC = null; - this.stop = -1; - this.ref = null; - this.minGQ = minGQ; - this.maxGQ = maxGQ; - } - - /** - * Add information from this Genotype to this band - * @param g a non-null Genotype with GQ and DP attributes - */ - public void add(final int pos, final Genotype g) { - if ( g == null ) throw new IllegalArgumentException("g cannot be null"); - if ( ! g.hasGQ() ) throw new IllegalArgumentException("g must have GQ field"); - if ( ! 
g.hasPL() ) throw new IllegalArgumentException("g must have PL field"); - if ( ! g.hasDP() ) throw new IllegalArgumentException("g must have DP field"); - if ( pos != stop + 1 ) throw new IllegalArgumentException("adding genotype at pos " + pos + " isn't contiguous with previous stop " + stop); - - if( minPLs == null ) { // if the minPLs vector has not been set yet, create it here by copying the provided genotype's PLs - final int[] PL = g.getPL(); - if( PL.length == 3 ) { - minPLs = PL.clone(); - } - } else { // otherwise take the min with the provided genotype's PLs - final int[] PL = g.getPL(); - if( PL.length == 3 ) { - minPLs[0] = Math.min(minPLs[0], PL[0]); - minPLs[1] = Math.min(minPLs[1], PL[1]); - minPLs[2] = Math.min(minPLs[2], PL[2]); - } - } - stop = pos; - GQs.add(Math.min(g.getGQ(), 99)); // cap the GQs by the max. of 99 emission - DPs.add(g.getDP()); - } - - /** - * Is the GQ value within the bounds of this GQ (GQ >= minGQ && GQ < maxGQ) - * @param GQ the GQ value to test - * @return true if within bounds, false otherwise - */ - public boolean withinBounds(final int GQ) { - return GQ >= minGQ && GQ < maxGQ; - } - - /** Get the min GQ observed within this band */ - public int getMinGQ() { return MathUtils.arrayMin(GQs); } - /** Get the median GQ observed within this band */ - public int getMedianGQ() { return MathUtils.median(GQs); } - /** Get the min DP observed within this band */ - public int getMinDP() { return MathUtils.arrayMin(DPs); } - /** Get the median DP observed within this band */ - public int getMedianDP() { return MathUtils.median(DPs); } - /** Get the min PLs observed within this band, can be null if no PLs have yet been observed */ - public int[] getMinPLs() { return minPLs; } - - protected int getGQUpperBound() { return maxGQ; } - protected int getGQLowerBound() { return minGQ; } - - public boolean isContiguous(final VariantContext vc) { - return vc.getEnd() == getStop() + 1 && startingVC.getChr().equals(vc.getChr()); - } - - public 
VariantContext getStartingVC() { return startingVC; } - public int getStart() { return startingVC.getStart(); } - public int getStop() { return stop; } - public Allele getRef() { return ref; } - public int getSize() { return getStop() - getStart() + 1; } - - @Override - public String toString() { - return "HomRefBlock{" + - "minGQ=" + minGQ + - ", maxGQ=" + maxGQ + - '}'; - } - - public VCFHeaderLine toVCFHeaderLine() { - return new VCFHeaderLine("GVCFBlock", "minGQ=" + getGQLowerBound() + "(inclusive),maxGQ=" + getGQUpperBound() + "(exclusive)"); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java deleted file mode 100644 index 6d839a832..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java +++ /dev/null @@ -1,332 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.utils.haplotypeBAMWriter; - -import net.sf.samtools.Cigar; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMTag; -import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; - -import java.util.*; - -/** - * A BAMWriter that aligns reads to haplotypes and emits their best alignments to a BAM file - * - * User: depristo - * Date: 2/22/13 - * Time: 2:59 PM - */ -public abstract class HaplotypeBAMWriter { - /** - * Allows us to write out unique names for our synthetic haplotype reads - */ - private long uniqueNameCounter = 1; - - protected final static String READ_GROUP_ID = "ArtificialHaplotype"; - protected final static String HAPLOTYPE_TAG = "HC"; - - private final ReadDestination output; - private boolean writeHaplotypesAsWell = true; - private boolean onlyRealignInformativeReads = false; - - /** - * Possible modes for writing haplotypes to BAMs - */ - public static enum Type { - /** - * A mode that's for method developers. Writes out all of the possible - * haplotypes considered, as well as reads aligned to each - */ - ALL_POSSIBLE_HAPLOTYPES, - - /** - * A mode for users. Writes out the reads aligned only to the called - * haplotypes. 
Useful to understand why the caller is calling what it is - */ - CALLED_HAPLOTYPES - } - - /** - * Create a new HaplotypeBAMWriter of type writing SAMRecords to writer - * - * @param type the type of the writer we want to create - * @param stingSAMWriter the destination, must not be null - * @param header the header of the input BAMs used to make calls, must not be null - * @return a new HaplotypeBAMWriter - */ - public static HaplotypeBAMWriter create(final Type type, final StingSAMFileWriter stingSAMWriter, final SAMFileHeader header) { - if ( type == null ) throw new IllegalArgumentException("type cannot be null"); - - final ReadDestination toBam = new ReadDestination.ToBAM(stingSAMWriter, header, READ_GROUP_ID); - return create(type, toBam); - } - - /** - * Create a new HaplotypeBAMWriter of type writing SAMRecords to writer - * - * Note that writer must have its presorted bit set to false, as reads - * may come in out of order during writing - * - * @param type the type of the writer we want to create - * @param destination the destination, must not be null - * @return a new HaplotypeBAMWriter - */ - public static HaplotypeBAMWriter create(final Type type, final ReadDestination destination) { - if ( destination == null ) throw new IllegalArgumentException("writer cannot be null"); - if ( type == null ) throw new IllegalArgumentException("type cannot be null"); - - switch ( type ) { - case ALL_POSSIBLE_HAPLOTYPES: return new AllHaplotypeBAMWriter(destination); - case CALLED_HAPLOTYPES: return new CalledHaplotypeBAMWriter(destination); - default: throw new IllegalArgumentException("Unknown type " + type); - } - } - - /** - * Create a new HaplotypeBAMWriter writing its output to bamWriter - * - * Assumes that the header has been fully initialized with a single - * read group READ_GROUP_ID - * - * @param output our output destination - */ - protected HaplotypeBAMWriter(final ReadDestination output) { - this.output = output; - } - - /** - * Write out a BAM 
representing for the haplotype caller at this site - * - * @param haplotypes a list of all possible haplotypes at this loc - * @param paddedReferenceLoc the span of the based reference here - * @param bestHaplotypes a list of the best (a subset of all) haplotypes that actually went forward into genotyping - * @param calledHaplotypes a list of the haplotypes at where actually called as non-reference - * @param stratifiedReadMap a map from sample -> likelihoods for each read for each of the best haplotypes - */ - public abstract void writeReadsAlignedToHaplotypes(final Collection haplotypes, - final GenomeLoc paddedReferenceLoc, - final Collection bestHaplotypes, - final Set calledHaplotypes, - final Map stratifiedReadMap); - - public void writeReadsAlignedToHaplotypes(final Collection haplotypes, - final GenomeLoc paddedReferenceLoc, - final Map stratifiedReadMap) { - writeReadsAlignedToHaplotypes(haplotypes, paddedReferenceLoc, haplotypes, new HashSet<>(haplotypes), stratifiedReadMap); - } - - /** - * Write out read aligned to haplotype to the BAM file - * - * Aligns reads the haplotype, and then projects this alignment of read -> hap onto the reference - * via the alignment of haplotype (via its getCigar) method. - * - * @param originalRead the read we want to write aligned to the reference genome - * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference - * @param referenceStart the start of the reference that haplotype is aligned to. Provides global coordinate frame. 
- * @param isInformative true if the read is differentially informative for one of the haplotypes - */ - protected void writeReadAgainstHaplotype(final GATKSAMRecord originalRead, - final Haplotype haplotype, - final int referenceStart, - final boolean isInformative) { - if( onlyRealignInformativeReads && !isInformative ) { - if( originalRead != null ) { - output.add(originalRead); - } - } else if (haplotype == null) { - output.add(originalRead); - return; - } else { - final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart, isInformative); - if ( alignedToRef != null ) { - output.add(alignedToRef); - } else { - output.add(originalRead); - } - } - } - - /** - * Aligns reads the haplotype, and then projects this alignment of read -> hap onto the reference - * via the alignment of haplotype (via its getCigar) method. - * - * @param originalRead the read we want to write aligned to the reference genome - * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference - * @param referenceStart the start of the reference that haplotype is aligned to. Provides global coordinate frame. 
- * @param isInformative true if the read is differentially informative for one of the haplotypes - * @return a GATKSAMRecord aligned to reference, or null if no meaningful alignment is possible - */ - protected GATKSAMRecord createReadAlignedToRef(final GATKSAMRecord originalRead, - final Haplotype haplotype, - final int referenceStart, - final boolean isInformative) { - if ( originalRead == null ) throw new IllegalArgumentException("originalRead cannot be null"); - if ( haplotype == null ) throw new IllegalArgumentException("haplotype cannot be null"); - if ( haplotype.getCigar() == null ) throw new IllegalArgumentException("Haplotype cigar not set " + haplotype); - if ( referenceStart < 1 ) throw new IllegalArgumentException("reference start much be >= 1 but got " + referenceStart); - - try { - // compute the smith-waterman alignment of read -> haplotype - final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), originalRead.getReadBases(), Path.NEW_SW_PARAMETERS); - //swPairwiseAlignment.printAlignment(haplotype.getBases(), originalRead.getReadBases()); - if ( swPairwiseAlignment.getAlignmentStart2wrt1() == -1 ) - // sw can fail (reasons not clear) so if it happens just don't write the read - return null; - final Cigar swCigar = AlignmentUtils.consolidateCigar(swPairwiseAlignment.getCigar()); - - // since we're modifying the read we need to clone it - final GATKSAMRecord read = (GATKSAMRecord)originalRead.clone(); - - addHaplotypeTag(read, haplotype); - - // uninformative reads are set to zero mapping quality to enhance visualization - if ( !isInformative ) - read.setMappingQuality(0); - - // compute here the read starts w.r.t. 
the reference from the SW result and the hap -> ref cigar - final Cigar extendedHaplotypeCigar = haplotype.getConsolidatedPaddedCigar(1000); - final int readStartOnHaplotype = AlignmentUtils.calcFirstBaseMatchingReferenceInCigar(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1()); - final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype; - read.setAlignmentStart(readStartOnReference); - - // compute the read -> ref alignment by mapping read -> hap -> ref from the - // SW of read -> hap mapped through the given by hap -> ref - final Cigar haplotypeToRef = AlignmentUtils.trimCigarByBases(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1(), extendedHaplotypeCigar.getReadLength() - 1); - final Cigar readToRefCigarRaw = AlignmentUtils.applyCigarToCigar(swCigar, haplotypeToRef); - final Cigar readToRefCigarClean = AlignmentUtils.cleanUpCigar(readToRefCigarRaw); - final Cigar readToRefCigar = AlignmentUtils.leftAlignIndel(readToRefCigarClean, haplotype.getBases(), - originalRead.getReadBases(), swPairwiseAlignment.getAlignmentStart2wrt1(), 0, true); - - read.setCigar(readToRefCigar); - - if ( readToRefCigar.getReadLength() != read.getReadLength() ) - throw new IllegalStateException("Cigar " + readToRefCigar + " with read length " + readToRefCigar.getReadLength() - + " != read length " + read.getReadLength() + " for read " + read.format() + "\nhapToRef " + haplotypeToRef + " length " + haplotypeToRef.getReadLength() + "/" + haplotypeToRef.getReferenceLength() - + "\nreadToHap " + swCigar + " length " + swCigar.getReadLength() + "/" + swCigar.getReferenceLength()); - - return read; - } catch ( CloneNotSupportedException e ) { - throw new IllegalStateException("GATKSAMRecords should support clone but this one does not " + originalRead); - } - } - - /** - * Add a haplotype tag to the read based on haplotype - * - * @param read the read to add the tag to - * @param haplotype the 
haplotype that gives rises to read - */ - private void addHaplotypeTag(final GATKSAMRecord read, final Haplotype haplotype) { - // add a tag to the read that indicates which haplotype it best aligned to. It's a uniquish integer - read.setAttribute(HAPLOTYPE_TAG, haplotype.hashCode()); - } - - /** - * Write out haplotypes as reads to the BAM, marking specifically those that are among the best haplotypes - * - * @param haplotypes a collection of haplotypes to write to the BAM - * @param bestHaplotypes a subset of haplotypes that contains those that are best "either good or called" - * @param paddedReferenceLoc the genome loc of the padded reference - */ - protected void writeHaplotypesAsReads(final Collection haplotypes, - final Set bestHaplotypes, - final GenomeLoc paddedReferenceLoc) { - if ( isWriteHaplotypesAsWell() ) - for ( final Haplotype haplotype : haplotypes ) - writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype)); - } - - /** - * Write out a representation of this haplotype as a read - * - * @param haplotype a haplotype to write out. Cannot be null - * @param paddedRefLoc the reference location. Cannot be null - * @param isAmongBestHaplotypes true if among the best haplotypes, false if it was just one possible but not so good - */ - private void writeHaplotype(final Haplotype haplotype, - final GenomeLoc paddedRefLoc, - final boolean isAmongBestHaplotypes) { - final GATKSAMRecord record = new GATKSAMRecord(output.getHeader()); - record.setReadBases(haplotype.getBases()); - record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); - record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); - record.setCigar(AlignmentUtils.consolidateCigar(haplotype.getCigar())); - record.setMappingQuality(isAmongBestHaplotypes ? 
60 : 0); - record.setReadName("HC" + uniqueNameCounter++); - addHaplotypeTag(record, haplotype); - record.setReadUnmappedFlag(false); - record.setReferenceIndex(paddedRefLoc.getContigIndex()); - record.setAttribute(SAMTag.RG.toString(), READ_GROUP_ID); - record.setFlags(16); - output.add(record); - } - - public boolean isWriteHaplotypesAsWell() { - return writeHaplotypesAsWell; - } - - public void setWriteHaplotypesAsWell(final boolean writeHaplotypesAsWell) { - this.writeHaplotypesAsWell = writeHaplotypesAsWell; - } - - public boolean getOnlyRealignInformativeReads() { - return onlyRealignInformativeReads; - } - - public void setOnlyRealignInformativeReads(final boolean onlyRealignInformativeReads) { - this.onlyRealignInformativeReads = onlyRealignInformativeReads; - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java deleted file mode 100644 index a693ec22d..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java +++ /dev/null @@ -1,450 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.pairhmm; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.QualityUtils; - -import java.util.Arrays; - -/** - * Created with IntelliJ IDEA. - * User: bradt - * Date: 6/11/13 - */ -public class ArrayLoglessPairHMM extends PairHMM { - private static final double INITIAL_CONDITION = Math.pow(2, 1020); - private static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION); - - // we divide e by 3 because the observed base could have come from any of the non-observed alleles - protected static final double TRISTATE_CORRECTION = 3.0; - - private static final int matchToMatch = 0; - private static final int indelToMatch = 1; - private static final int matchToInsertion = 2; - private static final int insertionToInsertion = 3; - private static final int matchToDeletion = 4; - private static final int deletionToDeletion = 5; - - protected double[][] transition = null; // The transition probabilities cache - protected double[][] prior = null; // The prior probabilities cache - - // Array declarations for arrays implementation - private double[] currentMatchArray = null; - private double[] currentDeleteArray = null; - private double[] currentInsertArray = 
null; - private double[] parentMatchArray = null; - private double[] parentDeleteArray = null; - private double[] parentInsertArray = null; - private double[] grandparentMatchArray = null; - private double[] grandparentDeleteArray = null; - private double[] grandparentInsertArray = null; - - // When successive haplotypes have a common prefix, these arrays store cached info from the previous haplotype; for reading - private double[] matchCacheArray = null; - private double[] deleteCacheArray = null; - private double[] insertCacheArray = null; - - // These arrays store cache info for use with the next haplotype; for writing - private double[] nextMatchCacheArray = null; - private double[] nextDeleteCacheArray = null; - private double[] nextInsertCacheArray = null; - - // Used when caching to store our intermediate sum at point of first difference bw successive haplotypes - private double partialSum; - - - /** - * {@inheritDoc} - */ - @Override - public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { - super.initialize(readMaxLength, haplotypeMaxLength); - - transition = new double[paddedMaxReadLength][6]; - prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; - - // Initialize all arrays - // Final Cell of array is a padding cell, initialized to zero. 
- currentMatchArray = new double[paddedMaxReadLength]; - currentDeleteArray = new double[paddedMaxReadLength]; - currentInsertArray = new double[paddedMaxReadLength]; - - parentMatchArray = new double[paddedMaxReadLength]; - parentDeleteArray = new double[paddedMaxReadLength]; - parentInsertArray = new double[paddedMaxReadLength]; - - grandparentMatchArray = new double[paddedMaxReadLength]; - grandparentDeleteArray = new double[paddedMaxReadLength]; - grandparentInsertArray = new double[paddedMaxReadLength]; - - // Initialize the special arrays used for caching when successive haplotypes have a common prefix - matchCacheArray = new double[paddedMaxReadLength]; - deleteCacheArray = new double[paddedMaxReadLength]; - insertCacheArray = new double[paddedMaxReadLength]; - - nextMatchCacheArray = new double[paddedMaxReadLength]; - nextDeleteCacheArray = new double[paddedMaxReadLength]; - nextInsertCacheArray = new double [paddedMaxReadLength]; - - } - - - /** - * {@inheritDoc} - */ - @Override - public double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, - final byte[] readBases, - final byte[] readQuals, - final byte[] insertionGOP, - final byte[] deletionGOP, - final byte[] overallGCP, - int hapStartIndex, - final boolean recacheReadValues, - final int nextHapStartIndex) { - - if ( ! constantsAreInitialized) { - initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP); - - // note that we initialized the constants - constantsAreInitialized = true; - } - initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); - - // Some housekeeping to be done if we are starting a new read - if (recacheReadValues) { - hapStartIndex = 0; - - initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP); - // note that we initialized the constants - constantsAreInitialized = true; - - // Read length may have changed, so we need to set zero-value padding at the appropriate position. 
- padMatchAndInsertArrays(readBases.length); - } - - // if we have not cached from a previous haplotype, clear any info we may have accumulated in a previous HMM iteration - if (hapStartIndex == 0) { - clearPreviouslyCachedInfo(readBases.length); - - // Haplotype length may have changed, so we need to set initial-value padding at the appropriate position. - padDeleteArrays(haplotypeBases.length, readBases.length); - } - - // We build up our solution by looking at position [0] in the match, insert arrays. Need to set these to 0 before we start. - clearArraySolutionPosition(); - - // Some parameters to control behavior during the dynamic programming loop - final int maxDiagonals = readBases.length + haplotypeBases.length - hapStartIndex - 1; // Number of diagonals for a matrix = rows + cols - 1; - int startFill; // The lower bound of the array indices we want to over-write - int endFill; // The upper bound of the array indices we want to over-write - final int cacheSumIndex = nextHapStartIndex - hapStartIndex + readBases.length - 1; // This array will contain the partial sum to cache for the next haplotype - double finalArraySumProbabilities = partialSum; // The final answer prior to log10 correction - - // Perform dynamic programming using arrays, as if over diagonals of a hypothetical read/haplotype alignment matrix - for (int i = 1; i <= maxDiagonals; i++) { - // set the bounds for cells we wish to fill in the arrays - startFill = Math.max(readBases.length - i, 0); - endFill = Math.min(maxDiagonals - i + 1, readBases.length); - - // apply any previously cached array information - if (i <= readBases.length) - applyPreviouslyCachedInfo(startFill); - - // fill in the cells for our current arrays - updateArrays(readBases.length, hapStartIndex, nextHapStartIndex, startFill, endFill, i); - - // final probability is the log10 sum of the last element in the Match and Insertion state arrays - // this way we ignore all paths that ended in deletions! 
(huge) - // but we have to sum all the paths ending in the M and I arrays, because they're no longer extended. - // Where i > readBases.length, array[0] corresponds to bottom row of a [read] x [haplotype] matrix. Before this, they carries the 0's we set above. - finalArraySumProbabilities += currentInsertArray[0] + currentMatchArray[0]; - - // Partial sum for caching the next haplotype: - // At the position of the last similar base between this haplotype and the next one... - // ...remember the partial sum, so that we can start here on the next hap. - if (i == cacheSumIndex) - partialSum = finalArraySumProbabilities; - - rotateArrayReferences(); - } - // The cache arrays we wrote for this haplotype will be read for the next haplotype. - rotateCacheArrays(); - - //return result - return Math.log10(finalArraySumProbabilities) - INITIAL_CONDITION_LOG10; - } - - /** - * Initializes the matrix that holds all the constants related to the editing - * distance between the read and the haplotype. - * - * @param haplotypeBases the bases of the haplotype - * @param readBases the bases of the read - * @param readQuals the base quality scores of the read - * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) - */ - public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { - - // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases - // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. - - for (int i = 0; i < readBases.length; i++) { - final byte x = readBases[i]; - final byte qual = readQuals[i]; - for (int j = startIndex; j < haplotypeBases.length; j++) { - final byte y = haplotypeBases[j]; - prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? - QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 
1.0 : TRISTATE_CORRECTION)) ); - } - } - } - - /** - * Initializes the matrix that holds all the constants related to quality scores. - * - * @param insertionGOP insertion quality scores of the read - * @param deletionGOP deletion quality scores of the read - * @param overallGCP overall gap continuation penalty - */ - @Requires({ - "insertionGOP != null", - "deletionGOP != null", - "overallGCP != null" - }) - @Ensures("constantsAreInitialized") - protected static void initializeProbabilities(final double[][] transition, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { - for (int i = 0; i < insertionGOP.length; i++) { - final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); - transition[i+1][matchToMatch] = QualityUtils.qualToProb((byte) qualIndexGOP); - transition[i+1][indelToMatch] = QualityUtils.qualToProb(overallGCP[i]); - transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProb(insertionGOP[i]); - transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProb(overallGCP[i]); - transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]); - transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]); - } - } - - /** - * Pad the ends of the Match and Insert arrays with 0. - * Analogous to setting zeros in the first row in the Match, Insert matrices of N2MemoryPairHMM. - * - * @param padPosition Which index in the arrays we wish to pad - */ - private void padMatchAndInsertArrays(final int padPosition) { - grandparentMatchArray[padPosition] = 0; - grandparentInsertArray[padPosition] = 0; - parentMatchArray[padPosition] = 0; - parentInsertArray[padPosition] = 0; - currentMatchArray[padPosition] = 0; - currentInsertArray[padPosition] = 0; - matchCacheArray[padPosition] = 0; - insertCacheArray[padPosition] = 0; - nextMatchCacheArray[padPosition] = 0; - nextInsertCacheArray[padPosition] = 0; - } - - /** - * Pad the Delete arrays with an intial value. 
Let's us have free deletions at the beginning of the alignment. - * Analogous to padding the first row of the Delete matrix of N2MemoryPairHMM. - * - * @param haplotypeLength The length of the present haplotype. Necessary for calculating initial padding value - * @param padPosition Which index in the arrays we wish to pad - */ - private void padDeleteArrays(final int haplotypeLength, final int padPosition) { - final double initialValue = INITIAL_CONDITION / haplotypeLength; - - // Pad the deletion arrays. Akin to padding the first row in the deletion matrix - parentDeleteArray[padPosition] = initialValue; - grandparentDeleteArray[padPosition] = initialValue; - currentDeleteArray[padPosition] = initialValue; - deleteCacheArray[padPosition] = initialValue; - nextDeleteCacheArray[padPosition] = initialValue; - } - - /** - * We build up our solution by looking at position [0] in the match, insert arrays. Need to set these to 0 before we start. - * - */ - private void clearArraySolutionPosition() { - grandparentMatchArray[0] = 0; - grandparentInsertArray[0] = 0; - parentMatchArray[0] = 0; - parentInsertArray[0] = 0; - currentMatchArray[0] = 0; - currentInsertArray[0] = 0; - } - - /** - * Clears cached information saved from the last haplotype, - * allowing us to start at the beginning of the present haplotype with intitial values of 0. - * - * @param fillLength How much of the cache arrays do we need to zero - */ - private void clearPreviouslyCachedInfo(final int fillLength) { - Arrays.fill(matchCacheArray, 0, fillLength, 0); - Arrays.fill(deleteCacheArray, 0, fillLength, 0); - Arrays.fill(insertCacheArray, 0, fillLength, 0); - - partialSum = 0; - } - - /** - * Applies cached information saved from the last haplotype, - * allowing us to start in the middle of the present haplotype. 
- * - * @param indK the index in the arrays we wish to update with cached info - */ - private void applyPreviouslyCachedInfo(int indK) { - // apply caching info necessary for calculating current DELETE array values - parentMatchArray[indK] = matchCacheArray[indK]; - parentDeleteArray[indK] = deleteCacheArray[indK]; - - // apply caching info necessary for calculating current MATCH array values - grandparentMatchArray[indK + 1] = matchCacheArray[indK + 1]; - grandparentDeleteArray[indK + 1] = deleteCacheArray[indK + 1]; - grandparentInsertArray[indK + 1] = insertCacheArray[indK + 1]; - } - - /** - * Records the mid-process state of one location in the read/haplotype alignment. - * Writes new cache information for use with the next haplotype we see. - * - * @param indK the index in the cache arrays we wish to store information in - */ - private void recordNewCacheInfo(int indK) { - nextMatchCacheArray[indK] = currentMatchArray[indK]; - nextDeleteCacheArray[indK] = currentDeleteArray[indK]; - nextInsertCacheArray[indK] = currentInsertArray[indK]; - } - - /** - * Update the HMM arrays for the current diagonal. 
- * - * @param readLength The length of the read - * @param hapStartIndex An offset that tells us if we are starting in the middle of the present haplotype - * @param nextHapStartIndex An offset that tells us which base in the NEXT haplotype we need to look at to record new caching info - * @param startFill The lower bound of the array indices we want to over-write - * @param endFill The upper bound of the array indices we want to over-write - * @param iii The index indicating which diagonal of the read/haplotype alignment we are working on - */ - private void updateArrays(final int readLength, - final int hapStartIndex, - final int nextHapStartIndex, - final int startFill, - final int endFill, - final int iii) { - - // The coordinate in our priors and transition matrices corresponding to a given position in the read/haplotype alignment - int matrixRow; - int matrixCol; - - int arrayIndex; - for (arrayIndex = startFill; arrayIndex < endFill; arrayIndex++) { - // translate the array position into a row, column in the priors and transition matrices - matrixRow = readLength - arrayIndex - 1; - matrixCol = iii - matrixRow - 1 + hapStartIndex; - - // update cell for each of our current arrays. 
Prior, transition matrices are padded +1 row,col - updateArrayCell(arrayIndex, prior[matrixRow+1][matrixCol+1], transition[matrixRow+1]); - - // Set up caching for the next haplotype - // At the position of the final similar base between this haplotype and the next one, remember the mid-array values - if (matrixCol == nextHapStartIndex - 1) - recordNewCacheInfo(arrayIndex); - } - } - - /** - * Updates a cell in the HMM arrays - * - * @param indK index in the arrays to update - * @param prior the likelihood editing distance matrix for the read x haplotype - * @param transition an array with the six transition relevant to this location - */ - private void updateArrayCell( final int indK, final double prior, final double[] transition) { - currentMatchArray[indK] = prior * ( grandparentMatchArray[indK + 1] * transition[matchToMatch] + - grandparentInsertArray[indK + 1] * transition[indelToMatch] + - grandparentDeleteArray[indK + 1] * transition[indelToMatch] ); - currentInsertArray[indK] = parentMatchArray[indK + 1] * transition[matchToInsertion] + parentInsertArray[indK + 1] * transition[insertionToInsertion]; - currentDeleteArray[indK] = parentMatchArray[indK] * transition[matchToDeletion] + parentDeleteArray[indK] * transition[deletionToDeletion]; - } - - /** - * To prepare for the next diagonal in our loop, each array must be bumped to an older generation - * - */ - private void rotateArrayReferences() { - double[] tempMatchArray = grandparentMatchArray; - double[] tempDeleteArray = grandparentDeleteArray; - double[] tempInsertArray = grandparentInsertArray; - - grandparentMatchArray = parentMatchArray; - grandparentDeleteArray = parentDeleteArray; - grandparentInsertArray = parentInsertArray; - - parentMatchArray = currentMatchArray; - parentDeleteArray = currentDeleteArray; - parentInsertArray = currentInsertArray; - - currentMatchArray = tempMatchArray; - currentDeleteArray = tempDeleteArray; - currentInsertArray = tempInsertArray; - } - - /** - * To prepare for 
the next haplotype, the caching info we wrote is copied into the cach-read arrays - * - */ - private void rotateCacheArrays() { - matchCacheArray = nextMatchCacheArray.clone(); - deleteCacheArray = nextDeleteCacheArray.clone(); - insertCacheArray = nextInsertCacheArray.clone(); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java deleted file mode 100644 index 2872bea37..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java +++ /dev/null @@ -1,813 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.pairhmm; - -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; - -/** - * Fast partial PairHMM backed on the standard Logless PairHMM - * - */ -public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM { - - - /** - * Initial read length capacity. 
- */ - private static final int INITIAL_READ_LENGTH_CAPACITY = 200; - - /** - * Initial haplotype length capacity. - */ - private static final int INITIAL_HAPLOTYPE_LENGTH_CAPACITY = 400; - - - /** - * Holds the current read capacity. - *

It can only go up overtime.

- */ - private int readCapacity = INITIAL_READ_LENGTH_CAPACITY; - - /** - * Holds the current haplotype length capacity. - *

It can only go up overtime.

- */ - private int haplotypeCapacity = INITIAL_HAPLOTYPE_LENGTH_CAPACITY; - - private int maxToCol; - private int haplotypeLength; - - /** - * Returns the currently loaded read base qualities. - * - * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. - * @return never {@code null}. - */ - public byte[] getReadQuals() { - if (readQuals == null) - throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); - return readQuals; - } - - /** - * Returns the currently loaded read insertion qualities. - * - * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. - * @return never {@code null}. - */ - @SuppressWarnings("unused") - public byte[] getReadInsQuals() { - if (readQuals == null) - throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); - return readInsQuals; - } - - /** - * Returns the currently loaded read deletion qualities. - * - * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. - * @return never {@code null}. - */ - @SuppressWarnings("unused") - public byte[] getReadDelQuals() { - if (readQuals == null) - throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); - return readDelQuals; - } - - /** - * Returns the currently loaded read gap extension penalty.. - * - * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. - * @return never {@code null}. - */ - @SuppressWarnings("unused") - public byte[] getReadGepQuals() { - if (readQuals == null) - throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); - return readGepQuals; - } - - - /** - * Creates a new pair-hmm calculator instance give the gap continuation penalty. - * - * @param gcp the gap-continuation penalty. 
- */ - public FastLoglessPairHMM(final byte gcp) { - constantGCP = gcp; - initialize(readCapacity,haplotypeCapacity); - } - - - @Override - public double subComputeReadLikelihoodGivenHaplotypeLog10(final byte[] haplotypeBases, - final byte[] readBases, - final byte[] readQuals, - final byte[] insertionGOP, - final byte[] deletionGOP, - final byte[] overallGCP, - final int hapStartIndex, - final boolean recacheReadValues, final int nextHapStartIndex) { - this.readBases = readBases; - this.haplotypeBases = haplotypeBases; - this.haplotypeLength = haplotypeBases.length; - return super.subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases,readBases,readQuals, - insertionGOP,deletionGOP,overallGCP,hapStartIndex,recacheReadValues,nextHapStartIndex); - } - - /** - * Implement the last step summation to calculate the total likelihood. - * - * @param row number of the last row of the pair-hmm where the likelihood values are present. - * @param fromCol inclusive first column to include in the summation. - * @param toCol exclusive last column to include in the summation. - * @return 0 or less. - */ - protected double finalLikelihoodCalculation(final int row, - final int fromCol, final int toCol) { - - final double divider = Math.max(1,2 *(toCol - fromCol)); - final double dividerInverse = 1.0 / divider; - double finalLikelihood = 0; - - for (int j = fromCol; j < toCol; j++) { - finalLikelihood += matchMatrix[row][j] * dividerInverse; - finalLikelihood += insertionMatrix[row][j] * dividerInverse; - } - return StrictMath.log10(finalLikelihood) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider); - } - - /** - * Initialize the matrix values for a problem including the trailing end of the read. - * - *

- * Notice that you can improve performance by omitting filling reusable values from - * previous haplotype calculations. You can set {@code haplotypeStartOffset} to skill - * those columns. - *

- * - * @param readStart inclusive first position of the read used in the calculations. - * @param readEnd exclusive last position of the read considered in the calculations. - * @param haplotypeStartOffset offset of the haplotype right after the reusable prefix - * from previous calls. - * - * - */ - protected void initializeMatrixValuesForTrailingProblem(final int readStart, final int readEnd, - final int haplotypeStartOffset) { - - @SuppressWarnings("all") - final int zeroRow = readStart; - final int toRow = readEnd + 1; - final int toCol = haplotypeLength + 1; - - // fill first row with -Inf fot M and I but not for Deletion if leading - // to allow for free deletions at the beginning. - if (readStart == 0) { - // First row initialization: - Arrays.fill(matchMatrix[zeroRow],haplotypeStartOffset,toCol,0); - Arrays.fill(deletionMatrix[zeroRow],haplotypeStartOffset,toCol,INITIAL_CONDITION); - - if (haplotypeStartOffset == 0) - for (int i = zeroRow + 1; i < toRow; i++) - insertionMatrix[i][0] = matchMatrix[i][0] = deletionMatrix[i][0] = 0; - - } else { - Arrays.fill(matchMatrix[zeroRow], Math.max(1,haplotypeStartOffset), toCol,0); - Arrays.fill(insertionMatrix[zeroRow], haplotypeStartOffset, toCol,0); - if (haplotypeStartOffset == 0) { - matchMatrix[zeroRow][0] = INITIAL_CONDITION; - deletionMatrix[zeroRow][0] = 0; - } - if (haplotypeStartOffset <= 1) deletionMatrix[zeroRow][1] = matchMatrix[zeroRow][1] * transition[zeroRow][matchToDeletion]; - for (int i = Math.max(haplotypeStartOffset,2); i < toCol; i++) { - deletionMatrix[zeroRow][i] = deletionMatrix[zeroRow][i - 1] - * transition[zeroRow][deletionToDeletion]; - } - - if (haplotypeStartOffset == 0) { - matchMatrix[zeroRow + 1][0] = deletionMatrix[zeroRow + 1][0] = 0; - insertionMatrix[zeroRow + 1][0] = matchMatrix[zeroRow][0] * transition[zeroRow + 1][matchToInsertion]; - - - for (int i = zeroRow + 2; i < toRow; i++) { - matchMatrix[i][0] = deletionMatrix[i][0] = 0; - insertionMatrix[i][0] = insertionMatrix[i - 
1][0] - * transition[i][insertionToInsertion]; - } - } - } - } - - /** - * Initializes calculation matrices give the characteristics of the next and previous problems. - * @param currentProblem reference to the Lk calculation problem we are dealing currently. - * @param previousProblem reference to the Lk calculation problem that has been solved just before. - * - */ - protected void initializeMatrixValues(final Problem currentProblem, final Problem previousProblem) { - if (previousProblem != null && - previousProblem.readStart == currentProblem.readStart && - previousProblem.hapStart == currentProblem.hapStart && - maxToCol >= currentProblem.hapEnd + 1) - return; - - final int zeroRow = currentProblem.readStart; - final int zeroCol = currentProblem.hapStart; - final int toRow = currentProblem.readEnd + 1; - final int toCol = currentProblem.hapEnd + 1; - maxToCol = toCol; - - // fill first row with -Inf fot M and I but not for Deletion if leading - // to allow for free deletions at the beginning. - if (currentProblem.leading) { - // First row initialization: - Arrays.fill(matchMatrix[zeroRow],zeroCol,toCol,0); - Arrays.fill(deletionMatrix[zeroRow],zeroCol,toCol,INITIAL_CONDITION); - - for (int i = zeroRow + 1; i < toRow; i++) - insertionMatrix[i][zeroCol] = matchMatrix[i][zeroCol] = deletionMatrix[i][zeroCol] = 0; - - } else { // If not leading set the appropriate matching 1.0 prob and - // deletion + extension. 
- - Arrays.fill(matchMatrix[zeroRow], zeroCol + 1, toCol,0); - Arrays.fill(insertionMatrix[zeroRow], zeroCol, toCol,0); - matchMatrix[zeroRow][zeroCol] = INITIAL_CONDITION; - deletionMatrix[zeroRow][zeroCol] = 0; - deletionMatrix[zeroRow][zeroCol + 1] = matchMatrix[zeroRow][zeroCol] * transition[zeroRow][matchToDeletion]; - for (int i = zeroCol + 2; i < toCol; i++) { - deletionMatrix[zeroRow][i] = deletionMatrix[zeroRow][i - 1] - * transition[zeroRow][deletionToDeletion]; - } - - matchMatrix[zeroRow + 1][zeroCol] = deletionMatrix[zeroRow + 1][zeroCol] = 0; - insertionMatrix[zeroRow + 1][zeroCol] = matchMatrix[zeroRow][zeroCol] * transition[zeroRow + 1][matchToInsertion]; - - for (int i = zeroRow + 2; i < toRow; i++) { - matchMatrix[i][zeroCol] = deletionMatrix[i][zeroCol] = 0; - insertionMatrix[i][zeroCol] = insertionMatrix[i - 1][zeroCol] - * transition[i][insertionToInsertion]; - } - } - } - - /** - * Constant gap-continuation-penalty. - */ - private final byte constantGCP; - - /** - * Currently loaded haplotype base sequence. - */ - private byte[] haplotypeBases; - - /** - * Currently loaded read base sequence. - */ - private byte[] readBases; - - /** - * Read qualities. - */ - private byte[] readQuals; - - /** - * Read insertion qualities. - */ - private byte[] readInsQuals; - - /** - * Read deletion qualities. - */ - private byte[] readDelQuals; - - /** - * Read gap-extension-penalties. - */ - private byte[] readGepQuals; - - /** - * Cached results. - */ - private Map cachedResults = new HashMap<>(); - - /** - * Loads the read that is going to be evaluated in following calls to {@link #calculateLocalLikelihoods}. - * - * @param read the target read. - * @throws NullPointerException if {@code read} is null. 
- */ - @Override - public void loadRead(final GATKSAMRecord read) { - loadRead(read.getReadBases(),read.getBaseQualities(),read.getBaseInsertionQualities(),read.getBaseDeletionQualities(),read.getMappingQuality()); - } - - /** - * Loads the read that is going to be evaluated in following calls to {@link #calculateLocalLikelihoods}. - * - * @param readBases the read bases. - * @param readQuals the read base call quality scores. - * @param readInsQuals the read insertion quality scores. - * @param readDelQuals the read deletion quality scores. - * @param mq the read mapping quality score. - * @throws NullPointerException if any of the arrays passed is {@code null}. - * @throws IllegalArgumentException if the arrays passed have incompatible sizes. - */ - public void loadRead(final byte[] readBases, final byte[] readQuals, final byte[] readInsQuals, final byte[] readDelQuals, int mq) { - // TODO This is a copy&paste from PairHMM*Engine read data preparation code. - // TODO It is simply to difficult to share the code without changing that class and I don't want - // TODO to do so for now. 
- if (readBases.length != readQuals.length) throw new IllegalArgumentException("the read quality array length does not match the read base array length"); - if (readBases.length != readInsQuals.length) throw new IllegalArgumentException("the read insert quality array length does not match the read base array length"); - if (readBases.length != readDelQuals.length) throw new IllegalArgumentException("the read deletion quality length does not match the read base array length"); - maxToCol = 0; - - if (readBases.length > readCapacity) { - readCapacity = readBases.length; - initialize(readCapacity,haplotypeCapacity); - } - paddedReadLength = readBases.length + 1; - final byte[] overallGCP = new byte[readBases.length]; - Arrays.fill(overallGCP, constantGCP); // Is there a way to derive - - for (int kkk = 0; kkk < readQuals.length; kkk++) { - readQuals[kkk] = (byte) Math.min(0xff & readQuals[kkk], - mq); // cap base quality by mapping - // TODO -- why is Q18 hard-coded here??? - readQuals[kkk] = (readQuals[kkk] < (byte) 18 ? 
QualityUtils.MIN_USABLE_Q_SCORE - : readQuals[kkk]); - } - this.readBases = readBases; - this.readQuals = readQuals; - this.readInsQuals = readInsQuals; - this.readDelQuals = readDelQuals; - this.readGepQuals = overallGCP; - initializeProbabilities(transition,readInsQuals, readDelQuals, overallGCP); - if (haplotypeBases != null) - fillPriorsTable(0); - cachedResults.clear(); - } - - @Override - public void loadHaplotypeBases(final byte[] haplotypeBases) { - if (readBases == null) - throw new IllegalStateException( - "no read was loaded before the haplotype"); - this.haplotypeBases = haplotypeBases.clone(); - haplotypeLength = haplotypeBases.length; - paddedHaplotypeLength = haplotypeLength; - if (haplotypeCapacity < haplotypeLength) { - haplotypeCapacity = haplotypeLength; - initialize(readCapacity,haplotypeCapacity); - initializeProbabilities(transition, readInsQuals, readDelQuals, readGepQuals); - } - initializePriors(this.haplotypeBases, readBases, readQuals, 0); - } - - - /** - * Changes only the suffix of the currently loaded haplotype. - * - *

- * If from is 0, this is equivalent to call {@link #loadHaplotypeBases(byte[])} directly. - *

- * @param from first position on the current haplotype to substitute with the new suffix. - * It can be up to the length of the haplotype in such case this operation is in - * effect just extending that haplotype. - * @param suffix the new bases for the end part of the current haplotype. - * @param suffixFrom inclusive first position of the actual suffix within the {@code suffix} array. - * @param suffixTo exclusive last position of the actual suffix within the {@code suffix} array. - * - * @throws IllegalStateException if no read was loaded with {@link #loadRead}. - * @throws IllegalArgumentException if from is more than 0 but no haplotype was loaded previously or if indices passed are inconsistent. - * @throws ArrayIndexOutOfBoundsException if indices passed are outside valid ranges. - */ - public void changeHaplotypeSuffix(final int from, final byte[] suffix, final int suffixFrom, final int suffixTo) { - if (readBases == null) - throw new IllegalStateException( - "no read was loaded before the haplotype"); - if (haplotypeBases == null && from > 0) - throw new IllegalArgumentException("from cannot be larger than 0 if no haplotype bases was previously loaded"); - if (suffixFrom < 0) - throw new ArrayIndexOutOfBoundsException("the suffix from index cannot be negative"); - if (suffixTo > suffix.length) - throw new ArrayIndexOutOfBoundsException("the suffix to index cannot be larger than the suffix array length"); - if (suffixFrom > suffixTo) - throw new IllegalArgumentException("the suffix to index cannot be smaller than the suffix from index"); - if (from > haplotypeLength) - throw new IllegalArgumentException("the from index cannot be greater than the current haplotype length"); - if (from < 0) - throw new IllegalArgumentException("the from index cannot be negative"); - - int startIndex = from; - if (haplotypeBases == null) { - haplotypeBases = Arrays.copyOfRange(suffix,suffixFrom,suffixTo); - haplotypeLength = suffixTo - suffixFrom; - } else { - final int 
newLength = from + suffixTo - suffixFrom; - if (haplotypeBases.length < newLength) - haplotypeBases = Arrays.copyOf(haplotypeBases,newLength); - System.arraycopy(suffix,suffixFrom,haplotypeBases,from,newLength - from); - haplotypeLength = newLength; - } - paddedHaplotypeLength = haplotypeLength + 1; - if (haplotypeCapacity < haplotypeLength) { - haplotypeCapacity = haplotypeLength; - initialize(readCapacity,haplotypeCapacity); - initializeProbabilities(transition, readInsQuals, readDelQuals, readGepQuals); - startIndex = 0; - } - //startIndex = 0; - fillPriorsTable(startIndex); - } - - /** - * Returns the bases of the current haplotype. - * - * @throws IllegalStateException if no haplotype was loaded previously - * @return never {@code null} - */ - public byte[] getHaplotypeBases() { - if (haplotypeBases == null) - throw new IllegalStateException(); - return Arrays.copyOfRange(haplotypeBases,0,haplotypeLength); - } - - /** - * Returns a debug representation of the pair-hmm. - * @return never {@code null}. - */ - public String toString() { - return "" + haplotypeLength + ":" + new String(Arrays.copyOfRange(haplotypeBases,0,haplotypeLength)); - } - - @Override - protected void initializePriors(final byte[] hapBases, final byte[] readBases, final byte[] baseQuals, final int idx) { - haplotypeBases = hapBases; - haplotypeLength = haplotypeBases.length; - this.readBases = readBases; - this.readQuals = baseQuals; - fillPriorsTable(idx); - } - - /** - * Fills the prior table up. - * - *

- * It accepts an argument to save unnecessary prefix filling up. - *

- * - * @param idx first position in the haplotype to start filling from. - */ - protected void fillPriorsTable(final int idx) { - for (int i = 0; i < readBases.length; i++) { - final byte x = readBases[i]; - final byte qual = readQuals[i]; - for (int j = idx; j < haplotypeLength; j++) { - final byte y = haplotypeBases[j]; - prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? - QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) ); - } - } - } - - - /** - * Decorates haplotype set with their likelihoods as compared with the currently loaded read. - * - * - * @param readStart inclusive start position of the targeted section of the read. - * @param readEnd exclusive end position just beyond the targeted section of the read. - * @param haplotypes in/out set of haplotypes. - */ - public void calculateLocalLikelihoods(final int readStart, final int readEnd, final PairHMMReadyHaplotypes haplotypes) { - final PairHMMReadyHaplotypes.Iterator entryIterator = haplotypes.iterator(); - boolean isFirst = true; - while (entryIterator.hasNext()) { - entryIterator.next(); - final int startIndex = entryIterator.startIndex(); - final byte[] bases = entryIterator.bases(); - changeHaplotypeSuffix(startIndex,bases,startIndex,bases.length); - final double likelihood = calculateLikelihood(readStart, readEnd, startIndex, isFirst); - isFirst = false; - entryIterator.setLikelihood(likelihood); - } - } - - - - @Override - public double calculateLocalLikelihood(final int readStart, final int readEnd, - final int hapStart, final int hapEnd, final boolean kmerMatch) { - if (readBases == null || haplotypeBases == null) - throw new IllegalStateException("read or haplotype was not loaded"); - final int hapSegmentLength = hapEnd - hapStart; - final int readSegmentLength = readEnd - readStart; - // trivial case when there is a single base match. 
- if (kmerMatch) { - return calculateLocalLikelihoodsExactMatch(readStart, hapStart, hapSegmentLength, readSegmentLength); - } else if (hapSegmentLength == readSegmentLength) { - if (hapSegmentLength == 0) { - return calculateLocalLikelihoodEmptySquare(readStart, readEnd); - } else if (hapSegmentLength == 1) { - return calculateLocalLikelihoodSingleBase(readStart, readEnd, hapStart); - } else { // general (slower) solution. - return calculateLocalLikelihoodsGeneral(readStart, readEnd, hapStart, hapEnd); - } - } else if (hapSegmentLength == 0) { // must be full insertion we - return calculateLocalLikelihoodInsertion(readStart, readEnd); - } else if (readSegmentLength == 0) { // full deletion. - return calculateLocalLikelihoodDeletion(readStart, hapStart, hapEnd); - } else { // general (slower) solution. - return calculateLocalLikelihoodsGeneral(readStart, readEnd, hapStart, hapEnd); - } - } - - /** - * Fast likelihood when the pair-hmm represents a deletion in the read. - */ - private double calculateLocalLikelihoodDeletion(final int readStart, final int hapStart, final int hapEnd) { - double result = INITIAL_CONDITION; - if (readStart > 0) { // no penalty if at the beginning. - result *= transition[readStart][matchToDeletion]; - result *= - StrictMath.pow(transition[readStart][deletionToDeletion],hapEnd - hapStart - 1); - result *= transition[readStart][indelToMatch]; - } - return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; - } - - - /** - * Fast likelihood when the pair-hmm represents a insertion in the read. 
- */ - private double calculateLocalLikelihoodInsertion(final int readStart, final int readEnd) { - double result = INITIAL_CONDITION; - result *= transition[readStart + 1][matchToInsertion]; - for (int i = readStart + 1; i < readEnd; i++) { - result *= transition[i + 1][insertionToInsertion]; - } - if (readEnd < readBases.length) { - result *= transition[readEnd + 1][indelToMatch]; - } - return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; - } - - /** - * Single base mismatch fast likelihood calculation. - */ - private double calculateLocalLikelihoodSingleBase(final int readStart, final int readEnd, final int hapStart) { - double result = INITIAL_CONDITION; - result *= prior[readStart + 1][hapStart + 1]; - if (readStart > 0) { - result *= transition[readStart + 1][matchToMatch]; - } - if (readEnd < readBases.length) { - result *= transition[readEnd + 1][matchToMatch]; - } - return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; - } - - /** - * Empty square Pair-hmm. - */ - private double calculateLocalLikelihoodEmptySquare(final int readStart, final int readEnd) { - double result = INITIAL_CONDITION; - if (readStart > 0 && readEnd < readBases.length) { - result *= transition[readStart + 1][matchToMatch]; - } - return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; - } - - /** - * Likelihood assuming that there is a exact match between both sequences: read and haplotype - */ - private double calculateLocalLikelihoodsExactMatch(final int readStart, final int hapStart, final int hapSegmentLength, final int readSegmentLength) { - double result = INITIAL_CONDITION; - if (hapSegmentLength == 1) { - result *= prior[readStart + 1][hapStart + 1]; - } else { - for (int i = 0; i < readSegmentLength; i++) { - result *= prior[readStart + i + 1][hapStart + i + 1]; - if (i > 0) { - result *= transition[readStart + i + 1][matchToMatch]; - } - } - } - return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; - } - - /** - * Revert to a general pair-hmm solution. 
- */ - private double calculateLocalLikelihoodsGeneral(final int readStart, final int readEnd, final int hapStart, final int hapEnd) { - final Problem p = new Problem(readStart, readEnd, hapStart, hapEnd); - final Double cachedCost = cachedResults.get(p); - if (cachedCost != null) { - return cachedCost; - } - double cost = calculateLocalLikelihoodGeneral(p); - cachedResults.put(p, cost); - return cost; - } - - /** - * Resolve the regular full pair-hmm. - * - *

- * With the possibility of reuse the previous haplotype common prefix by using - * a startIndex which is greater than 0. - */ - private double calculateLikelihood(final int readStart, final int readEnd, final int startIndex, final boolean initializeEdges) { - final int edgeStart = initializeEdges ? 0 : startIndex + 1; - initializeMatrixValuesForTrailingProblem(readStart, readEnd, edgeStart); - updateTable(readStart + 1, readEnd + 1, startIndex + 1, haplotypeLength + 1); - if (readEnd == readBases.length) - return finalLikelihoodCalculation(readEnd,0,haplotypeLength + 1) - (readStart == 0 ? StrictMath.log10(haplotypeLength) : 0); - else { - final double divider = 3.0; - final double dividerInverted = 1.0 / divider; - return StrictMath.log10(matchMatrix[readEnd][haplotypeLength] - * transition[readEnd][matchToMatch] * dividerInverted + - insertionMatrix[readEnd][haplotypeLength] - * transition[readEnd][indelToMatch] * dividerInverted + - deletionMatrix[readEnd][haplotypeLength] - * transition[readEnd][indelToMatch] * dividerInverted) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider); - } - } - - - private double calculateLocalLikelihoodGeneral(final Problem p) { - - initializeMatrixValues(p,null); - int fromCol = p.hapStart + 1; - // if (previousProblem == null) { - // fromCol = p.hapStart + 1; - // } else { - // final int sharedPrefix = previousProblem.followerStartIndex(p); - // if (sharedPrefix >= 0) - // fromCol = sharedPrefix + 1; - // else - // fromCol = p.hapStart + 1; - // } - // previousProblem = p; - - updateTable(p.readStart + 1, p.readEnd + 1, - fromCol, p.hapEnd + 1); - - if (p.trailing) { - return finalLikelihoodCalculation(p.readEnd,p.hapStart,p.hapEnd + 1) - - (p.leading ? 
StrictMath.log10(p.hapEnd - p.hapStart) : 0); - } else { - final double divider = 3.0; - final double dividerInverted = 1.0 / divider; - return StrictMath.log10(matchMatrix[p.readEnd][p.hapEnd] - * transition[p.readEnd][matchToMatch] * dividerInverted + - insertionMatrix[p.readEnd][p.hapEnd] - * transition[p.readEnd][indelToMatch] * dividerInverted + - deletionMatrix[p.readEnd][p.hapEnd] - * transition[p.readEnd][indelToMatch] * dividerInverted) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider); - } - } - - private void updateTable(final int rowFrom, final int rowTo, - final int colFrom, final int colTo) { - - for (int i = rowFrom; i < rowTo; i++) { - for (int j = colFrom; j < colTo; j++) { - updateCell(i, j, prior[i][j], transition[i]); - } - } - - } - - /** - * Holds the properties of a pair-hmm computational problem. - */ - public class Problem { - private final byte[] haplotypeSegment; - private final int readStart; - private final int readEnd; - private final int hapStart; - private final int hapEnd; - private final int hashCode; - private final boolean trailing; - private final boolean leading; - - /** - * Construct a new project object. - * @param start inclusive start position on the read to consider. - * @param end exclusive after last position on the read to consider. - * @param hapStart inclusive start position on the haplotype to consider. - * @param hapEnd exclusive after last position on the haplotype to consider. 
- */ - public Problem(final int start, final int end, final int hapStart, - final int hapEnd) { - if (start < 0 || start > readBases.length) - throw new IllegalArgumentException("bad start index " + start); - if (end < start || end > readBases.length) - throw new IllegalArgumentException("bad end index " + end + " < " + start + " or " + end + " > " + readBases.length); - if (hapStart < 0 || hapStart > haplotypeLength) - throw new IllegalArgumentException("bad hap start index " - + hapStart + " is larger than the haplotypeLength " + haplotypeLength); - if (hapEnd < hapStart || hapEnd > haplotypeLength) - throw new IllegalArgumentException("bad hap end index " - + hapEnd + " outside [" + hapStart + "," - + haplotypeLength + "]"); - - haplotypeSegment = Arrays.copyOfRange(haplotypeBases, hapStart, hapEnd); - readStart = start; - readEnd = end; - this.hapStart = hapStart; - this.hapEnd = hapEnd; - trailing = readEnd == readBases.length; - leading = readStart == 0; - - hashCode = ((start * 31 + end) * 31 + Arrays.hashCode(haplotypeSegment) * 31); - } - - @Override - public int hashCode() { - return hashCode; - } - - @Override - public boolean equals(Object o) { - if (o == this) - return true; - else if (o == null) - return false; - else if (o.getClass() != this.getClass()) - return false; - else { - final Problem p = (Problem) o; - return (p.hashCode == this.hashCode) && (p.readStart == this.readStart) && (p.readEnd == this.readEnd) && Arrays.equals(haplotypeSegment, p.haplotypeSegment); - } - } - - - } - - /** - * Returns the currently loaded read base calls. - * @return {@code never null}. 
- */ - public byte[] getReadBases() { - if (readBases == null) - throw new IllegalStateException("no read was previously loaded."); - return readBases; - } - - -} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java deleted file mode 100644 index d3d003731..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java +++ /dev/null @@ -1,100 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.pairhmm; - -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -/** - * API for the fast (partial) HMM calculation engine. - */ -public interface FlexibleHMM { - - /** - * Load a read into the HMM calculation matrices. - * @param read the read record to load into the HMM calculating engine. - */ - public void loadRead(GATKSAMRecord read); - - /** - * Returns the current read bases. - * - * @return never null. 
- */ - public byte[] getReadBases(); - - /** - * Loads a haplotype bases in the HMM calculation matrices. - * @param haplotype the haplotype sequence. - * - * @throws IllegalStateException if no read has been previously loaded. - * @throws NullPointerException if {@code haplotype} is {@code null}. - */ - public void loadHaplotypeBases(byte[] haplotype); - - /** - * Resolve the partial Fast PairHMM for a section of the read and haplotype - * @param readFrom inclusive offset of the first position on the read. - * @param readTo exclusive offset of the last position on the read. - * @param haplotypeFrom inclusive offset of the first position on the haplotype. - * @param haplotypeTo exclusive offset of the last position on the haplotype. - * @param treatAsMatch can assume that both pieces are the same sequence. - * @return the cost the sub-HMM. - */ - public double calculateLocalLikelihood(int readFrom, int readTo, int haplotypeFrom, int haplotypeTo, boolean treatAsMatch); - - /** - * Load a read given its relevant information pieces by separate. - * @param bases read bases. - * @param bq base qualities. - * @param iq insertion qualities. - * @param dq deletion qualities. - * @param mq read mapping quality. - */ - public void loadRead(byte[] bases, byte[] bq, byte[] iq, byte[] dq, int mq); - - -} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java deleted file mode 100644 index 1feae2bfe..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java +++ /dev/null @@ -1,191 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.pairhmm; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.QualityUtils; - -/** - * Created with IntelliJ IDEA. - * User: rpoplin, carneiro - * Date: 10/16/12 - */ -public class LoglessPairHMM extends N2MemoryPairHMM { - protected static final double INITIAL_CONDITION = Math.pow(2, 1020); - protected static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION); - - // we divide e by 3 because the observed base could have come from any of the non-observed alleles - protected static final double TRISTATE_CORRECTION = 3.0; - - protected static final int matchToMatch = 0; - protected static final int indelToMatch = 1; - protected static final int matchToInsertion = 2; - protected static final int insertionToInsertion = 3; - protected static final int matchToDeletion = 4; - protected static final int deletionToDeletion = 5; - - - /** - * {@inheritDoc} - */ - @Override - public double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, - final byte[] readBases, - final byte[] readQuals, - final byte[] insertionGOP, - final byte[] deletionGOP, - final byte[] overallGCP, - final int hapStartIndex, - final boolean recacheReadValues, - 
final int nextHapStartIndex) { - - if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) { - final double initialValue = INITIAL_CONDITION / haplotypeBases.length; - // set the initial value (free deletions in the beginning) for the first row in the deletion matrix - for( int j = 0; j < paddedHaplotypeLength; j++ ) { - deletionMatrix[0][j] = initialValue; - } - } - - if ( ! constantsAreInitialized || recacheReadValues ) { - initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP); - - // note that we initialized the constants - constantsAreInitialized = true; - } - - initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); - - for (int i = 1; i < paddedReadLength; i++) { - // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based - for (int j = hapStartIndex+1; j < paddedHaplotypeLength; j++) { - updateCell(i, j, prior[i][j], transition[i]); - } - } - - // final probability is the log10 sum of the last element in the Match and Insertion state arrays - // this way we ignore all paths that ended in deletions! (huge) - // but we have to sum all the paths ending in the M and I matrices, because they're no longer extended. - final int endI = paddedReadLength - 1; - double finalSumProbabilities = 0.0; - for (int j = 1; j < paddedHaplotypeLength; j++) { - finalSumProbabilities += matchMatrix[endI][j] + insertionMatrix[endI][j]; - } - return Math.log10(finalSumProbabilities) - INITIAL_CONDITION_LOG10; - } - - /** - * Initializes the matrix that holds all the constants related to the editing - * distance between the read and the haplotype. 
- * - * @param haplotypeBases the bases of the haplotype - * @param readBases the bases of the read - * @param readQuals the base quality scores of the read - * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) - */ - protected void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { - - // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases - // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. - - for (int i = 0; i < readBases.length; i++) { - final byte x = readBases[i]; - final byte qual = readQuals[i]; - for (int j = startIndex; j < haplotypeBases.length; j++) { - final byte y = haplotypeBases[j]; - prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? - QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) ); - } - } - } - - /** - * Initializes the matrix that holds all the constants related to quality scores. 
- * - * @param insertionGOP insertion quality scores of the read - * @param deletionGOP deletion quality scores of the read - * @param overallGCP overall gap continuation penalty - */ - @Requires({ - "insertionGOP != null", - "deletionGOP != null", - "overallGCP != null" - }) - @Ensures("constantsAreInitialized") - protected static void initializeProbabilities(final double[][] transition, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { - for (int i = 0; i < insertionGOP.length; i++) { - final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); - transition[i+1][matchToMatch] = QualityUtils.qualToProb((byte) qualIndexGOP); - transition[i+1][indelToMatch] = QualityUtils.qualToProb(overallGCP[i]); - transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProb(insertionGOP[i]); - transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProb(overallGCP[i]); - transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]); - transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]); - } - } - - /** - * Updates a cell in the HMM matrix - * - * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the - * initial conditions - - * @param indI row index in the matrices to update - * @param indJ column index in the matrices to update - * @param prior the likelihood editing distance matrix for the read x haplotype - * @param transition an array with the six transition relevant to this location - */ - protected void updateCell( final int indI, final int indJ, final double prior, final double[] transition) { - - matchMatrix[indI][indJ] = prior * ( matchMatrix[indI - 1][indJ - 1] * transition[matchToMatch] + - insertionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] + - deletionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] ); - insertionMatrix[indI][indJ] = matchMatrix[indI - 1][indJ] * 
transition[matchToInsertion] + insertionMatrix[indI - 1][indJ] * transition[insertionToInsertion]; - deletionMatrix[indI][indJ] = matchMatrix[indI][indJ - 1] * transition[matchToDeletion] + deletionMatrix[indI][indJ - 1] * transition[deletionToDeletion]; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java deleted file mode 100644 index 6cbbbd089..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java +++ /dev/null @@ -1,158 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.recalibration; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.LRUCache; - -/** - * The object temporarily held by a read that describes all of it's covariates. 
- * - * In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap - * - * @author Mauricio Carneiro - * @since 2/8/12 - */ -public class ReadCovariates { - private final static Logger logger = Logger.getLogger(ReadCovariates.class); - - /** - * How big should we let the LRU cache grow - */ - private static final int LRU_CACHE_SIZE = 500; - - /** - * Use an LRU cache to keep cache of keys (int[][][]) arrays for each read length we've seen. - * The cache allows us to avoid the expense of recreating these arrays for every read. The LRU - * keeps the total number of cached arrays to less than LRU_CACHE_SIZE. - * - * This is a thread local variable, so the total memory required may grow to N_THREADS x LRU_CACHE_SIZE - */ - private final static ThreadLocal> keysCache = new ThreadLocal>() { - @Override protected LRUCache initialValue() { - return new LRUCache(LRU_CACHE_SIZE); - } - }; - - /** - * Our keys, indexed by event type x read length x covariate - */ - private final int[][][] keys; - - /** - * The index of the current covariate, used by addCovariate - */ - private int currentCovariateIndex = 0; - - public ReadCovariates(final int readLength, final int numberOfCovariates) { - final LRUCache cache = keysCache.get(); - final int[][][] cachedKeys = cache.get(readLength); - if ( cachedKeys == null ) { - // There's no cached value for read length so we need to create a new int[][][] array - if ( logger.isDebugEnabled() ) logger.debug("Keys cache miss for length " + readLength + " cache size " + cache.size()); - keys = new int[EventType.values().length][readLength][numberOfCovariates]; - cache.put(readLength, keys); - } else { - keys = cachedKeys; - } - } - - public void setCovariateIndex(final int index) { - currentCovariateIndex = index; - } - - /** - * Update the keys for mismatch, insertion, and deletion for the current covariate at read offset - * - * @param mismatch the mismatch key value 
- * @param insertion the insertion key value - * @param deletion the deletion key value - * @param readOffset the read offset, must be >= 0 and <= the read length used to create this ReadCovariates - */ - public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { - keys[EventType.BASE_SUBSTITUTION.ordinal()][readOffset][currentCovariateIndex] = mismatch; - keys[EventType.BASE_INSERTION.ordinal()][readOffset][currentCovariateIndex] = insertion; - keys[EventType.BASE_DELETION.ordinal()][readOffset][currentCovariateIndex] = deletion; - } - - /** - * Get the keys for all covariates at read position for error model - * - * @param readPosition - * @param errorModel - * @return - */ - public int[] getKeySet(final int readPosition, final EventType errorModel) { - return keys[errorModel.ordinal()][readPosition]; - } - - public int[][] getKeySet(final EventType errorModel) { - return keys[errorModel.ordinal()]; - } - - // ---------------------------------------------------------------------- - // - // routines for testing - // - // ---------------------------------------------------------------------- - - protected int[][] getMismatchesKeySet() { return getKeySet(EventType.BASE_SUBSTITUTION); } - protected int[][] getInsertionsKeySet() { return getKeySet(EventType.BASE_INSERTION); } - protected int[][] getDeletionsKeySet() { return getKeySet(EventType.BASE_DELETION); } - - protected int[] getMismatchesKeySet(final int readPosition) { - return getKeySet(readPosition, EventType.BASE_SUBSTITUTION); - } - - protected int[] getInsertionsKeySet(final int readPosition) { - return getKeySet(readPosition, EventType.BASE_INSERTION); - } - - protected int[] getDeletionsKeySet(final int readPosition) { - return getKeySet(readPosition, EventType.BASE_DELETION); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java 
b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java deleted file mode 100644 index 56f7e8257..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ /dev/null @@ -1,1082 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. 
ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.recalibration; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.SAMFileHeader; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.report.GATKReport; -import org.broadinstitute.sting.gatk.report.GATKReportTable; -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.compression.reducereads.ReduceReads; -import org.broadinstitute.sting.utils.classloader.JVMUtils; -import org.broadinstitute.sting.utils.recalibration.covariates.*; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.R.RScriptExecutor; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; -import 
org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.io.Resource; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -import java.io.*; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 6, 2009 - * - * This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions. - * It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias. - * This class holds the parsing methods that are shared between BaseRecalibrator and PrintReads. - */ - -public class RecalUtils { - public final static String ARGUMENT_REPORT_TABLE_TITLE = "Arguments"; - public final static String QUANTIZED_REPORT_TABLE_TITLE = "Quantized"; - public final static String READGROUP_REPORT_TABLE_TITLE = "RecalTable0"; - public final static String QUALITY_SCORE_REPORT_TABLE_TITLE = "RecalTable1"; - public final static String ALL_COVARIATES_REPORT_TABLE_TITLE = "RecalTable2"; - - public final static String ARGUMENT_COLUMN_NAME = "Argument"; - public final static String ARGUMENT_VALUE_COLUMN_NAME = "Value"; - public final static String QUANTIZED_VALUE_COLUMN_NAME = "QuantizedScore"; - public static final String QUANTIZED_COUNT_COLUMN_NAME = "Count"; - public final static String READGROUP_COLUMN_NAME = "ReadGroup"; - public final static String EVENT_TYPE_COLUMN_NAME = "EventType"; - public final static String EMPIRICAL_QUALITY_COLUMN_NAME = "EmpiricalQuality"; - public final static String ESTIMATED_Q_REPORTED_COLUMN_NAME = "EstimatedQReported"; - public final static String QUALITY_SCORE_COLUMN_NAME = "QualityScore"; - public final static String COVARIATE_VALUE_COLUMN_NAME = 
"CovariateValue"; - public final static String COVARIATE_NAME_COLUMN_NAME = "CovariateName"; - public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations"; - public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors"; - - private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams - private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color - private static boolean warnUserNullPlatform = false; - - private static final String SCRIPT_FILE = "BQSR.R"; - - private static final Pair covariateValue = new Pair(RecalUtils.COVARIATE_VALUE_COLUMN_NAME, "%s"); - private static final Pair covariateName = new Pair(RecalUtils.COVARIATE_NAME_COLUMN_NAME, "%s"); - private static final Pair eventType = new Pair(RecalUtils.EVENT_TYPE_COLUMN_NAME, "%s"); - private static final Pair empiricalQuality = new Pair(RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f"); - private static final Pair estimatedQReported = new Pair(RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f"); - private static final Pair nObservations = new Pair(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); - private static final Pair nErrors = new Pair(RecalUtils.NUMBER_ERRORS_COLUMN_NAME, "%.2f"); - - /** - * Generates two lists : required covariates and optional covariates based on the user's requests. - * - * Performs the following tasks in order: - * 1. Adds all requierd covariates in order - * 2. Check if the user asked to use the standard covariates and adds them all if that's the case - * 3. 
Adds all covariates requested by the user that were not already added by the two previous steps - * - * @param argumentCollection the argument collection object for the recalibration walker - * @return a pair of ordered lists : required covariates (first) and optional covariates (second) - */ - public static Pair, ArrayList> initializeCovariates(RecalibrationArgumentCollection argumentCollection) { - final List> covariateClasses = new PluginManager(Covariate.class).getPlugins(); - final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins(); - final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins(); - - final ArrayList requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates - ArrayList optionalCovariates = new ArrayList(); - if (!argumentCollection.DO_NOT_USE_STANDARD_COVARIATES) - optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user - - // parse the -cov arguments that were provided, skipping over the ones already specified - if (argumentCollection.COVARIATES != null) { - for (String requestedCovariateString : argumentCollection.COVARIATES) { - // help the transition from BQSR v1 to BQSR v2 - if ( requestedCovariateString.equals("DinucCovariate") ) - throw new UserException.CommandLineException("DinucCovariate has been retired. 
Please use its successor covariate " + - "ContextCovariate instead, which includes the 2 bp (dinuc) substitution model of the retired DinucCovariate " + - "as well as an indel context to model the indel error rates"); - - boolean foundClass = false; - for (Class covClass : covariateClasses) { - if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class - foundClass = true; - if (!requiredClasses.contains(covClass) && - (argumentCollection.DO_NOT_USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) { - try { - final Covariate covariate = covClass.newInstance(); // now that we've found a matching class, try to instantiate it - optionalCovariates.add(covariate); - } catch (Exception e) { - throw new DynamicClassResolutionException(covClass, e); - } - } - } - } - - if (!foundClass) { - throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates."); - } - } - } - return new Pair, ArrayList>(requiredCovariates, optionalCovariates); - } - - /** - * Adds the required covariates to a covariate list - * - * Note: this method really only checks if the classes object has the expected number of required covariates, then add them by hand. - * - * @param classes list of classes to add to the covariate list - * @return the covariate list - */ - private static ArrayList addRequiredCovariatesToList(List> classes) { - ArrayList dest = new ArrayList(classes.size()); - if (classes.size() != 2) - throw new ReviewedStingException("The number of required covariates has changed, this is a hard change in the code and needs to be inspected"); - - dest.add(new ReadGroupCovariate()); // enforce the order with RG first and QS next. 
- dest.add(new QualityScoreCovariate()); - return dest; - } - - /** - * Adds the standard covariates to a covariate list - * - * @param classes list of classes to add to the covariate list - * @return the covariate list - */ - private static ArrayList addStandardCovariatesToList(List> classes) { - ArrayList dest = new ArrayList(classes.size()); - for (Class covClass : classes) { - try { - final Covariate covariate = (Covariate) covClass.newInstance(); - dest.add(covariate); - } catch (Exception e) { - throw new DynamicClassResolutionException(covClass, e); - } - } - return dest; - } - - /** - * Print a list of all available covariates to logger as info - * - * @param logger - */ - public static void listAvailableCovariates(final Logger logger) { - logger.info("Available covariates:"); - for (final Class covClass : new PluginManager(Covariate.class).getPlugins()) { - logger.info(String.format("\t%30s\t%s", covClass.getSimpleName(), JVMUtils.classInterfaces(covClass))); - } - } - - /** - * Component used to print out csv representation of the reports that can be use to perform analysis in - * external tools. E.g. generate plots using R scripts. - *

- * A header is always printed into the output stream (or file) when the printer is created. Then you only need - * to call {@link #print(RecalibrationReport,String) print} for each report you want to include in the csv file. - * Once finished, you close the printer calling {@link #close() close} - * - */ - private static class CsvPrinter { - - private final PrintStream ps; - private final Covariate[] covariates; - - /** - * Constructs a printer redirected to an output file. - * @param out the output file. - * @param c covariates to print out. - * @throws FileNotFoundException if the file could not be created anew. - */ - protected CsvPrinter(final File out, final Covariate ... c) - throws FileNotFoundException { - this(new FileOutputStream(out), c); - } - - /** - * Constructs a printer redirected to an output stream - * @param os the output. - * @param c covariates to print out. - */ - protected CsvPrinter(final OutputStream os, final Covariate ... c) { - covariates = c == null ? new Covariate[0] : c.clone(); - ps = new PrintStream(os); - printHeader(); - } - - /** - * Prints the header out. - *

- * Should only be invoked at creation. - */ - protected void printHeader() { - RecalUtils.printHeader(ps); - } - - /** - * Prints out a report into the csv file. - * - * - * @param report the report to print out. - * @param mode the report associated mode. (typically ORIGINAL, RECALIBRATED - */ - public void print(final RecalibrationReport report, final String mode) { - RecalUtils.writeCSV(ps,report.getRecalibrationTables(),mode,covariates,false); - } - - /** - * Close the csv printer. - * - * No further output will be allowed or take place after calling this method. - */ - public void close() { - ps.close(); - } - - } - - /** - * Returns a csv output printer. - * - * @param out the output file. It will be overridden - * @param c list of covariates to print out. - * - * @throws FileNotFoundException if out could not be created anew. - * - * @return never null - */ - protected static CsvPrinter csvPrinter(final File out, final Covariate ... c) - throws FileNotFoundException - { - if (c == null) { - throw new IllegalArgumentException("the input covariate array cannot be null"); - } - return new CsvPrinter(out,c); - } - - /** - * Prints out a collection of reports into a file in Csv format in a way - * that can be used by R scripts (such as the plot generator script). - *

- * The set of covariates is take as the minimum common set from all reports. - * - * @param out the output file. It will be overridden. - * @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...) - * of each report and the corresponding value the report itself. - * @throws FileNotFoundException if out could not be created anew. - */ - public static void generateCsv(final File out, final Map reports) - throws FileNotFoundException { - if (reports.size() == 0) { - writeCsv(out, reports, new Covariate[0]); - } else { - final Iterator rit = reports.values().iterator(); - final RecalibrationReport first = rit.next(); - final Covariate[] firstCovariates = first.getRequestedCovariates(); - final Set covariates = new LinkedHashSet<>(); - Utils.addAll(covariates,firstCovariates); - while (rit.hasNext() && covariates.size() > 0) { - final Covariate[] nextCovariates = rit.next().getRequestedCovariates(); - final Set nextCovariateNames = new LinkedHashSet(nextCovariates.length); - for (final Covariate nc : nextCovariates) { - nextCovariateNames.add(nc.getClass().getSimpleName()); - } - final Iterator cit = covariates.iterator(); - while (cit.hasNext()) { - if (!nextCovariateNames.contains(cit.next().getClass().getSimpleName())) { - cit.remove(); - } - } - } - writeCsv(out, reports, covariates.toArray(new Covariate[covariates.size()])); - } - } - - /** - * Print out a collection of reports into a file in Csv format in a way - * that can be used by R scripts (such as the plot generator script). - * - * @param out - * @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...) - * of each report and the corresponding value the report itself. - * @param c the covariates to print out. - * @throws FileNotFoundException if out could not be created anew. 
- */ - private static void writeCsv(final File out, - final Map reports, final Covariate[] c) - throws FileNotFoundException { - final CsvPrinter p = csvPrinter(out,c); - for (Map.Entry e : reports.entrySet()) { - p.print(e.getValue(),e.getKey()); - } - p.close(); - } - - public enum SOLID_RECAL_MODE { - /** - * Treat reference inserted bases as reference matching bases. Very unsafe! - */ - DO_NOTHING, - /** - * Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. - */ - SET_Q_ZERO, - /** - * In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. - */ - SET_Q_ZERO_BASE_N, - /** - * Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. - */ - REMOVE_REF_BIAS; - - public static SOLID_RECAL_MODE recalModeFromString(String recalMode) { - if (recalMode.equals("DO_NOTHING")) - return SOLID_RECAL_MODE.DO_NOTHING; - if (recalMode.equals("SET_Q_ZERO")) - return SOLID_RECAL_MODE.SET_Q_ZERO; - if (recalMode.equals("SET_Q_ZERO_BASE_N")) - return SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N; - if (recalMode.equals("REMOVE_REF_BIAS")) - return SOLID_RECAL_MODE.REMOVE_REF_BIAS; - - throw new UserException.BadArgumentValue(recalMode, "is not a valid SOLID_RECAL_MODE value"); - } - } - - public enum SOLID_NOCALL_STRATEGY { - /** - * When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. - */ - THROW_EXCEPTION, - /** - * Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. - */ - LEAVE_READ_UNRECALIBRATED, - /** - * Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. 
- */ - PURGE_READ; - - public static SOLID_NOCALL_STRATEGY nocallStrategyFromString(String nocallStrategy) { - if (nocallStrategy.equals("THROW_EXCEPTION")) - return SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; - if (nocallStrategy.equals("LEAVE_READ_UNRECALIBRATED")) - return SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED; - if (nocallStrategy.equals("PURGE_READ")) - return SOLID_NOCALL_STRATEGY.PURGE_READ; - - throw new UserException.BadArgumentValue(nocallStrategy, "is not a valid SOLID_NOCALL_STRATEGY value"); - } - } - - private static List generateReportTables(final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) { - List result = new LinkedList(); - int reportTableIndex = 0; - int rowIndex = 0; - final Map covariateNameMap = new HashMap(requestedCovariates.length); - for (final Covariate covariate : requestedCovariates) - covariateNameMap.put(covariate, parseCovariateName(covariate)); - - for (int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++) { - - final ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names - columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[0]), "%s")); // save the required covariate name so we can reference it in the future - if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) { - columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[1]), "%s")); // save the required covariate name so we can reference it in the future - if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { - columnNames.add(covariateValue); - columnNames.add(covariateName); - } - } - - columnNames.add(eventType); // the order of these column names is important here - columnNames.add(empiricalQuality); - if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) - columnNames.add(estimatedQReported); // only the read group table needs the estimated Q 
reported - columnNames.add(nObservations); - columnNames.add(nErrors); - - final GATKReportTable reportTable; - if (tableIndex <= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { - if(sortByCols) { - reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.SORT_BY_COLUMN); - } else { - reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.DO_NOT_SORT); - } - for (final Pair columnName : columnNames) - reportTable.addColumn(columnName.getFirst(), columnName.getSecond()); - rowIndex = 0; // reset the row index since we're starting with a new table - } else { - reportTable = result.get(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()); - } - - final NestedIntegerArray table = recalibrationTables.getTable(tableIndex); - for (final NestedIntegerArray.Leaf row : table.getAllLeaves()) { - final RecalDatum datum = (RecalDatum)row.value; - final int[] keys = row.keys; - - int columnIndex = 0; - int keyIndex = 0; - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), requestedCovariates[0].formatKey(keys[keyIndex++])); - if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) { - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), requestedCovariates[1].formatKey(keys[keyIndex++])); - if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { - final Covariate covariate = requestedCovariates[tableIndex]; - - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), covariate.formatKey(keys[keyIndex++])); - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), covariateNameMap.get(covariate)); - } - } - - final EventType event = EventType.eventFrom(keys[keyIndex]); - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), event.toString()); - - 
reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEmpiricalQuality()); - if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getNumObservations()); - reportTable.set(rowIndex, columnNames.get(columnIndex).getFirst(), datum.getNumMismatches()); - - rowIndex++; - } - result.add(reportTable); - } - - return result; - } - - private static String parseCovariateName(final Covariate covariate) { - return covariate.getClass().getSimpleName().split("Covariate")[0]; - } - - public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) { - outputRecalibrationReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols), RAC.RECAL_TABLE); - } - - /** - * Return a human-readable string representing the used covariates - * - * @param requestedCovariates a vector of covariates - * @return a non-null comma-separated string - */ - public static String covariateNames(final Covariate[] requestedCovariates) { - final List names = new ArrayList(requestedCovariates.length); - for ( final Covariate cov : requestedCovariates ) - names.add(cov.getClass().getSimpleName()); - return Utils.join(",", names); - } - - public static void outputRecalibrationReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile, boolean sortByCols) { - 
outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols), outputFile); - } - - private static void outputRecalibrationReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List recalTables, final PrintStream outputFile) { - final GATKReport report = new GATKReport(); - report.addTable(argumentTable); - report.addTable(quantizationTable); - report.addTables(recalTables); - report.print(outputFile); - } - - /** s - * Write recalibration plots into a file - * - * @param csvFile location of the intermediary file - * @param exampleReportFile where the report arguments are collected from. - * @param output result plot file name. - */ - public static void generatePlots(final File csvFile, final File exampleReportFile, final File output) { - final RScriptExecutor executor = new RScriptExecutor(); - executor.setExceptOnError(true); - executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); - executor.addArgs(csvFile.getAbsolutePath()); - executor.addArgs(exampleReportFile.getAbsolutePath()); - executor.addArgs(output.getAbsolutePath()); - Logger.getLogger(RecalUtils.class).debug("R command line: " + executor.getApproximateCommandLine()); - executor.exec(); - } - - private static void outputRecalibrationPlot(final File csvFile, final RecalibrationArgumentCollection RAC) { - - final RScriptExecutor executor = new RScriptExecutor(); - executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); - executor.addArgs(csvFile.getAbsolutePath()); - executor.addArgs(RAC.RECAL_TABLE_FILE.getAbsolutePath()); - executor.exec(); - } - - /** - * Please use {@link #generateCsv(java.io.File, java.util.Map)} and {@link #generatePlots(java.io.File, java.io.File, java.io.File)} instead. 
- * - * @deprecated - */ - @Deprecated - public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final Covariate[] requestedCovariates) { - generateRecalibrationPlot(RAC, original, null, requestedCovariates); - } - - /** - * Please use {@link #generateCsv(java.io.File, java.util.Map)} and {@link #generatePlots(java.io.File, java.io.File, java.io.File)} instead. - * - * @deprecated - */ - @Deprecated - public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates) { - final PrintStream csvStream; - final File csvTempFile = null; - try { - File csvTmpFile = File.createTempFile("BQSR",".csv"); - csvTmpFile.deleteOnExit(); - csvStream = new PrintStream(csvTmpFile); - } catch (IOException e) { - throw new UserException("Could not create temporary csv file", e); - } - - if ( recalibrated != null ) - writeCSV(csvStream, recalibrated, "RECALIBRATED", requestedCovariates, true); - writeCSV(csvStream, original, "ORIGINAL", requestedCovariates, recalibrated == null); - csvStream.close(); - outputRecalibrationPlot(csvTempFile, RAC); - csvTempFile.delete(); - } - - private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) { - - final NestedIntegerArray deltaTable = createDeltaTable(recalibrationTables, requestedCovariates.length); - - // add the quality score table to the delta table - final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); - for (final NestedIntegerArray.Leaf leaf : qualTable.getAllLeaves()) { // go through every element in the covariates table to create the delta table - final int[] newCovs = new int[4]; - newCovs[0] = leaf.keys[0]; - newCovs[1] = requestedCovariates.length; // 
replace the covariate name with an arbitrary (unused) index for QualityScore - newCovs[2] = leaf.keys[1]; - newCovs[3] = leaf.keys[2]; - addToDeltaTable(deltaTable, newCovs, (RecalDatum)leaf.value); // add this covariate to the delta table - } - - // add the optional covariates to the delta table - for (int i = RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < requestedCovariates.length; i++) { - final NestedIntegerArray covTable = recalibrationTables.getTable(i); - for (final NestedIntegerArray.Leaf leaf : covTable.getAllLeaves()) { - final int[] covs = new int[4]; - covs[0] = leaf.keys[0]; - covs[1] = i; // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) - covs[2] = leaf.keys[2]; - covs[3] = leaf.keys[3]; - addToDeltaTable(deltaTable, covs, (RecalDatum) leaf.value); // add this covariate to the delta table - } - } - - // output the csv file - if (printHeader) { - printHeader(deltaTableFile); - } - - final Map covariateNameMap = new HashMap(requestedCovariates.length); - for (final Covariate covariate : requestedCovariates) - covariateNameMap.put(covariate, parseCovariateName(covariate)); - - // print each data line - for (final NestedIntegerArray.Leaf leaf : deltaTable.getAllLeaves()) { - final List deltaKeys = generateValuesFromKeys(leaf.keys, requestedCovariates, covariateNameMap); - final RecalDatum deltaDatum = leaf.value; - deltaTableFile.print(Utils.join(",", deltaKeys)); - deltaTableFile.print("," + deltaDatum.stringForCSV()); - deltaTableFile.println("," + recalibrationMode); - } - } - - private static void printHeader(PrintStream out) { - final List header = new LinkedList(); - header.add("ReadGroup"); - header.add("CovariateValue"); - header.add("CovariateName"); - header.add("EventType"); - header.add("Observations"); - header.add("Errors"); - header.add("EmpiricalQuality"); - header.add("AverageReportedQuality"); - header.add("Accuracy"); - header.add("Recalibration"); - 
out.println(Utils.join(",", header)); - } - - /* - * Return an initialized nested integer array with appropriate dimensions for use with the delta tables - * - * @param recalibrationTables the recal tables - * @param numCovariates the total number of covariates being used - * @return a non-null nested integer array - */ - @Requires("recalibrationTables != null && numCovariates > 0") - @Ensures("result != null") - private static NestedIntegerArray createDeltaTable(final RecalibrationTables recalibrationTables, final int numCovariates) { - - final int[] dimensionsForDeltaTable = new int[4]; - - // initialize the dimensions with those of the qual table to start with - final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); - final int[] dimensionsOfQualTable = qualTable.getDimensions(); - dimensionsForDeltaTable[0] = dimensionsOfQualTable[0]; // num read groups - dimensionsForDeltaTable[1] = numCovariates + 1; // num covariates - dimensionsForDeltaTable[2] = dimensionsOfQualTable[1]; - dimensionsForDeltaTable[3] = dimensionsOfQualTable[2]; - - // now, update the dimensions based on the optional covariate tables as needed - for ( int i = RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < numCovariates; i++ ) { - final NestedIntegerArray covTable = recalibrationTables.getTable(i); - final int[] dimensionsOfCovTable = covTable.getDimensions(); - dimensionsForDeltaTable[2] = Math.max(dimensionsForDeltaTable[2], dimensionsOfCovTable[2]); - dimensionsForDeltaTable[3] = Math.max(dimensionsForDeltaTable[3], dimensionsOfCovTable[3]); - } - - return new NestedIntegerArray(dimensionsForDeltaTable); - } - - protected static List generateValuesFromKeys(final int[] keys, final Covariate[] covariates, final Map covariateNameMap) { - final List values = new ArrayList(4); - values.add(covariates[RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()].formatKey(keys[0])); - - final int covariateIndex = keys[1]; - final int 
covariateKey = keys[2]; - final Covariate covariate = covariateIndex == covariates.length ? covariates[RecalibrationTables.TableType.QUALITY_SCORE_TABLE.ordinal()] : covariates[covariateIndex]; - values.add(covariate.formatKey(covariateKey)); - values.add(covariateNameMap.get(covariate)); - values.add(EventType.eventFrom(keys[3]).prettyPrint()); - - return values; - } - - /** - * Updates the current RecalDatum element in the delta table. - * - * If it doesn't have an element yet, it creates an RecalDatum element and adds it to the delta table. - * - * @param deltaTable the delta table - * @param deltaKey the key to the table - * @param recalDatum the recal datum to combine with the accuracyDatum element in the table - */ - private static void addToDeltaTable(final NestedIntegerArray deltaTable, final int[] deltaKey, final RecalDatum recalDatum) { - final RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key - if (deltaDatum == null) - // if we don't have a key yet, create a new one with the same values as the current datum - deltaTable.put(new RecalDatum(recalDatum), deltaKey); - else - // if we do have a datum, combine it with this one - deltaDatum.combine(recalDatum); - } - - /** - * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string - * - * @param read The read to adjust - * @param RAC The list of shared command line arguments - */ - public static void parsePlatformForRead(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) { - GATKSAMReadGroupRecord readGroup = read.getReadGroup(); - - if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { - readGroup.setPlatform(RAC.FORCE_PLATFORM); - } - - if (readGroup.getPlatform() == null) { - if (RAC.DEFAULT_PLATFORM != null) { - if (!warnUserNullPlatform) { - 
Utils.warnUser("The input .bam file contains reads with no platform information. " + - "Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " + - "First observed at read with name = " + read.getReadName()); - warnUserNullPlatform = true; - } - readGroup.setPlatform(RAC.DEFAULT_PLATFORM); - } - else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName()); - } - } - } - - /** - * Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are - * inconsistent with the color space. If there is a no call in the color space, this method returns false meaning - * this read should be skipped - * - * @param strategy the strategy used for SOLID no calls - * @param read The SAMRecord to parse - * @return true if this read is consistent or false if this read should be skipped - */ - public static boolean isColorSpaceConsistent(final SOLID_NOCALL_STRATEGY strategy, final GATKSAMRecord read) { - if (!ReadUtils.isSOLiDRead(read)) // If this is a SOLID read then we have to check if the color space is inconsistent. 
This is our only sign that SOLID has inserted the reference base - return true; - - // Haven't calculated the inconsistency array yet for this read - if (read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG) == null) { - final Object attr = read.getAttribute(RecalUtils.COLOR_SPACE_ATTRIBUTE_TAG); - if (attr != null) { - byte[] colorSpace; - if (attr instanceof String) - colorSpace = ((String) attr).getBytes(); - else - throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalUtils.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); - - final boolean badColor = hasNoCallInColorSpace(colorSpace); - if (badColor) { - if (strategy == SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) { - return false; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them - } - else if (strategy == SOLID_NOCALL_STRATEGY.PURGE_READ) { - read.setReadFailsVendorQualityCheckFlag(true); - return false; - } - } - - byte[] readBases = read.getReadBases(); // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read - if (read.getReadNegativeStrandFlag()) - readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); - - final byte[] inconsistency = new byte[readBases.length]; - int i; - byte prevBase = colorSpace[0]; // The sentinel - for (i = 0; i < readBases.length; i++) { - final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[i + 1]); - inconsistency[i] = (byte) (thisBase == readBases[i] ? 0 : 1); - prevBase = readBases[i]; - } - read.setAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency); - } - else if (strategy == SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) // if the strategy calls for an exception, throw it - throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. 
First observed at read with name = " + read.getReadName() + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); - - else - return false; // otherwise, just skip the read - } - - return true; - } - - private static boolean hasNoCallInColorSpace(final byte[] colorSpace) { - final int length = colorSpace.length; - for (int i = 1; i < length; i++) { // skip the sentinal - final byte color = colorSpace[i]; - if (color != (byte) '0' && color != (byte) '1' && color != (byte) '2' && color != (byte) '3') { - return true; // There is a bad color in this SOLiD read - } - } - - return false; // There aren't any color no calls in this SOLiD read - } - - /** - * Given the base and the color calculate the next base in the sequence - * - * @param read the read - * @param prevBase The base - * @param color The color - * @return The next base in the sequence - */ - private static byte getNextBaseFromColor(GATKSAMRecord read, final byte prevBase, final byte color) { - switch (color) { - case '0': - return prevBase; - case '1': - return performColorOne(prevBase); - case '2': - return performColorTwo(prevBase); - case '3': - return performColorThree(prevBase); - default: - throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char) color + - " Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias."); - } - } - - /** - * Check if this base is inconsistent with its color space. 
If it is then SOLID inserted the reference here and we should reduce the quality - * - * @param read The read which contains the color space to check against - * @param offset The offset in the read at which to check - * @return Returns true if the base was inconsistent with the color space - */ - public static boolean isColorSpaceConsistent(final GATKSAMRecord read, final int offset) { - final Object attr = read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG); - if (attr != null) { - final byte[] inconsistency = (byte[]) attr; - // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference! - if (read.getReadNegativeStrandFlag()) { // Negative direction - return inconsistency[inconsistency.length - offset - 1] == (byte) 0; - } - else { // Forward direction - return inconsistency[offset] == (byte) 0; - } - - // This block of code is for if you want to check both the offset and the next base for color space inconsistency - //if( read.getReadNegativeStrandFlag() ) { // Negative direction - // if( offset == 0 ) { - // return inconsistency[0] != 0; - // } else { - // return (inconsistency[inconsistency.length - offset - 1] != 0) || (inconsistency[inconsistency.length - offset] != 0); - // } - //} else { // Forward direction - // if( offset == inconsistency.length - 1 ) { - // return inconsistency[inconsistency.length - 1] != 0; - // } else { - // return (inconsistency[offset] != 0) || (inconsistency[offset + 1] != 0); - // } - //} - - } - else { // No inconsistency array, so nothing is inconsistent - return true; - } - } - - /** - * Computes all requested covariates for every offset in the given read - * by calling covariate.getValues(..). - * - * It populates an array of covariate values where result[i][j] is the covariate - * value for the ith position in the read and the jth covariate in - * reqeustedCovariates list. - * - * @param read The read for which to compute covariate values. 
- * @param requestedCovariates The list of requested covariates. - * @return a matrix with all the covariates calculated for every base in the read - */ - public static ReadCovariates computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates) { - final ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), requestedCovariates.length); - computeCovariates(read, requestedCovariates, readCovariates); - return readCovariates; - } - - /** - * Computes all requested covariates for every offset in the given read - * by calling covariate.getValues(..). - * - * It populates an array of covariate values where result[i][j] is the covariate - * value for the ith position in the read and the jth covariate in - * reqeustedCovariates list. - * - * @param read The read for which to compute covariate values. - * @param requestedCovariates The list of requested covariates. - * @param resultsStorage The object to store the covariate values - */ - public static void computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates, final ReadCovariates resultsStorage) { - // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read - for (int i = 0; i < requestedCovariates.length; i++) { - resultsStorage.setCovariateIndex(i); - requestedCovariates[i].recordValues(read, resultsStorage); - } - } - - /** - * Perform a certain transversion (A <-> C or G <-> T) on the base. - * - * @param base the base [AaCcGgTt] - * @return the transversion of the base, or the input base if it's not one of the understood ones - */ - private static byte performColorOne(byte base) { - switch (base) { - case 'A': - case 'a': - return 'C'; - case 'C': - case 'c': - return 'A'; - case 'G': - case 'g': - return 'T'; - case 'T': - case 't': - return 'G'; - default: - return base; - } - } - - /** - * Perform a transition (A <-> G or C <-> T) on the base. 
- * - * @param base the base [AaCcGgTt] - * @return the transition of the base, or the input base if it's not one of the understood ones - */ - private static byte performColorTwo(byte base) { - switch (base) { - case 'A': - case 'a': - return 'G'; - case 'C': - case 'c': - return 'T'; - case 'G': - case 'g': - return 'A'; - case 'T': - case 't': - return 'C'; - default: - return base; - } - } - - /** - * Return the complement (A <-> T or C <-> G) of a base. - * - * @param base the base [AaCcGgTt] - * @return the complementary base, or the input base if it's not one of the understood ones - */ - private static byte performColorThree(byte base) { - switch (base) { - case 'A': - case 'a': - return 'T'; - case 'C': - case 'c': - return 'G'; - case 'G': - case 'g': - return 'C'; - case 'T': - case 't': - return 'A'; - default: - return base; - } - } - - /** - * Combines the recalibration data for table1 and table2 into table1 - * - * Note that table1 is the destination, so it is modified - * - * @param table1 the destination table to merge table2 into - * @param table2 the source table to merge into table1 - */ - public static void combineTables(final NestedIntegerArray table1, final NestedIntegerArray table2) { - if ( table1 == null ) throw new IllegalArgumentException("table1 cannot be null"); - if ( table2 == null ) throw new IllegalArgumentException("table2 cannot be null"); - if ( ! Arrays.equals(table1.getDimensions(), table2.getDimensions())) - throw new IllegalArgumentException("Table1 " + Utils.join(",", table1.getDimensions()) + " not equal to " + Utils.join(",", table2.getDimensions())); - - for (final NestedIntegerArray.Leaf row : table2.getAllLeaves()) { - final RecalDatum myDatum = table1.get(row.keys); - - if (myDatum == null) - table1.put(row.value, row.keys); - else - myDatum.combine(row.value); - } - } - - /** - * Increments the RecalDatum at the specified position in the specified table, or put a new item there - * if there isn't already one. 
- * - * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() - * to return false if another thread inserts a new item at our position in the middle of our put operation. - * - * @param table the table that holds/will hold our item - * @param qual qual for this event - * @param isError error value for this event - * @param keys location in table of our item - */ - public static void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, - final byte qual, - final double isError, - final int... keys ) { - final RecalDatum existingDatum = table.get(keys); - - if ( existingDatum == null ) { - // No existing item, try to put a new one - if ( ! table.put(createDatumObject(qual, isError), keys) ) { - // Failed to put a new item because another thread came along and put an item here first. - // Get the newly-put item and increment it (item is guaranteed to exist at this point) - table.get(keys).increment(1L, isError); - } - } - else { - // Easy case: already an item here, so increment it - existingDatum.increment(1L, isError); - } - } - - /** - * creates a datum object with one observation and one or zero error - * - * @param reportedQual the quality score reported by the instrument for this base - * @param isError whether or not the observation is an error - * @return a new RecalDatum object with the observation and the error - */ - private static RecalDatum createDatumObject(final byte reportedQual, final double isError) { - return new RecalDatum(1, isError, reportedQual); - } - - /** - * Checks for invalid BAMs that are being used with BQSR and fails with a UserException if it finds one - * - * @param headers sam file headers being passed into the GATK engine - * @param allowBqsrOnReducedBams should we allow BQSR on reduced bams? 
- */ - public static void checkForInvalidRecalBams(final List headers, final boolean allowBqsrOnReducedBams) { - // for now, the only check we make is against reduced bams - if ( !allowBqsrOnReducedBams ) { - for ( final SAMFileHeader header : headers ) { - if ( header.getProgramRecord(ReduceReads.PROGRAM_RECORD_NAME) != null ) - throw new UserException.BadInput("base quality score recalibration should absolutely not be run on reduced BAM files! Please run ReduceReads only after BQSR has been performed"); - } - } - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java deleted file mode 100644 index fec83e1a8..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java +++ /dev/null @@ -1,151 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.walkers.compression.reducereads.*; -import org.broadinstitute.sting.gatk.walkers.compression.reducereads.BaseCounts; -import org.broadinstitute.sting.utils.MannWhitneyU; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -public class RankSumUnitTest { - - List distribution20, distribution30, distribution20_40; - static final int observations = 100; - - @BeforeClass - public void init() { - distribution20 = new ArrayList<>(observations); - distribution30 = new ArrayList<>(observations); - distribution20_40 = new ArrayList<>(observations); - - final int skew = 3; - makeDistribution(distribution20, 20, skew, observations); - makeDistribution(distribution30, 30, skew, observations); - makeDistribution(distribution20_40, 20, skew, observations/2); - makeDistribution(distribution20_40, 40, skew, observations/2); - - // shuffle the observations - Collections.shuffle(distribution20); - Collections.shuffle(distribution30); - Collections.shuffle(distribution20_40); - } - - private static void makeDistribution(final List result, final int target, final int skew, final int numObservations) { - final int rangeStart = target - skew; - final int rangeEnd = target + skew; - - int current = rangeStart; - for ( int i = 0; i < numObservations; i++ ) { - result.add(current++); - if ( current > rangeEnd ) - current = rangeStart; - } - } - - @DataProvider(name = "DistributionData") - public Object[][] makeDistributionData() { - List tests = new ArrayList(); - - for ( final int numToReduce : Arrays.asList(0, 10, 50, 100) ) { - tests.add(new Object[]{distribution20, distribution20, numToReduce, true, "20-20"}); - tests.add(new Object[]{distribution30, distribution30, 
numToReduce, true, "30-30"}); - tests.add(new Object[]{distribution20_40, distribution20_40, numToReduce, true, "20/40-20/40"}); - - tests.add(new Object[]{distribution20, distribution30, numToReduce, false, "20-30"}); - tests.add(new Object[]{distribution30, distribution20, numToReduce, false, "30-20"}); - - tests.add(new Object[]{distribution20, distribution20_40, numToReduce, false, "20-20/40"}); - tests.add(new Object[]{distribution30, distribution20_40, numToReduce, true, "30-20/40"}); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true, dataProvider = "DistributionData") - public void testDistribution(final List distribution1, final List distribution2, final int numToReduceIn2, final boolean distributionsShouldBeEqual, final String debugString) { - final MannWhitneyU mannWhitneyU = new MannWhitneyU(true); - - for ( final Integer num : distribution1 ) - mannWhitneyU.add(num, MannWhitneyU.USet.SET1); - - final List dist2 = new ArrayList<>(distribution2); - if ( numToReduceIn2 > 0 ) { - final org.broadinstitute.sting.gatk.walkers.compression.reducereads.BaseCounts counts = new BaseCounts(); - for ( int i = 0; i < numToReduceIn2; i++ ) { - final int value = dist2.remove(0); - counts.incr(BaseIndex.A, (byte)value, 0, false); - } - - final int qual = (int)counts.averageQualsOfBase(BaseIndex.A); - for ( int i = 0; i < numToReduceIn2; i++ ) - dist2.add(qual); - } - - for ( final Integer num : dist2 ) - mannWhitneyU.add(num, MannWhitneyU.USet.SET2); - - final Double result = mannWhitneyU.runTwoSidedTest().second; - Assert.assertFalse(Double.isNaN(result)); - - if ( distributionsShouldBeEqual ) { - // TODO -- THIS IS THE FAILURE POINT OF USING REDUCED READS WITH RANK SUM TESTS - if ( numToReduceIn2 >= observations / 2 ) - return; - Assert.assertTrue(result > 0.1, String.format("%f %d %d", result, numToReduceIn2, dist2.get(0))); - } else { - Assert.assertTrue(result < 0.01, String.format("%f %d %d", result, numToReduceIn2, dist2.get(0))); - } - 
} -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java deleted file mode 100644 index 58c3bb9bd..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ /dev/null @@ -1,396 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broad.tribble.readers.LineIterator; -import org.broad.tribble.readers.PositionalBufferedStream; -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFCodec; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.util.Arrays; - -public class VariantAnnotatorIntegrationTest extends WalkerTest { - - final static String REF = b37KGReference; - final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; - - public static String baseTestString() { - return "-T VariantAnnotator -R " + b36KGReference + " --no_cmdline_in_header -o %s"; - } - - @Test - public void testHasAnnotsNotAsking1() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("360610e4990860bb5c45249b8ac31e5b")); - executeTest("test file has annotations, not asking for annotations, #1", spec); - } - - @Test - public void testHasAnnotsNotAsking2() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("d69a3c92a0e8f44e09e7377e3eaed4e8")); - executeTest("test file has annotations, not asking for annotations, #2", spec); - } - - @Test - public void testHasAnnotsAsking1() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 
1:10,020,000-10,021,000", 1, - Arrays.asList("823868a4b5b5ec2cdf080c059d04d31a")); - executeTest("test file has annotations, asking for annotations, #1", spec); - } - - @Test - public void testHasAnnotsAsking2() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("213560f395280e6a066d0b0497ce8881")); - executeTest("test file has annotations, asking for annotations, #2", spec); - } - - @Test - public void testNoAnnotsNotAsking1() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("540a9be8a8cb85b0f675fea1184bf78c")); - executeTest("test file doesn't have annotations, not asking for annotations, #1", spec); - } - - @Test - public void testNoAnnotsNotAsking2() { - // the genotype annotations in this file are actually out of order. If you don't parse the genotypes - // they don't get reordered. It's a good test of the genotype ordering system. 
- WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("f900e65b65ff0f9d9eb0891ef9b28c73")); - executeTest("test file doesn't have annotations, not asking for annotations, #2", spec); - } - - @Test - public void testNoAnnotsAsking1() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("6f873b3152db291e18e3a04fbce2e117")); - executeTest("test file doesn't have annotations, asking for annotations, #1", spec); - } - - @Test - public void testNoAnnotsAsking2() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("d8089c5874ff35a7fd7e35ebd7d3b137")); - executeTest("test file doesn't have annotations, asking for annotations, #2", spec); - } - - @Test - public void testExcludeAnnotations() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("552c2ad9dbfaa85d51d2def93c8229c6")); - executeTest("test exclude annotations", spec); - } - - @Test - public void testOverwritingHeader() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("0ed4c7760f6e7a158b6d743d257300f3")); - executeTest("test 
overwriting header", spec); - } - - @Test - public void testNoReads() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("1c423b7730b9805e7b885ece924286e0")); - executeTest("not passing it any reads", spec); - } - - @Test - public void testDBTagWithDbsnp() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("54d7d5bb9404652857adf5e50d995f30")); - executeTest("getting DB tag with dbSNP", spec); - } - - @Test - public void testMultipleIdsWithDbsnp() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3withIDs.vcf -L " + privateTestDir + "vcfexample3withIDs.vcf", 1, - Arrays.asList("5fe63e511061ed4f91d938e72e7e3c39")); - executeTest("adding multiple IDs with dbSNP", spec); - } - - @Test - public void testDBTagWithHapMap() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("cc7184263975595a6e2473d153227146")); - executeTest("getting DB tag with HM3", spec); - } - - @Test - public void testDBTagWithTwoComps() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf --comp:foo " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("6afbf05090ae139f53467cf6e0e71cf4")); - executeTest("getting DB tag with 2 comps", spec); - } - - @Test - public void testNoQuals() { - 
WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --variant " + privateTestDir + "noQual.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L " + privateTestDir + "noQual.vcf -A QualByDepth", 1, - Arrays.asList("aea983adc01cd059193538cc30adc17d")); - executeTest("test file doesn't have QUALs", spec); - } - - @Test - public void testUsingExpression() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("2b0e8cdfd691779befc5ac123d1a1887")); - executeTest("using expression", spec); - } - - @Test - public void testUsingExpressionWithID() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.ID -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("3de1d1998203518098ffae233f3e2352")); - executeTest("using expression with ID", spec); - } - - @Test - public void testTabixAnnotationsAndParallelism() { - final String MD5 = "99938d1e197b8f10c408cac490a00a62"; - for ( String file : Arrays.asList("CEU.exon.2010_03.sites.vcf", "CEU.exon.2010_03.sites.vcf.gz")) { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -A HomopolymerRun --variant:vcf " + validationDataLocation + file + " -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf --no_cmdline_in_header", 1, - Arrays.asList(MD5)); - executeTest("Testing lookup vcf tabix vs. 
vcf tribble", spec); - } - - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -A HomopolymerRun -nt 2 --variant:vcf " + validationDataLocation + "CEU.exon.2010_03.sites.vcf -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf --no_cmdline_in_header", 1, - Arrays.asList(MD5)); - - executeTest("Testing lookup vcf tabix vs. vcf tribble plus parallelism", spec); - } - - @Test - public void testSnpEffAnnotations() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T VariantAnnotator -R " + hg19Reference + " --no_cmdline_in_header -o %s -A SnpEff --variant " + - validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + - "snpEff2.0.5.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", - 1, - Arrays.asList("d9291845ce5a8576898d293a829a05b7") - ); - executeTest("Testing SnpEff annotations", spec); - } - - @Test - public void testSnpEffAnnotationsUnsupportedVersionGATKMode() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T VariantAnnotator -R " + b37KGReference + " --no_cmdline_in_header -o %s -A SnpEff " + - "--variant " + privateTestDir + "vcf4.1.example.vcf " + - "--snpEffFile " + privateTestDir + "snpEff_unsupported_version_gatk_mode.vcf " + - "-L 1:10001292-10012424", - 1, - Arrays.asList("7352cf23a4d45d3d2bb34ab44a4100ae") - ); - executeTest("Testing SnpEff annotations (unsupported version, GATK mode)", spec); - } - - @Test - public void testSnpEffAnnotationsUnsupportedVersionNoGATKMode() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T VariantAnnotator -R " + b37KGReference + " --no_cmdline_in_header -o %s -A SnpEff " + - "--variant " + privateTestDir + "vcf4.1.example.vcf " + - "--snpEffFile " + privateTestDir + "snpEff_unsupported_version_no_gatk_mode.vcf " + - "-L 1:10001292-10012424", - 1, - UserException.class - ); - executeTest("Testing SnpEff annotations (unsupported version, no GATK mode)", spec); - } - - @Test - public void testTDTAnnotation() { - final String MD5 = 
"427dfdc665359b67eff210f909ebf8a2"; - WalkerTestSpec spec = new WalkerTestSpec( - "-T VariantAnnotator -R " + b37KGReference + " -A TransmissionDisequilibriumTest --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + - " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, - Arrays.asList(MD5)); - executeTest("Testing TDT annotation ", spec); - } - - - @Test - public void testChromosomeCountsPed() { - final String MD5 = "6b5cbedf4a8b3385edf128d81c8a46f2"; - WalkerTestSpec spec = new WalkerTestSpec( - "-T VariantAnnotator -R " + b37KGReference + " -A ChromosomeCounts --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + - " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, - Arrays.asList(MD5)); - executeTest("Testing ChromosomeCounts annotation with PED file", spec); - } - - @Test - public void testInbreedingCoeffPed() { - final String MD5 = "159a771c1deaeffb786097e106943893"; - WalkerTestSpec spec = new WalkerTestSpec( - "-T VariantAnnotator -R " + b37KGReference + " -A InbreedingCoeff --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + - " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, - Arrays.asList(MD5)); - executeTest("Testing InbreedingCoeff annotation with PED file", spec); - } - - @Test - public void testStrandBiasBySample() throws IOException { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800"; - final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); - final File outputVCF = executeTest("testStrandBiasBySample", 
spec).getFirst().get(0); - - final String baseNoFS = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -XA FisherStrand -A StrandBiasBySample"; - final WalkerTestSpec specNoFS = new WalkerTestSpec(baseNoFS, 1, Arrays.asList("")); - specNoFS.disableShadowBCF(); - final File outputVCFNoFS = executeTest("testStrandBiasBySample component stand bias annotation", specNoFS).getFirst().get(0); - - final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoFS.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A FisherStrand"; - final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("")); - specAnn.disableShadowBCF(); - final File outputVCFAnn = executeTest("testStrandBiasBySample re-annotation of FisherStrand", specAnn).getFirst().get(0); - - // confirm that the FisherStrand values are identical for the two pipelines - final VCFCodec codec = new VCFCodec(); - final FileInputStream s = new FileInputStream(outputVCF); - final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); - codec.readHeader(lineIterator); - - final VCFCodec codecAnn = new VCFCodec(); - final FileInputStream sAnn = new FileInputStream(outputVCFAnn); - final LineIterator lineIteratorAnn = codecAnn.makeSourceFromStream(new PositionalBufferedStream(sAnn)); - codecAnn.readHeader(lineIteratorAnn); - - while( lineIterator.hasNext() && lineIteratorAnn.hasNext() ) { - final String line = lineIterator.next(); - Assert.assertFalse(line == null); - final VariantContext vc = codec.decode(line); - - final String lineAnn = lineIteratorAnn.next(); - Assert.assertFalse(lineAnn == null); - final VariantContext vcAnn = codecAnn.decode(lineAnn); - - Assert.assertTrue(vc.hasAttribute("FS")); - Assert.assertTrue(vcAnn.hasAttribute("FS")); - Assert.assertEquals(vc.getAttributeAsDouble("FS", 0.0), 
vcAnn.getAttributeAsDouble("FS", -1.0)); - } - - Assert.assertFalse(lineIterator.hasNext()); - Assert.assertFalse(lineIteratorAnn.hasNext()); - } - - @Test - public void testQualByDepth() throws IOException { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800"; - final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); - final File outputVCF = executeTest("testQualByDepth", spec).getFirst().get(0); - - final String baseNoQD = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -XA QualByDepth"; - final WalkerTestSpec specNoQD = new WalkerTestSpec(baseNoQD, 1, Arrays.asList("")); - specNoQD.disableShadowBCF(); - final File outputVCFNoQD = executeTest("testQualByDepth calling without QD", specNoQD).getFirst().get(0); - - final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoQD.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A QualByDepth"; - final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("139a4384f5a7c1f49ada67f416642249")); - specAnn.disableShadowBCF(); - final File outputVCFAnn = executeTest("testQualByDepth re-annotation of QD", specAnn).getFirst().get(0); - - // confirm that the QD values are present in the new file for all biallelic variants - // QD values won't be identical because some filtered reads are missing during re-annotation - - final VCFCodec codec = new VCFCodec(); - final FileInputStream s = new FileInputStream(outputVCF); - final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); - codec.readHeader(lineIterator); - - final VCFCodec codecAnn = new VCFCodec(); - final FileInputStream sAnn = new FileInputStream(outputVCFAnn); - final LineIterator lineIteratorAnn = codecAnn.makeSourceFromStream(new 
PositionalBufferedStream(sAnn)); - codecAnn.readHeader(lineIteratorAnn); - - while( lineIterator.hasNext() && lineIteratorAnn.hasNext() ) { - final String line = lineIterator.next(); - Assert.assertFalse(line == null); - final VariantContext vc = codec.decode(line); - - final String lineAnn = lineIteratorAnn.next(); - Assert.assertFalse(lineAnn == null); - final VariantContext vcAnn = codecAnn.decode(lineAnn); - - if( vc.isBiallelic() ) { - Assert.assertTrue(vc.hasAttribute("QD")); - Assert.assertTrue(vcAnn.hasAttribute("QD")); - } - } - - Assert.assertFalse(lineIterator.hasNext()); - Assert.assertFalse(lineIteratorAnn.hasNext()); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/WalkerTestIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/WalkerTestIntegrationTest.java deleted file mode 100644 index fb15e9835..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/WalkerTestIntegrationTest.java +++ /dev/null @@ -1,80 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. 
NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class WalkerTestIntegrationTest extends WalkerTest { - - public void testBadMD5(String md5) { - WalkerTestSpec spec = new WalkerTestSpec("FAIL", Arrays.asList(md5)); - executeTest("", spec); - } - - @Test(expectedExceptions = RuntimeException.class) - public void testNullMD5() { - testBadMD5(null); - } - - @Test(expectedExceptions = RuntimeException.class) - public void testBadLengthMD5() { - testBadMD5("asdfasdfa"); - } - - @Test(expectedExceptions = RuntimeException.class) - public void testSpacesMD5() { - testBadMD5("1de8e943fbf55246ebd19efa32f22a58 "); - } - - @Test(expectedExceptions = RuntimeException.class) - public void testBadCharMD5() { - testBadMD5("1de8e943fbf55246ebd19efa32f22a5_"); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java deleted file mode 100644 index 12fa2525f..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java +++ /dev/null @@ -1,131 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.bqsr; - -import net.sf.samtools.SAMUtils; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.recalibration.EventType; -import org.broadinstitute.sting.utils.recalibration.ReadCovariates; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.EnumMap; -import java.util.List; - -public final class ReadRecalibrationInfoUnitTest extends BaseTest { - @DataProvider(name = "InfoProvider") - public Object[][] createCombineTablesProvider() { - List tests = new ArrayList(); - - for ( final int readLength: Arrays.asList(10, 100, 1000) ) { - for ( final boolean includeIndelErrors : Arrays.asList(true, false) ) { - tests.add(new Object[]{readLength, includeIndelErrors}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "InfoProvider") - public void testReadInfo(final int readLength, final boolean includeIndelErrors) { - final ReadCovariates covariates = new ReadCovariates(readLength, 2); - - final byte[] bases = new byte[readLength]; - 
final byte[] baseQuals = new byte[readLength]; - final byte[] insertionQuals = new byte[readLength]; - final byte[] deletionQuals = new byte[readLength]; - final boolean[] skips = new boolean[readLength]; - final double[] snpErrors = new double[readLength]; - final double[] insertionErrors = new double[readLength]; - final double[] deletionsErrors = new double[readLength]; - for ( int i = 0; i < readLength; i++ ) { - bases[i] = 'A'; - baseQuals[i] = (byte)(i % SAMUtils.MAX_PHRED_SCORE); - insertionQuals[i] = (byte)((i+1) % SAMUtils.MAX_PHRED_SCORE); - deletionQuals[i] = (byte)((i+2) % SAMUtils.MAX_PHRED_SCORE); - skips[i] = i % 2 == 0; - snpErrors[i] = 1.0 / (i+1); - insertionErrors[i] = 0.5 / (i+1); - deletionsErrors[i] = 0.3 / (i+1); - } - - final EnumMap errors = new EnumMap(EventType.class); - errors.put(EventType.BASE_SUBSTITUTION, snpErrors); - errors.put(EventType.BASE_INSERTION, insertionErrors); - errors.put(EventType.BASE_DELETION, deletionsErrors); - - final EnumMap quals = new EnumMap(EventType.class); - quals.put(EventType.BASE_SUBSTITUTION, baseQuals); - quals.put(EventType.BASE_INSERTION, insertionQuals); - quals.put(EventType.BASE_DELETION, deletionQuals); - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, baseQuals, readLength + "M"); - if ( includeIndelErrors ) { - read.setBaseQualities(insertionQuals, EventType.BASE_INSERTION); - read.setBaseQualities(deletionQuals, EventType.BASE_DELETION); - } - - final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skips, snpErrors, insertionErrors, deletionsErrors); - - Assert.assertEquals(info.getCovariatesValues(), covariates); - Assert.assertEquals(info.getRead(), read); - - for ( int i = 0; i < readLength; i++ ) { - Assert.assertEquals(info.skip(i), skips[i]); - for ( final EventType et : EventType.values() ) { - Assert.assertEquals(info.getErrorFraction(et, i), errors.get(et)[i]); - final byte expectedQual = et == EventType.BASE_SUBSTITUTION || 
includeIndelErrors ? quals.get(et)[i]: GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL; - Assert.assertEquals(info.getQual(et, i), expectedQual); - } - } - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java deleted file mode 100644 index f988471a0..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java +++ /dev/null @@ -1,201 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - - -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** - * Basic unit test for BaseCounts in reduced reads - */ -public class BaseCountsUnitTest extends BaseTest { - - private class BaseCountsTest { - public String bases; - public byte mostCountBase; - public int mostCommonCount; - - private BaseCountsTest(String bases, char mostCountBase, int mostCommonCount) { - this.mostCommonCount = mostCommonCount; - this.mostCountBase = (byte)mostCountBase; - this.bases = bases; - } - } - - @DataProvider(name = "counting") - public Object[][] createCountingData() { - List params = new ArrayList(); - - params.add(new BaseCountsTest("A", 'A', 1 )); - params.add(new BaseCountsTest("AA", 'A', 2 )); - params.add(new BaseCountsTest("AC", 'A', 1 )); - params.add(new BaseCountsTest("AAC", 'A', 2 )); - params.add(new BaseCountsTest("AAA", 'A', 3 )); - params.add(new BaseCountsTest("AAAN", 'A', 3 )); - params.add(new BaseCountsTest("AAANNNN", 'N', 4 )); - params.add(new BaseCountsTest("AACTG", 'A', 2 )); - params.add(new BaseCountsTest("D", 'D', 1 )); - params.add(new BaseCountsTest("DDAAD", 'D', 3)); - params.add(new BaseCountsTest("", (char)BaseCounts.MAX_BASE_WITH_NO_COUNTS, 0 )); - params.add(new BaseCountsTest("AAIIIAI", 'I', 4 )); - - List params2 = new ArrayList(); - for ( BaseCountsTest x : params ) params2.add(new Object[]{x}); - return params2.toArray(new Object[][]{}); - } - - @Test(dataProvider = "counting", enabled = true) - public void testCounting(BaseCountsTest params) { - BaseCounts counts = new BaseCounts(); - - for ( byte base : params.bases.getBytes() ) - counts.incr(base); - - String name = String.format("Test-%s", params.bases); - Assert.assertEquals(counts.totalCount(), params.bases.length(), name); - 
Assert.assertEquals(counts.countOfBase(counts.baseIndexWithMostCounts()), params.mostCommonCount, name); - Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name); - - // test the static creation - final int[] countsArray = new int[] { counts.countOfBase(BaseIndex.A), counts.countOfBase(BaseIndex.C), - counts.countOfBase(BaseIndex.G), counts.countOfBase(BaseIndex.T)}; - final BaseCounts countsFromArray = BaseCounts.createWithCounts(countsArray); - Assert.assertEquals(counts.countOfBase(BaseIndex.A), countsFromArray.countOfBase(BaseIndex.A)); - Assert.assertEquals(counts.countOfBase(BaseIndex.C), countsFromArray.countOfBase(BaseIndex.C)); - Assert.assertEquals(counts.countOfBase(BaseIndex.G), countsFromArray.countOfBase(BaseIndex.G)); - Assert.assertEquals(counts.countOfBase(BaseIndex.T), countsFromArray.countOfBase(BaseIndex.T)); - Assert.assertEquals(ACGTcounts(counts), countsFromArray.totalCount()); - - // test addition - counts.add(countsFromArray); - Assert.assertEquals(counts.countOfBase(BaseIndex.A), 2 * countsFromArray.countOfBase(BaseIndex.A)); - Assert.assertEquals(counts.countOfBase(BaseIndex.C), 2 * countsFromArray.countOfBase(BaseIndex.C)); - Assert.assertEquals(counts.countOfBase(BaseIndex.G), 2 * countsFromArray.countOfBase(BaseIndex.G)); - Assert.assertEquals(counts.countOfBase(BaseIndex.T), 2 * countsFromArray.countOfBase(BaseIndex.T)); - Assert.assertEquals(ACGTcounts(counts), 2 * countsFromArray.totalCount()); - - // test subtraction - counts.sub(countsFromArray); - Assert.assertEquals(counts.countOfBase(BaseIndex.A), countsFromArray.countOfBase(BaseIndex.A)); - Assert.assertEquals(counts.countOfBase(BaseIndex.C), countsFromArray.countOfBase(BaseIndex.C)); - Assert.assertEquals(counts.countOfBase(BaseIndex.G), countsFromArray.countOfBase(BaseIndex.G)); - Assert.assertEquals(counts.countOfBase(BaseIndex.T), countsFromArray.countOfBase(BaseIndex.T)); - Assert.assertEquals(ACGTcounts(counts), 
countsFromArray.totalCount()); - - // test decrementing - if ( counts.countOfBase(BaseIndex.A) > 0 ) { - counts.decr((byte)'A'); - Assert.assertEquals(counts.countOfBase(BaseIndex.A), countsFromArray.countOfBase(BaseIndex.A) - 1); - } - } - - private static int ACGTcounts(final BaseCounts baseCounts) { - return baseCounts.totalCountWithoutIndels() - baseCounts.countOfBase(BaseIndex.N); - } - - - ////////////////////////////////// - // TEST FOR QUALS IN BASECOUNTS // - ////////////////////////////////// - - private class BaseCountsQualsTest { - public final List quals; - - private BaseCountsQualsTest(final List quals) { - this.quals = quals; - } - } - - @DataProvider(name = "quals") - public Object[][] createQualsData() { - List tests = new ArrayList(); - - final int[] quals = new int[]{ 0, 5, 10, 15, 20, 30, 40, 50 }; - - for ( final int qual1 : quals ) { - for ( final int qual2 : quals ) { - for ( final int qual3 : quals ) { - tests.add(new Object[]{new BaseCountsQualsTest(Arrays.asList(qual1, qual2, qual3))}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "quals", enabled = true) - public void testQuals(BaseCountsQualsTest test) { - BaseCounts counts = new BaseCounts(); - - for ( int qual : test.quals ) - counts.incr(BaseIndex.A, (byte)qual, 20, false); - - final int actualSum = (int)counts.getSumQuals((byte)'A'); - final int expectedSum = qualSum(test.quals); - Assert.assertEquals(actualSum, expectedSum); - - final int actualAverage = (int)counts.averageQuals((byte)'A'); - Assert.assertEquals(actualAverage, expectedSum / test.quals.size()); - - // test both proportion methods - Assert.assertEquals(counts.baseCountProportion(BaseIndex.A), counts.baseCountProportion((byte)'A')); - } - - private static int qualSum(final List quals) { - int sum = 0; - for ( final int qual : quals ) - sum += qual; - return sum; - } -} \ No newline at end of file diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java deleted file mode 100644 index 4f5b7477c..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java +++ /dev/null @@ -1,214 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.MathUtils; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.List; - -public class HeaderElementUnitTest extends BaseTest { - - private class HETest { - public byte base, baseQual, insQual, delQual; - public int MQ; - public boolean isClip; - - private HETest(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int MQ, final boolean isClip) { - this.base = base; - this.baseQual = baseQual; - this.insQual = insQual; - this.delQual = delQual; - this.MQ = MQ; - this.isClip = isClip; - } - } - - private static final byte byteA = (byte)'A'; - private static final byte byte10 = (byte)10; - private static final byte byte20 = (byte)20; - private static final int minBaseQual = 20; - private static final int minMappingQual = 20; - - @DataProvider(name = "data") - public Object[][] createData() { - List tests = new ArrayList(); - - tests.add(new Object[]{new HETest(byteA, byte20, byte20, byte20, 20, false)}); - tests.add(new Object[]{new HETest(byteA, byte10, byte20, byte20, 20, false)}); - tests.add(new Object[]{new HETest(byteA, byte20, byte20, byte20, 10, false)}); - tests.add(new Object[]{new HETest(byteA, byte20, byte20, byte20, 20, true)}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "data", enabled = true) - public void testHE(HETest test) { - - HeaderElement headerElement = new HeaderElement(1000, 0); - - // first test that if we add and then remove it, we have no data - headerElement.addBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, minBaseQual, minMappingQual, test.isClip, false); - headerElement.addInsertionToTheRight(); - headerElement.removeBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, 
minBaseQual, minMappingQual, test.isClip, false); - headerElement.removeInsertionToTheRight(); - testHeaderIsEmpty(headerElement); - - // now, test that the data was added as expected - for ( int i = 0; i < 10; i++ ) - headerElement.addBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, minBaseQual, minMappingQual, test.isClip, false); - testHeaderData(headerElement, test); - - // test the insertion adding functionality - for ( int i = 0; i < 10; i++ ) - headerElement.addInsertionToTheRight(); - Assert.assertEquals(headerElement.numInsertionsToTheRight(), 10); - } - - private void testHeaderIsEmpty(final HeaderElement headerElement) { - Assert.assertFalse(headerElement.hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS)); - Assert.assertFalse(headerElement.hasConsensusData(SlidingWindow.ConsensusType.FILTERED)); - Assert.assertFalse(headerElement.hasInsertionToTheRight()); - Assert.assertTrue(headerElement.isEmpty()); - } - - private void testHeaderData(final HeaderElement headerElement, final HETest test) { - Assert.assertEquals(headerElement.isVariantFromSoftClips(), test.isClip); - Assert.assertFalse(headerElement.isEmpty()); - Assert.assertFalse(headerElement.hasInsertionToTheRight()); - Assert.assertEquals(headerElement.hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS), test.MQ >= minMappingQual); - Assert.assertEquals(headerElement.hasConsensusData(SlidingWindow.ConsensusType.FILTERED), test.MQ < minMappingQual); - Assert.assertEquals(headerElement.getBaseCounts(headerElement.hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) ? 
SlidingWindow.ConsensusType.POSITIVE_CONSENSUS : SlidingWindow.ConsensusType.FILTERED).getRMS(), (double)test.MQ); - Assert.assertFalse(headerElement.isVariantFromMismatches(0.05, 0.05)); - Assert.assertEquals(headerElement.isVariant(0.05, 0.05, 0.05), test.isClip); - } - - - private class AllelesTest { - public final int[] counts; - public final double pvalue; - - private AllelesTest(final int[] counts, final double pvalue) { - this.counts = counts; - this.pvalue = pvalue; - } - } - - @DataProvider(name = "alleles") - public Object[][] createAllelesData() { - List tests = new ArrayList<>(); - - final int[] counts = new int[]{ 0, 5, 10, 15, 20 }; - final double [] pvalues = new double[]{ 0.0, 0.01, 0.05, 0.20, 1.0 }; - - for ( final int countA : counts ) { - for ( final int countC : counts ) { - for ( final int countG : counts ) { - for ( final int countT : counts ) { - for ( final int countD : counts ) { - for ( final double pvalue : pvalues ) { - tests.add(new Object[]{new AllelesTest(new int[]{countA, countC, countG, countT, countD}, pvalue)}); - } - } - } - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "alleles", enabled = true) - public void testAlleles(AllelesTest test) { - - HeaderElement headerElement = new HeaderElement(1000, 0); - for ( int i = 0; i < test.counts.length; i++ ) { - final BaseIndex base = BaseIndex.values()[i]; - for ( int j = 0; j < test.counts[i]; j++ ) - headerElement.addBase(base.b, byte20, byte10, byte10, byte20, minBaseQual, minMappingQual, false, false); - } - - final int nAllelesSeen = headerElement.getNumberOfBaseAlleles(test.pvalue, test.pvalue); - final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.pvalue); - - Assert.assertEquals(nAllelesSeen, nAllelesExpected); - } - - private static int calculateExpectedAlleles(final int[] counts, final double targetPvalue) { - int total = 0; - for ( final int count : counts ) { - total += count; - } - - int result = 0; - for ( int index 
= 0; index < counts.length; index++ ) { - final int count = counts[index]; - if ( count == 0 ) - continue; - - final boolean isSignificant; - if ( count <= HeaderElement.MIN_COUNT_FOR_USING_PVALUE ) { - isSignificant = MathUtils.binomialCumulativeProbability(total, 0, count) > targetPvalue; - } else { - isSignificant = (count >= targetPvalue * total); - } - - if ( isSignificant ) { - if ( index == BaseIndex.D.index ) - return -1; - result++; - } - } - - return result; - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java deleted file mode 100644 index 067f36d58..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ /dev/null @@ -1,347 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -public class ReduceReadsIntegrationTest extends WalkerTest { - final static String REF = b37KGReference; - final static String DBSNP = b37dbSNP132; - final String BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; - final String DELETION_BAM = validationDataLocation + "filtered_deletion_for_reduce_reads.bam"; - final String STASH_BAM = validationDataLocation + "ReduceReadsStashBug.bam"; - final String STASH_L = " -L 14:73718184-73718284 -L 14:73718294-73718330 -L 14:73718360-73718556"; - final String DIVIDEBYZERO_BAM = validationDataLocation + "ReduceReadsDivideByZeroBug.bam"; - final String DIVIDEBYZERO_L = " -L " + validationDataLocation + "ReduceReadsDivideByZeroBug.intervals"; - final String L = " -L 20:10,100,000-10,120,000 "; - final String COREDUCTION_BAM_A = validationDataLocation + "coreduction.test.A.bam"; - final String COREDUCTION_BAM_B = validationDataLocation + "coreduction.test.B.bam"; - final String COREDUCTION_L = " -L 1:1,853,860-1,854,354 -L 1:1,884,131-1,892,057"; - final String OFFCONTIG_BAM = privateTestDir + "readOffb37contigMT.bam"; - final String HIGH_COVERAGE_BAM = privateTestDir + "NA20313.highCoverageRegion.bam"; - final String HIGH_COVERAGE_L = " -L 1:1650830-1650870"; - final String BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM = privateTestDir + "bothEndsOfPairInVariantRegion.bam"; - final String 
INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM = privateTestDir + "rr-too-many-insertions.bam"; - - final static String emptyFileMd5 = "d41d8cd98f00b204e9800998ecf8427e"; - - protected Pair, List> executeTest(final String name, final WalkerTestSpec spec) { - return executeTest(name, spec, emptyFileMd5); - } - - protected Pair, List> executeTest(final String name, final WalkerTestSpec spec, final String qualsTestMD5) { - final Pair, List> result = super.executeTest(name, spec); - - // perform some Reduce Reads specific testing now - if ( result != null ) { - - // generate a new command-line based on the old one - spec.disableImplicitArgs(); - final String[] originalArgs = spec.getArgsWithImplicitArgs().split(" "); - - final StringBuilder reducedInputs = new StringBuilder(); - for ( final File file : result.getFirst() ) { - reducedInputs.append(" -I:reduced "); - reducedInputs.append(file.getAbsolutePath()); - } - - // the coverage test is a less stricter version of the quals test so we can safely ignore it for now - //final String coverageCommand = createCommandLine("AssessReducedCoverage", originalArgs); - //super.executeTest(name + " : COVERAGE_TEST", new WalkerTestSpec(coverageCommand + reducedInputs.toString(), Arrays.asList(emptyFileMd5))); - - // run the quals test - final String qualsCommand = createCommandLine("AssessReducedQuals", originalArgs); - super.executeTest(name + " : QUALS_TEST", new WalkerTestSpec(qualsCommand + reducedInputs.toString(), Arrays.asList(qualsTestMD5))); - } - - return result; - } - - /* - * Generate a new command-line based on the old one - * - * @param walkerName the new walker name to use - * @param originalArgs the original arguments used for the test - * @return the new command line - */ - private String createCommandLine(final String walkerName, final String[] originalArgs) { - - final StringBuilder newArgs = new StringBuilder(); - - for ( int i = 0; i < originalArgs.length; i++ ) { - final String arg = originalArgs[i]; - - if ( 
arg.equals("-T") ) { - newArgs.append("-T "); - newArgs.append(walkerName); - } else if ( arg.startsWith("-I") ) { - newArgs.append("-I:original "); - newArgs.append(originalArgs[++i]); - } else if ( arg.equals("-R") || arg.equals("-L") ) { - newArgs.append(arg); - newArgs.append(" "); - newArgs.append(originalArgs[++i]); - } - - // always add a trailing space - newArgs.append(" "); - } - - newArgs.append("-o %s"); - - return newArgs.toString(); - } - - protected Pair, List> executeTestWithoutAdditionalRRTests(final String name, final WalkerTestSpec spec) { - return super.executeTest(name, spec); - } - - private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns) { - this.RRTest(testName, args, md5, useKnowns, emptyFileMd5); - } - - private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns, final String qualsTestMD5) { - String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s" + (useKnowns ? 
" -known " + DBSNP : "") + " "; - WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList("bam"), Arrays.asList(md5)); - executeTest(testName, spec, qualsTestMD5); - } - - @Test(enabled = true) - public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "0e503f7b79ace4c89d74f0943a0de1c0", false); - } - - @Test(enabled = true) - public void testDefaultCompressionWithKnowns() { - RRTest("testDefaultCompressionWithKnowns ", L, "6db7ce2733d006f8bd61c42a40d23728", true); - } - - private final String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; - - @Test(enabled = true) - public void testMultipleIntervals() { - RRTest("testMultipleIntervals ", intervals, "207f2c6d3db956e19412a45a231ca367", false, "043b2838c27d8f9580379b54c18ff40a"); - } - - @Test(enabled = true) - public void testMultipleIntervalsWithKnowns() { - RRTest("testMultipleIntervalsWithKnowns ", intervals, "f3b11a8a7673b301e27137936fafc6b6", true, "043b2838c27d8f9580379b54c18ff40a"); - } - - @Test(enabled = true) - public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -min_pvalue 0.3 -minvar 0.3 -mindel 0.3 " + L, "dcc3716b3665aa1c2dbe6b22d6534aef", false); - } - - @Test(enabled = true) - public void testHighCompressionWithKnowns() { - RRTest("testHighCompressionWithKnowns ", " -cs 10 -min_pvalue 0.3 -minvar 0.3 -mindel 0.3 " + L, "97ae655bf0e483ea227b1aac67ced024", true); - } - - @Test(enabled = true) - public void testLowCompression() { - RRTest("testLowCompression ", " -cs 30 -min_pvalue 0.001 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "a1377eb922e0b09a03a280b691b0b3ff", false); - } - - @Test(enabled = true) - public void testLowCompressionWithKnowns() { - RRTest("testLowCompressionWithKnowns ", " -cs 30 -min_pvalue 0.001 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, 
"bd7c5b0b210694f364ca6a41f5b89870", true); - } - - @Test(enabled = true) - public void testBadPvalueInput() { - final String cmd = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + "-o %s -min_pvalue -0.01"; - WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, UserException.BadArgumentValue.class); - executeTest("testBadPvalueInput", spec); - } - - @Test(enabled = true) - public void testIndelCompression() { - final String md5 = "9c9305eda5e4e7f22246ec8a4b242c97"; - RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, false); - RRTest("testIndelCompressionWithKnowns ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, true); - } - - @Test(enabled = true) - public void testFilteredDeletionCompression() { - String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s "; - executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("1bda512143be1016dfaca1f7020b6398")), "4f916da29d91852077f0a2fdbdd2c7f6"); - } - - @Test(enabled = true) - public void testCoReduction() { - String base = String.format("-T ReduceReads %s --cancer_mode -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; - executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("58c2bae5a339af2ea3c22a46ce8faa68"))); - } - - @Test(enabled = true) - public void testCoReductionWithKnowns() { - String base = String.format("-T ReduceReads %s --cancer_mode -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s "; - executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("5c251932b49d99a810581e3a6f762878"))); - } - - @Test(enabled = true) - public void testInsertionsAtEdgeOfConsensus() { - String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o %s "; - 
executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("c10653a8c21fb32b5cf580d3704b0edd"))); - } - - /** - * Bug reported by Adam where a read that got clipped before actually belongs 2 intervals ahead - * and a subsequent tail leaves only this read in the stash. The next read to come in is in fact - * before (alignment start) than this read, so the TreeSet breaks with a Key out of Range error - * that was freaking hard to catch. - * - * This bam is simplified to replicate the exact bug with the three provided intervals. - */ - @Test(enabled = true) - public void testAddingReadAfterTailingTheStash() { - String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s "; - executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("fddbec29d0945afbbb34b42994614c15")), "3eab32c215ba68e75efd5ab7e9f7a2e7"); - } - - /** - * Divide by zero bug reported by GdA and users in the forum. Happens when the downsampler goes over a region where all reads get - * filtered out. - */ - @Test(enabled = true) - public void testDivideByZero() { - String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s "; - // we expect to lose coverage due to the downsampling so don't run the systematic tests - executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("7dfe2647992ce1154db340fc742d523a"))); - } - - /** - * Bug happens when reads are soft-clipped off the contig (usually in the MT). This test guarantees no changes to the upstream code will - * break the current hard-clipping routine that protects reduce reads from such reads. 
- */ - @Test(enabled = true) - public void testReadOffContig() { - String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, OFFCONTIG_BAM) + " -o %s "; - executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("595e5812c37189930cae93e45765def4"))); - } - - /** - * Confirm that if both ends of pair are in same variant region, compressed names of both ends of pair are the same. - */ - @Test(enabled = true) - public void testPairedReadsInVariantRegion() { - String base = String.format("-T ReduceReads -npt -R %s -I %s ", hg19Reference, BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM) + - " -o %s --downsample_coverage 250 -dcov 50 "; - executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("b005727119eee27995705959a637085e")), "2af063d1bd3c322b03405dbb3ecf59a9"); - } - - /** - * Confirm that this bam does not fail when multi-sample mode is enabled. The provided example is tricky and used to cause - * us to exception out in the code. 
- */ - @Test(enabled = true) - public void testMultiSampleDoesNotFailWithFlag() { - String cmd = "-T ReduceReads --cancer_mode -npt -R " + b37KGReference + " -I " + privateTestDir + "rr_multisample.bam -o /dev/null"; - executeTestWithoutAdditionalRRTests("testMultiSampleDoesNotFailWithFlag", new WalkerTestSpec(cmd, 0, Collections.emptyList())); - } - - /** - * Confirm that this bam fails when multi-sample mode is not enabled - */ - @Test(enabled = true) - public void testMultiSampleFailsWithoutFlag() { - String cmd = "-T ReduceReads -npt -R " + b37KGReference + " -I " + privateTestDir + "rr_multisample.bam -o /dev/null"; - executeTestWithoutAdditionalRRTests("testMultiSampleDoesNotFailWithFlag", new WalkerTestSpec(cmd, 0, UserException.BadInput.class)); - } - - /** - * Confirm that compression is not capping coverage counts to max byte - */ - @Test(enabled = true) - public void testCompressionWorksForHighDepth() { - final String base = String.format("-T ReduceReads -npt -R %s -I %s %s", b37KGReference, HIGH_COVERAGE_BAM, HIGH_COVERAGE_L) + " -o %s"; - final File outputBam = executeTestWithoutAdditionalRRTests("testCompressionWorksForHighDepth", - new WalkerTestSpec(base, 1, Arrays.asList(""))).first.get(0); // No MD5s; we only want to check the coverage - - boolean sawHighCoveragePosition = false; - final SAMFileReader reader = new SAMFileReader(outputBam); - reader.setSAMRecordFactory(new GATKSamRecordFactory()); - - for ( final SAMRecord rawRead : reader ) { - final GATKSAMRecord read = (GATKSAMRecord)rawRead; - read.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, rawRead.getByteArrayAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG)); - - if ( ! 
read.isReducedRead() ) - continue; - - final int[] decodedCounts = read.getReducedReadCounts(); - for ( final int count : decodedCounts ) { - if ( count > Byte.MAX_VALUE ) { - sawHighCoveragePosition = true; - break; - } - } - - if ( sawHighCoveragePosition ) - break; - } - - reader.close(); - - Assert.assertTrue(sawHighCoveragePosition, "No positions were found with coverage over max byte (127); the coverage is incorrectly being capped somewhere!"); - } -} - diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java deleted file mode 100644 index 6032affa7..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java +++ /dev/null @@ -1,214 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import it.unimi.dsi.fastutil.objects.*; -import net.sf.samtools.SAMFileHeader; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Random; - - -public class ReduceReadsUnitTest extends BaseTest { - - Random random = new Random(987743); - Object2LongOpenHashMap hash = new Object2LongOpenHashMap(); - long nextNumber = 0L; - - /** - * Combinatorial unit test data provider example. 
- * - * Creates data for testMyData test function, containing two arguments, start and size at each value - * - * @return Object[][] for testng DataProvider - */ - @DataProvider(name = "ReadNameProvider") - public Object[][] readNameProvider() { - final int readNameLength = 4; - final int nReads = 100000; - final int charVariety = 20; - ObjectArrayList tests = new ObjectArrayList(); - ObjectOpenHashSet truthSet = new ObjectOpenHashSet(); - byte[] bytes = new byte[readNameLength]; - for ( int i = 0; i tests = new ObjectArrayList(); - - // test single - tests.add(new Object[]{1, 1, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10))}); - - // test multiple at one position - tests.add(new Object[]{1, 1, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_10_2))}); - - // test multiple - tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))}); - - // test indel not used - tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(indel_1_40))}); - tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(indel_2_40))}); - - // test read clears - tests.add(new Object[]{3, 0, read2, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))}); - tests.add(new Object[]{4, 1, read2, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10))}); - tests.add(new Object[]{3, 0, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))}); - tests.add(new Object[]{4, 0, read3, 
Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10))}); - tests.add(new Object[]{4, 1, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_3_10))}); - tests.add(new Object[]{5, 1, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10), makeRefMetaDataTracker(snp_3_10))}); - - return tests.toArray(new Object[][]{}); - } - - private final RefMetaDataTracker makeRefMetaDataTracker(final Feature feature) { - final List x = new ArrayList(); - x.add(new GATKFeature.TribbleGATKFeature(genomeLocParser, feature, "known")); - final RODRecordList rods = new RODRecordListImpl("known", x, genomeLocParser.createGenomeLoc(feature.getChr(), feature.getStart(), feature.getEnd())); - return new RefMetaDataTracker(Arrays.asList(rods)); - } - - @Test(dataProvider = "PopulateKnownsProvider") - public void testPopulateKnowns(final int expectedSizeBeforeClear, final int expectedSizeAfterClear, final GATKSAMRecord read, final List trackers) { - final ReduceReads rr = new ReduceReads(); - RodBinding.resetNameCounter(); - rr.known = Arrays.>asList(new RodBinding(VariantContext.class, "known")); - rr.knownSnpPositions = new ObjectAVLTreeSet(); - - final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - engine.setGenomeLocParser(genomeLocParser); - rr.setToolkit(engine); - - for ( final RefMetaDataTracker tracker : trackers ) - rr.populateKnownSNPs(tracker); - Assert.assertEquals(rr.knownSnpPositions.size(), expectedSizeBeforeClear); - - rr.clearStaleKnownPositions(read); - Assert.assertEquals(rr.knownSnpPositions.size(), expectedSizeAfterClear); - } - -} \ No newline at end of file diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java deleted file mode 100644 index c49a671e2..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java +++ /dev/null @@ -1,964 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import it.unimi.dsi.fastutil.objects.*; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; - -public class SlidingWindowUnitTest extends BaseTest { - - private static final int variantRegionLength = 1000; - private static final int globalStartPosition = 1000000; - - private static boolean[] createBitset(final List locs) { - final boolean[] variantRegionBitset = new boolean[variantRegionLength]; - for ( FinishedGenomeLoc loc : locs ) { - final int stop = loc.getStop() - globalStartPosition; - for ( int i = loc.getStart() - globalStartPosition; i <= stop; i++ ) - variantRegionBitset[i] = true; - } - return variantRegionBitset; - } - - ////////////////////////////////////////////////////////////////////////////////////// - //// Test for leading softclips immediately followed by an insertion in the CIGAR //// - 
////////////////////////////////////////////////////////////////////////////////////// - - @Test(enabled = true) - public void testLeadingSoftClipThenInsertion() { - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 10); - read.setReadBases(Utils.dupBytes((byte) 'A', 10)); - read.setBaseQualities(Utils.dupBytes((byte)30, 10)); - read.setMappingQuality(30); - read.setCigarString("2S2I6M"); - - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 1); - slidingWindow.addRead(read); - slidingWindow.close(null); - } - - @Test(enabled = true) - public void testLeadingHardClipThenInsertion() { - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 8); - read.setReadBases(Utils.dupBytes((byte) 'A', 8)); - read.setBaseQualities(Utils.dupBytes((byte)30, 8)); - read.setMappingQuality(30); - read.setCigarString("2H2I6M"); - - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - slidingWindow.addRead(read); - slidingWindow.close(null); - } - - ////////////////////////////////////////////////////////////////////////////////////// - //// This section tests the findVariantRegions() method and related functionality //// - ////////////////////////////////////////////////////////////////////////////////////// - - private static final FinishedGenomeLoc loc90to95 = new FinishedGenomeLoc("1", 0, 1000090, 1000095, false); - private static final FinishedGenomeLoc loc96to99 = new FinishedGenomeLoc("1", 0, 1000096, 1000099, false); - private static final FinishedGenomeLoc loc100to110 = new FinishedGenomeLoc("1", 0, 1000100, 1000110, false); - private static final FinishedGenomeLoc loc999 = new FinishedGenomeLoc("1", 0, 1000999, 1000999, false); - - private class FindVariantRegionsTest { - public List locs, expectedResult; - public boolean[] variantRegionBitset; - - 
private FindVariantRegionsTest(final List locs) { - this.locs = locs; - this.expectedResult = locs; - variantRegionBitset = createBitset(locs); - } - - private FindVariantRegionsTest(final List locs, final List expectedResult) { - this.locs = locs; - this.expectedResult = expectedResult; - variantRegionBitset = createBitset(locs); - } - } - - @DataProvider(name = "findVariantRegions") - public Object[][] createFindVariantRegionsData() { - List tests = new ArrayList(); - - tests.add(new Object[]{new FindVariantRegionsTest(Arrays.asList(loc90to95))}); - tests.add(new Object[]{new FindVariantRegionsTest(Arrays.asList(loc90to95, loc100to110))}); - tests.add(new Object[]{new FindVariantRegionsTest(Arrays.asList(loc90to95, loc96to99, loc100to110), Arrays.asList(new FinishedGenomeLoc("1", 0, 1000090, 1000110, false)))}); - tests.add(new Object[]{new FindVariantRegionsTest(Arrays.asList(loc90to95, loc999))}); - tests.add(new Object[]{new FindVariantRegionsTest(Arrays.asList(loc999))}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "findVariantRegions", enabled = true) - public void testFindVariantRegions(FindVariantRegionsTest test) { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition); - final CompressionStash locs = slidingWindow.findVariantRegions(0, variantRegionLength, test.variantRegionBitset, true); - int index = 0; - for ( final FinishedGenomeLoc loc : locs ) { - Assert.assertTrue(loc.equals(test.expectedResult.get(index++))); - } - } - - @Test(enabled = true) - public void testNoClosingRegions() { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition); - final CompressionStash locs = slidingWindow.findVariantRegions(0, variantRegionLength, createBitset(Arrays.asList(loc90to95, loc999)), false); - Assert.assertEquals(locs.size(), 1); - Assert.assertEquals(locs.iterator().next(), loc90to95); - } - - - ///////////////////////////////////////////////////////////////////////////// 
- //// This section tests the markSites() method and related functionality //// - ///////////////////////////////////////////////////////////////////////////// - - @Test(enabled = true) - public void testMarkedSitesClass() { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition); - final SlidingWindow.MarkedSites markedSites = slidingWindow.new MarkedSites(); - - markedSites.updateRegion(100, 100); - Assert.assertEquals(markedSites.getStartLocation(), 100); - Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 100); - - markedSites.updateRegion(300, 100); - Assert.assertEquals(markedSites.getStartLocation(), 300); - Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 100); - - markedSites.getVariantSiteBitSet()[10] = true; - markedSites.updateRegion(290, 100); - Assert.assertEquals(markedSites.getStartLocation(), 290); - Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 100); - Assert.assertFalse(markedSites.getVariantSiteBitSet()[10]); - - markedSites.getVariantSiteBitSet()[20] = true; - markedSites.updateRegion(290, 100); - Assert.assertEquals(markedSites.getStartLocation(), 290); - Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 100); - Assert.assertTrue(markedSites.getVariantSiteBitSet()[20]); - - markedSites.updateRegion(300, 100); - Assert.assertEquals(markedSites.getStartLocation(), 300); - Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 100); - - markedSites.getVariantSiteBitSet()[95] = true; - markedSites.updateRegion(390, 20); - Assert.assertEquals(markedSites.getStartLocation(), 390); - Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 20); - Assert.assertTrue(markedSites.getVariantSiteBitSet()[5]); - - markedSites.updateRegion(340, 60); - Assert.assertEquals(markedSites.getStartLocation(), 340); - Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 60); - - markedSites.getVariantSiteBitSet()[20] = true; - markedSites.updateRegion(350, 
60); - Assert.assertEquals(markedSites.getStartLocation(), 350); - Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 60); - Assert.assertTrue(markedSites.getVariantSiteBitSet()[10]); - } - - @Test(enabled = true) - public void testMarkVariantRegion() { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition); - slidingWindow.getMarkedSitesForTesting().updateRegion(100, 100); - - slidingWindow.markVariantRegion(40); - Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 21); - - slidingWindow.markVariantRegion(5); - Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 37); - - slidingWindow.markVariantRegion(95); - Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 52); - } - - private static int countTrueBits(final boolean[] bitset) { - int count = 0; - for ( final boolean bit : bitset ) { - if ( bit ) - count++; - } - return count; - } - - @Test(enabled = true) - public void testMarkingRegionInCancerMode() { - - final int contextSize = 10; - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, contextSize, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - slidingWindow.addRead(createSimpleRead("1", 0, 34, 75)); - slidingWindow.addRead(createSimpleRead("2", 0, 97, 73)); - slidingWindow.addRead(createSimpleRead("3", 0, 98, 75)); - slidingWindow.addRead(createSimpleRead("4", 0, 98, 75)); - slidingWindow.addRead(createSimpleRead("5", 0, 98, 75)); - - final CompressionStash regions = new CompressionStash(); - regions.add(new FinishedGenomeLoc("1", 0, 89, 109, true)); - - slidingWindow.closeVariantRegions(regions, null, false); - Assert.assertEquals(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet().length, 76 + contextSize); - } - - private GATKSAMRecord createSimpleRead(final String name, 
final int refIndex, final int alignmentStart, final int length) { - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length); - read.setReadBases(Utils.dupBytes((byte) 'A', length)); - read.setBaseQualities(Utils.dupBytes((byte) 30, length)); - read.setMappingQuality(60); - return read; - } - - - ///////////////////////////////////////////////////////////////// - //// This section tests the consensus creation functionality //// - ///////////////////////////////////////////////////////////////// - - private static final int readLength = 100; - private static final int testRegionSize = 1000; - private final ObjectList basicReads = new ObjectArrayList(20); - private IndexedFastaSequenceFile seq; - private SAMFileHeader header; - - @BeforeClass - public void setup() throws FileNotFoundException { - seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); - header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); - - final int readFrequency = 20; - - basicReads.clear(); - for ( int i = 0; i < testRegionSize; i += readFrequency ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition + i, readLength); - read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); - read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setMappingQuality(30); - read.setReadNegativeStrandFlag(i % 40 == 20); - basicReads.add(read); - } - } - - private class ConsensusCreationTest { - public final int expectedNumberOfReads, expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage; - public final List myReads = new ArrayList(20); - public final String description; - - private ConsensusCreationTest(final List locs, final boolean readsShouldBeLowQuality, final boolean variantBaseShouldBeLowQuality, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression, final int 
expectedNumberOfReadsAtDeepCoverage) { - this.expectedNumberOfReads = expectedNumberOfReads; - this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression; - this.expectedNumberOfReadsAtDeepCoverage = expectedNumberOfReadsAtDeepCoverage; - this.description = String.format("%d %d %d %b %b", expectedNumberOfReads, expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage, readsShouldBeLowQuality, variantBaseShouldBeLowQuality); - - // first, add the basic reads to the collection - myReads.addAll(basicReads); - - // then add the permuted reads - for ( final GenomeLoc loc : locs ) - myReads.add(createVariantRead(loc, readsShouldBeLowQuality, variantBaseShouldBeLowQuality, CigarOperator.M)); - } - - private ConsensusCreationTest(final List locs, final CigarOperator operator, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression, final int expectedNumberOfReadsAtDeepCoverage) { - this.expectedNumberOfReads = expectedNumberOfReads; - this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression; - this.expectedNumberOfReadsAtDeepCoverage = expectedNumberOfReadsAtDeepCoverage; - this.description = String.format("%s %d %d %d", operator.toString(), expectedNumberOfReads, expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage); - - // first, add the basic reads to the collection - myReads.addAll(basicReads); - - // then add the permuted reads - for ( final GenomeLoc loc : locs ) - myReads.add(createVariantRead(loc, false, false, operator)); - } - - public String toString() { return description; } - - private GATKSAMRecord createVariantRead(final GenomeLoc loc, final boolean readShouldBeLowQuality, - final boolean variantBaseShouldBeLowQuality, final CigarOperator operator) { - - final int startPos = loc.getStart() - 50; - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead" + startPos, 0, startPos, readLength); - - final 
byte[] bases = Utils.dupBytes((byte) 'A', readLength); - // create a mismatch if requested - if ( operator == CigarOperator.M ) - bases[50] = 'C'; - read.setReadBases(bases); - - final byte[] baseQuals = Utils.dupBytes((byte) 30, readLength); - if ( variantBaseShouldBeLowQuality ) - baseQuals[50] = (byte)10; - read.setBaseQualities(baseQuals); - final byte mappingQual = readShouldBeLowQuality ? (byte)10 : (byte)30; - read.setMappingQuality(mappingQual); - - if ( operator != CigarOperator.M ) { - final List elements = new ArrayList(3); - elements.add(new CigarElement(operator == CigarOperator.D ? 50 : 51, CigarOperator.M)); - elements.add(new CigarElement(1, operator)); - elements.add(new CigarElement(operator == CigarOperator.D ? 50 : 48, CigarOperator.M)); - read.setCigar(new Cigar(elements)); - } - - return read; - } - } - - private static final GenomeLoc loc290 = new UnvalidatingGenomeLoc("1", 0, 1000290, 1000290); - private static final GenomeLoc loc295 = new UnvalidatingGenomeLoc("1", 0, 1000295, 1000295); - private static final GenomeLoc loc309 = new UnvalidatingGenomeLoc("1", 0, 1000309, 1000309); - private static final GenomeLoc loc310 = new UnvalidatingGenomeLoc("1", 0, 1000310, 1000310); - private static final GenomeLoc loc320 = new UnvalidatingGenomeLoc("1", 0, 1000320, 1000320); - private static final GenomeLoc loc1100 = new UnvalidatingGenomeLoc("1", 0, 1001100, 1001100); - - private static final int DEEP_COVERAGE_ITERATIONS = 100; - - @DataProvider(name = "ConsensusCreation") - public Object[][] createConsensusCreationTestData() { - List tests = new ArrayList(); - - // test high quality reads and bases - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, false, 2, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, false, 11, 8, 7 + DEEP_COVERAGE_ITERATIONS)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, false, 12, 12, 4 + (8 * 
DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, false, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, false, 13, 13, 4 + (9 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc320), false, false, 13, 12, 6 + (6 * DEEP_COVERAGE_ITERATIONS))}); - - // test low quality reads - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), true, false, 2, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), true, false, 3, 3, 3)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), true, false, 3, 3, 3)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), true, false, 3, 3, 3)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), true, false, 3, 3, 3)}); - - // test low quality bases - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, true, 2, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, true, 2, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, true, 2, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, true, 2, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, true, 2, 2, 2)}); - - // test mixture - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), true, false, 3, 3, 3)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), false, true, 2, 2, 2)}); - - // test I/D operators - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.D, 11, 11, 4 + (7 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new 
ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.D, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.D, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.D, 13, 13, 4 + (9 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.I, 11, 11, 4 + (7 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.I, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.I, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.I, 13, 13, 4 + (9 * DEEP_COVERAGE_ITERATIONS))}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ConsensusCreation", enabled = true) - public void testConsensusCreationTest(ConsensusCreationTest test) { - final ObjectAVLTreeSet knownSNPs = new ObjectAVLTreeSet(); - - // test WITHOUT het compression - SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : test.myReads ) - slidingWindow.addRead(read); - Pair, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty - Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReads); - - // test WITH het compression at KNOWN sites - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : test.myReads ) - slidingWindow.addRead(read); - 
for ( int i = 0; i < 1200; i++ ) - knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i)); - result = slidingWindow.close(knownSNPs); - Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression); - - // test WITH het compression at ALL sites - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : test.myReads ) - slidingWindow.addRead(read); - result = slidingWindow.close(null); - Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression); - - // test with deep coverage - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 0, ReduceReads.DownsampleStrategy.Normal, false); - for ( int i = 0; i < DEEP_COVERAGE_ITERATIONS; i++ ) { - for ( final GATKSAMRecord read : test.myReads ) { - final GATKSAMRecord copy = ArtificialSAMUtils.createArtificialRead(header, read.getReadName() + "_" + (i+1), 0, read.getAlignmentStart(), readLength); - copy.setReadBases(read.getReadBases()); - copy.setBaseQualities(read.getBaseQualities()); - copy.setMappingQuality(read.getMappingQuality()); - copy.setReadNegativeStrandFlag(read.getReadNegativeStrandFlag()); - if ( read.getCigar() != null ) - copy.setCigar(read.getCigar()); - slidingWindow.addRead(copy); - } - } - result = slidingWindow.close(null); - Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsAtDeepCoverage); - } - - @Test - public void testConsensusCreationForMultiallelic() { - - final int totalNumReads = 7; - final ObjectList myReads = new ObjectArrayList(totalNumReads); - - for ( int i = 0; i < totalNumReads; i++ ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition, readLength); - 
read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setMappingQuality(30); - read.setReadNegativeStrandFlag(false); - - final char base = i < totalNumReads - 2 ? 'A' : ( i == totalNumReads - 2 ? 'C' : 'G'); - read.setReadBases(Utils.dupBytes((byte) base, readLength)); - - myReads.add(read); - } - - final ObjectAVLTreeSet knownSNPs = new ObjectAVLTreeSet(); - - // test WITHOUT het compression - SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : myReads ) - slidingWindow.addRead(read); - Pair, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty - Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all - - // test WITH het compression at KNOWN sites - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : myReads ) - slidingWindow.addRead(read); - for ( int i = 0; i < readLength; i++ ) - knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i)); - result = slidingWindow.close(knownSNPs); - Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all - - // test WITH het compression at ALL sites - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : myReads ) - slidingWindow.addRead(read); - result = slidingWindow.close(knownSNPs); - Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all - } - - @Test - public void testConsensusCreationForInsertions() { - - final int totalNumReads = 7; - final ObjectList myReads = new 
ObjectArrayList<>(totalNumReads); - - // add reads, one with a SNP and one with a SNP and insertion - for ( int i = 0; i < totalNumReads; i++ ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition, readLength); - read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setMappingQuality(30); - read.setReadNegativeStrandFlag(false); - - final byte[] bases = Utils.dupBytes((byte) 'A', readLength); - if ( i < 2 ) - bases[20] = 'C'; - if ( i == 0 ) - bases[80] = 'C'; - read.setReadBases(bases); - - if ( i == 0 ) - read.setCigarString("80M1I19M"); - - myReads.add(read); - } - - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : myReads ) - slidingWindow.addRead(read); - final Pair, CompressionStash> result = slidingWindow.close(null); - Assert.assertEquals(result.getFirst().size(), 3); // no compression at all for SNPs - } - - @Test - public void testAddingReadPairWithSameCoordinates() { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10); - - final GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, globalStartPosition, 1); - read1.setReadBases(new byte[]{(byte)'A'}); - read1.setBaseQualities(new byte[]{(byte)'A'}); - read1.setMappingQuality(30); - read1.setReadNegativeStrandFlag(false); - slidingWindow.addRead(read1); - - final GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, globalStartPosition, 1); - read2.setReadBases(new byte[]{(byte)'A'}); - read2.setBaseQualities(new byte[]{(byte)'A'}); - read2.setMappingQuality(30); - read2.setReadNegativeStrandFlag(true); - slidingWindow.addRead(read2); - - Assert.assertEquals(slidingWindow.readsInWindow.size(), 2); - } - - @Test - public void testOnlySpanningReadHasLowQual() { - final 
SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - - final GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "basicRead1", 0, globalStartPosition, 100); - final GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "basicRead2", 0, globalStartPosition + 50, 100); - - final byte[] bases = Utils.dupBytes((byte) 'A', readLength); - read1.setReadBases(bases); - read2.setReadBases(bases); - - final byte[] baseQuals = Utils.dupBytes((byte) 30, readLength); - baseQuals[80] = (byte)10; - read1.setBaseQualities(baseQuals); - read2.setBaseQualities(baseQuals); - - read1.setMappingQuality(30); - read2.setMappingQuality(30); - - slidingWindow.addRead(read1); - slidingWindow.addRead(read2); - - Assert.assertEquals(slidingWindow.close(null).getFirst().size(), 1); - } - - - /////////////////////////////////////////////////////////// - //// This section tests the downsampling functionality //// - /////////////////////////////////////////////////////////// - - @DataProvider(name = "Downsampling") - public Object[][] createDownsamplingTestData() { - List tests = new ArrayList(); - - for ( int i = 1; i < basicReads.size() + 10; i++ ) - tests.add(new Object[]{i}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "Downsampling", enabled = true) - public void testDownsamplingTest(final int dcov) { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false); - final ObjectList result = slidingWindow.downsampleVariantRegion(basicReads); - - Assert.assertEquals(result.size(), Math.min(dcov, basicReads.size())); - } - - @DataProvider(name = "DownsamplingFromClose") - public Object[][] createDownsamplingFromCloseTestData() { - - final ObjectList myReads = new 
ObjectArrayList<>(20); - for ( int i = 0; i < 21; i++ ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read" + i, 0, globalStartPosition, readLength); - final byte[] bases = Utils.dupBytes((byte) 'A', readLength); - if ( i < 5 ) - bases[50] = 'C'; - read.setReadBases(bases); - read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setMappingQuality(30); - read.setReadNegativeStrandFlag(false); - myReads.add(read); - } - - List tests = new ArrayList<>(); - - for ( int i = 1; i < 25; i++ ) - tests.add(new Object[]{myReads, i}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "DownsamplingFromClose", enabled = true) - public void testDownsamplingTestFromClose(final ObjectList myReads, final int dcov) { - - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : myReads ) - slidingWindow.addRead(read); - Pair, CompressionStash> result = slidingWindow.close(new ObjectAVLTreeSet()); // no het compression - - Assert.assertEquals(result.getFirst().size(), Math.min(dcov, myReads.size()), "Down-sampling was not performed correctly"); - } - - @DataProvider(name = "NoDownsamplingForConsensusReads") - public Object[][] createNoDownsamplingForConsensusReadsData() { - - final ObjectList myReads = new ObjectArrayList<>(20); - for ( int i = 0; i < 30; i++ ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read" + i, 0, globalStartPosition, readLength); - final byte[] bases = Utils.dupBytes((byte) 'A', readLength); - if ( i < 10 ) - bases[50] = 'C'; - read.setReadBases(bases); - read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setMappingQuality(30); - read.setReadNegativeStrandFlag(false); - read.setReadNegativeStrandFlag(i % 2 == 0); - myReads.add(read); - } - - List tests = new ArrayList<>(); - 
- for ( int i = 0; i < 5; i++ ) - tests.add(new Object[]{myReads, i}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "NoDownsamplingForConsensusReads", enabled = true) - public void testNoDownsamplingForConsensusReads(final ObjectList myReads, final int dcov) { - - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : myReads ) - slidingWindow.addRead(read); - Pair, CompressionStash> result = slidingWindow.close(null); // allow het compression (so we expect 4 reads) - - Assert.assertEquals(result.getFirst().size(), 4, "Down-sampling was performed on consensus reads!"); - } - - ////////////////////////////////////////////////////////////// - //// This section tests the consensus base quals accuracy //// - ////////////////////////////////////////////////////////////// - - private class QualsTest { - public final List quals; - public final List myReads = new ArrayList(5); - - private QualsTest(final List quals) { - this.quals = quals; - for ( int i = 0; i < quals.size(); i++ ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition, 1); - read.setReadBases(new byte[]{(byte)'A'}); - read.setBaseQualities(new byte[]{quals.get(i).byteValue()}); - read.setMappingQuality(30); - myReads.add(read); - } - } - } - - @DataProvider(name = "ConsensusQuals") - public Object[][] createConsensusQualsData() { - List tests = new ArrayList(); - - final int[] quals = new int[]{ 0, 5, 10, 15, 20, 30, 40, 50 }; - - for ( final int qual1 : quals ) { - for ( final int qual2 : quals ) { - for ( final int qual3 : quals ) { - tests.add(new Object[]{new QualsTest(Arrays.asList(qual1, qual2, qual3))}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - private static final byte minUsableConsensusQual = 10; - - 
@Test(dataProvider = "ConsensusQuals", enabled = true) - public void testConsensusQualsTest(QualsTest test) { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, minUsableConsensusQual, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : test.myReads ) - slidingWindow.addRead(read); - final Pair, CompressionStash> result = slidingWindow.close(new ObjectAVLTreeSet()); - - Assert.assertEquals(result.getFirst().size(), 1); - final GATKSAMRecord read = result.getFirst().iterator().next(); - final int actualBaseQual = read.getReducedCount(0) * read.getBaseQualities()[0]; - final int expectedBaseQual = qualSum(test.quals); - Assert.assertEquals(actualBaseQual, expectedBaseQual); - } - - private static int qualSum(final List quals) { - int goodBases = 0; - int sum = 0; - for ( final int qual : quals ) { - if ( qual >= minUsableConsensusQual ) { - goodBases++; - sum += qual; - } - } - - // handle a low quality consensus - if ( sum == 0 ) { - for ( final int qual : quals ) { - goodBases++; - sum += qual; - } - } - - return sum - (sum % goodBases); - } - - - //////////////////////////////////////////////////// - //// This section tests the new header creation //// - //////////////////////////////////////////////////// - - @DataProvider(name = "CreateNewHeader") - public Object[][] CreateNewHeaderTestData() { - List tests = new ArrayList(); - - for ( final int start : Arrays.asList(-10, -1, 0, 1, 10) ) { - for ( final int stop : Arrays.asList(-10, -1, 0, 1, 10) ) { - tests.add(new Object[]{start, stop}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "CreateNewHeader", enabled = true) - public void createNewHeaderTest(final int start, final int stop) { - - // set up the window header - final int currentHeaderStart = 100; - final int currentHeaderLength = 50; - final LinkedList windowHeader = new LinkedList(); - for ( int i = 
0; i < currentHeaderLength; i++ ) - windowHeader.add(new HeaderElement(currentHeaderStart + i)); - - // set up the read - final int readStart = currentHeaderStart + start; - final int readLength = currentHeaderLength + stop - start; - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, readStart, readLength); - read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); - read.setBaseQualities(Utils.dupBytes((byte) 30, readLength)); - read.setMappingQuality(30); - - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); - int newIndex = slidingWindow.createNewHeaderElements(windowHeader, read, start); - - Assert.assertEquals(newIndex, start > 0 ? start : 0); - - final int expectedNewLength = currentHeaderLength + (start < 0 ? -start : 0) + (stop > 0 ? stop : 0); - Assert.assertEquals(windowHeader.size(), expectedNewLength); - } - - - //////////////////////////////////////////////////////////// - //// This section tests updating the header from a read //// - //////////////////////////////////////////////////////////// - - @DataProvider(name = "UpdateHeaderForRead") - public Object[][] UpdateHeaderForReadTestData() { - List tests = new ArrayList(); - - for ( final int start : Arrays.asList(0, 1, 10) ) { - for ( final int readLength : Arrays.asList(1, 5, 10) ) { - tests.add(new Object[]{start, readLength}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "UpdateHeaderForRead", enabled = true) - public void updateHeaderForReadTest(final int start, final int readLength) { - - // set up the window header - final int currentHeaderStart = 100; - final int currentHeaderLength = 50; - final LinkedList windowHeader = new LinkedList(); - for ( int i = 0; i < currentHeaderLength; i++ ) - windowHeader.add(new HeaderElement(currentHeaderStart + i)); - - // set up the read - 
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart + start, readLength); - read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); - read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setMappingQuality(30); - read.setReadNegativeStrandFlag(false); - - // add the read - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); - slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, start); - for ( int i = 0; i < start; i++ ) - Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 0); - for ( int i = 0; i < readLength; i++ ) - Assert.assertEquals(windowHeader.get(start + i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 1); - for ( int i = start + readLength; i < currentHeaderLength; i++ ) - Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 0); - - // now remove the read - slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, true, start); - for ( int i = 0; i < currentHeaderLength; i++ ) - Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 0); - } - - @Test - public void testUpdateHeaderForReadWithHighMQ() { - - // set up the window header - final int currentHeaderStart = 100; - final LinkedList windowHeader = new LinkedList<>(); - for ( int i = 0; i < readLength; i++ ) - windowHeader.add(new HeaderElement(currentHeaderStart + i)); - - // set up the read - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart, readLength); - read.setReadBases(Utils.dupBytes((byte) 'A', 
readLength)); - read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setMappingQuality(180); - read.setReadNegativeStrandFlag(false); - - // add the read and make sure it's not filtered because of low MQ (byte vs. int) - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); - slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, 0); - for ( int i = 0; i < readLength; i++ ) - Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 1); - } - - ////////////////////////////////////////////////////////////////////////////////// - //// This section tests functionality related to polyploid consensus creation //// - ////////////////////////////////////////////////////////////////////////////////// - - @DataProvider(name = "MatchesKnownProvider") - public Object[][] matchesKnownProvider() { - - final ObjectArrayList tests = new ObjectArrayList(); - - // test no knowns - tests.add(new Object[]{new ObjectAVLTreeSet(), loc290.getStart(), false}); - - final ObjectSortedSet knownSnpPositions = new ObjectAVLTreeSet(); - knownSnpPositions.add(loc290); - knownSnpPositions.add(loc295); - knownSnpPositions.add(loc310); - - // test overlap - tests.add(new Object[]{knownSnpPositions, loc290.getStart(), true}); - tests.add(new Object[]{knownSnpPositions, loc295.getStart(), true}); - tests.add(new Object[]{knownSnpPositions, loc310.getStart(), true}); - tests.add(new Object[]{knownSnpPositions, loc309.getStart(), false}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "MatchesKnownProvider") - public void testMatchesKnown(final ObjectSortedSet knownSnpPositions, final int targetLoc, final boolean expectedResult) { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10); - 
Assert.assertEquals(slidingWindow.matchesKnownPosition(targetLoc, knownSnpPositions), expectedResult); - } - - @DataProvider(name = "SignificantSoftclipsProvider") - public Object[][] SignificantSoftclipsTestData() { - List tests = new ArrayList(); - - for ( final int indexWithSoftclips : Arrays.asList(-1, 0, 5, 9) ) { - for ( final int indexToSkip : Arrays.asList(-1, 0, 5, 9) ) { - tests.add(new Object[]{indexWithSoftclips, indexToSkip}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "SignificantSoftclipsProvider", enabled = true) - public void significantSoftclipsTest(final int indexWithSoftclips, final int indexToSkip) { - - // set up the window header - final int currentHeaderStart = 100; - final int currentHeaderLength = 10; - final LinkedList windowHeader = new LinkedList(); - for ( int i = 0; i < currentHeaderLength; i++ ) - windowHeader.add(new HeaderElement(currentHeaderStart + i)); - - // set up the normal read - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart, currentHeaderLength); - read.setReadBases(Utils.dupBytes((byte) 'A', currentHeaderLength)); - read.setBaseQualities(Utils.dupBytes((byte)30, currentHeaderLength)); - read.setMappingQuality(30); - - // add the read - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); - slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, 0); - - // set up and add a soft-clipped read if requested - if ( indexWithSoftclips != -1 ) { - final GATKSAMRecord softclippedRead = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart + indexWithSoftclips, 1); - softclippedRead.setReadBases(new byte[]{(byte) 'A'}); - softclippedRead.setBaseQualities(new byte[]{(byte) 30}); - softclippedRead.setMappingQuality(30); - softclippedRead.setCigarString("1S"); 
- slidingWindow.actuallyUpdateHeaderForRead(windowHeader, softclippedRead, false, indexWithSoftclips); - } - - final boolean result = slidingWindow.hasPositionWithSignificantSoftclipsOrVariant(windowHeader, currentHeaderStart + indexToSkip); - Assert.assertEquals(result, indexWithSoftclips != -1 && indexWithSoftclips != indexToSkip); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java deleted file mode 100644 index fd1f0de8a..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java +++ /dev/null @@ -1,162 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. 
LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. 
The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.collections.Pair; -import org.junit.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; -import java.util.Random; - -public class BiasedDownsamplingIntegrationTest extends WalkerTest { - - private final static String baseCommandUG = "-T UnifiedGenotyper -R " + hg19Reference + " --no_cmdline_in_header -glm BOTH -L 20:4,000,000-5,000,000"; - private final static String baseCommandHC = "-T HaplotypeCaller -R " + hg19Reference + " --no_cmdline_in_header -L 20:4,000,000-5,000,000" + " --useFilteredReadsForAnnotations"; - - private final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; - - - // -------------------------------------------------------------------------------------------------------------- - // - // testing UnifiedGenotyper contamination down-sampling on BAMs with artificially created contaminated. 
- // - // -------------------------------------------------------------------------------------------------------------- - - @Test - private void testDefaultContamination() { - final String bam1 = "NA11918.with.1.NA12842.reduced.bam"; - final String bam2 = "NA12842.with.1.NA11918.reduced.bam"; - - WalkerTestSpec spec = new WalkerTestSpec( - baseCommandUG + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s -contamination .05 ", 1, - Arrays.asList("b13612312ff991cf40ddc44255e76ecd")); - executeTest("test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " with .05 downsampling.", spec); - } - - - // verify that inputing a file with an effectively flat contamination level is equivalent to handing in a flat contamination level - - - @DataProvider(name="PerSampleEqualFlatContamBams") - public Object[][] makePerSampleEqualFlatContamBams() { - final List tests = new LinkedList(); - tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.6.txt", 0.0}) ; - tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.7.txt", 0.15}) ; - tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.8.txt", 0.3}) ; - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "PerSampleEqualFlatContamBams") - private void testPerSampleEqualsFlat(final String bam1, final String bam2, final String persampleFile, final Double downsampling) { - final String command = baseCommandUG + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s "; - - WalkerTestSpec spec = new WalkerTestSpec( command +" -contaminationFile " + persampleFile, 1, Arrays.asList("")); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - - 
rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result - Pair, List> test1 = executeTest("test contamination on Artificial Contamination, with per-sample file on " + bam1 + " and " + bam2 + " with " + persampleFile, spec); - - spec = new WalkerTestSpec(command + "-contamination " + downsampling.toString(), 1, Arrays.asList("")); - - rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result - Pair, List> test2 = executeTest("test contamination on Artificial Contamination, with flat contamination on " + bam1 + " and " + bam2 + " with " + downsampling.toString(), spec); - - //verify that the md5s match up. - Assert.assertEquals(test1.getSecond().get(0),test2.getSecond().get(0)); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing HaplotypeCaller Contamination Removal - // - // -------------------------------------------------------------------------------------------------------------- - - - - @DataProvider(name="PerSampleEqualFlatContamBamsHC") - public Object[][] makePerSampleEqualFlatContamBamsHC() { - final List tests = new LinkedList(); - tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.6.txt", 0.0 }) ; - tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.7.txt", 0.15}) ; - tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.8.txt", 0.3}) ; - - return tests.toArray(new Object[][]{}); - } - - - @Test(dataProvider = "PerSampleEqualFlatContamBamsHC") - private void testPerSampleEqualsFlatHC(final String bam1, final String bam2, final String persampleFile, final Double downsampling) { - final String command = baseCommandHC + " -I " + 
ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s "; - - WalkerTestSpec spec = new WalkerTestSpec( command +" -contaminationFile " + persampleFile, 1, Arrays.asList("")); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - - rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result - - Pair, List> test1= executeTest("test contamination on Artificial Contamination, with per-sample file on " + bam1 + " and " + bam2 + " with " + persampleFile, spec); - - WalkerTestSpec spec2 = new WalkerTestSpec(command + "-contamination " + downsampling.toString(), 1, Arrays.asList("")); - - rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result - Pair, List> test2=executeTest("test contamination on Artificial Contamination, with flat contamination on " + bam1 + " and " + bam2 + " with " + downsampling.toString(), spec); - - //verify that the md5s match up. - Assert.assertEquals(test1.getSecond().get(0),test2.getSecond().get(0)); - - } - - - -} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java deleted file mode 100644 index 460b80121..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ /dev/null @@ -1,84 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.LSV_ALLELES; - -/** - * Created by IntelliJ IDEA. - * User: delangel - * Date: 4/5/12 - * Time: 11:28 AM - * To change this template use File | Settings | File Templates. 
- */ -public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTest { - - private final UnifiedGenotyperGeneralPloidyTestExecutor executor = new UnifiedGenotyperGeneralPloidyTestExecutor(); - - @Test(enabled = true) - public void testSNP_ACS_Pools() { - executor.PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES", "LSV_SNP_ACS", "SNP", "df0e67c975ef74d593f1c704daab1705"); - } - - @Test(enabled = true) - public void testBOTH_GGA_Pools() { - executor.PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "dac2d7969e109aee9ad2dad573759f58"); - } - - @Test(enabled = true) - public void testINDEL_GGA_Pools() { - executor.PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "ceb105e3db0f2b993e3d725b0d60b6a3"); - } - - @Test(enabled = true) - public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "4dd1b38f0389e339ce8a05956956aa8a"); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java deleted file mode 100644 index 48f36ccc6..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java +++ /dev/null @@ -1,73 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.CEUTRIO_BAM; -import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.NA12891_CALLS; - -public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTest { - - private final UnifiedGenotyperGeneralPloidyTestExecutor executor = new UnifiedGenotyperGeneralPloidyTestExecutor(); - - @Test(enabled = true) - public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","39f559996f8d429839c585bbab68dbde"); - } - - @Test(enabled = true) - public void testMT_SNP_DISCOVERY_sp4() { - executor.PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","5d55b71688a0777a7c0247c376401368"); - } - - @Test(enabled = true) - public void testMT_SNP_GGA_sp10() { - executor.PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "cf336d66a109c55f90e9ed2b3bc196c8"); 
- } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java deleted file mode 100644 index 9556f9bf1..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ /dev/null @@ -1,208 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { - - private final static String baseCommandIndels = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; - - // -------------------------------------------------------------------------------------------------------------- - // - // testing indel caller - // - // -------------------------------------------------------------------------------------------------------------- - // Basic indel testing with SLX data - @Test - public void testSimpleIndels() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndels + - " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam" + - " -o %s" + - " -L 1:10,000,000-10,500,000", - 1, - Arrays.asList("3c8727ee6e2a6f10ab728c4869dd5b92")); - - executeTest(String.format("test indel caller in SLX"), spec); - } - - // Basic indel testing with SLX data - @Test - public void testIndelsWithLowMinAlleleCnt() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndels + - " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam" + - " -o %s" + - " -minIndelCnt 1" + - " -L 1:10,000,000-10,100,000", - 1, - Arrays.asList("0cbe889e03bab6512680ecaebd52c536")); - - executeTest(String.format("test indel caller in SLX with low min allele count"), spec); - } - - @Test - 
public void testMultiTechnologyIndels() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndels + - " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + - " -o %s" + - " -L 1:10,000,000-10,500,000", - 1, - Arrays.asList("3d12bdb816d27bf7c9efb4c13dc2aec7")); - - executeTest(String.format("test indel calling, multiple technologies"), spec); - } - - @Test - public void testWithIndelAllelesPassedIn1() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + - "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("475f8148123792064130faf9f9030fec")); - executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); - } - - @Test - public void testWithIndelAllelesPassedIn2() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " - + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + - "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("a7e4e1bd128424d46cffdd538b220074")); - executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); - } - - @Test(timeOut = 20*1000*60) // this guy can take a long time because it's two steps, so give it 12 minutes - public void testMultiSampleIndels1() { - // since we're going to test the MD5s with GGA only do one here - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("")); - List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); - - WalkerTest.WalkerTestSpec spec2 = new 
WalkerTest.WalkerTestSpec( - baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + - "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("a2c8e83f37cd1e114b42af4b873f57bc")); - executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); - } - - @Test - public void testGGAwithNoEvidenceInReads() { - final String vcf = "small.indel.test.vcf"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + privateTestDir + vcf + " -I " + validationDataLocation + - "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1, - Arrays.asList("d76eacc4021b78ccc0a9026162e814a7")); - executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec); - } - - @Test - public void testBaseIndelQualityScores() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndelsb37 + - " -I " + privateTestDir + "NA12878.100kb.BQSRv2.example.bam" + - " -o %s" + - " -L 20:10,000,000-10,100,000", - 1, - Arrays.asList("8a7966e4b67334bca6083670c5a16b67")); - - executeTest(String.format("test UG with base indel quality scores"), spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing MinIndelFraction - // - // -------------------------------------------------------------------------------------------------------------- - - final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation - + "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030"; - - @Test - public void testMinIndelFraction0() { - WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( - assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("d3721bee5edaa31fdd35edd7aa75feb3")); - executeTest("test minIndelFraction 0.0", spec); - } - - @Test - public void testMinIndelFraction25() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("a5b6d7b32953500d936d3dff512a6254")); - executeTest("test minIndelFraction 0.25", spec); - } - - @Test - public void testMinIndelFraction100() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - assessMinIndelFraction + " -minIndelFrac 1", 1, - Arrays.asList("3f07efb768e08650a7ce333edd4f9a52")); - executeTest("test minIndelFraction 1.0", spec); - } - - // No testing of MD5 here, we previously blew up due to a 0 length haplotypes, so we just need to pass - @Test - public void testHaplotype0Length() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --disableDithering -I " + privateTestDir + "haplotype0.bam -L 20:47507681 -R " + b37KGReference + " -baq CALCULATE_AS_NECESSARY -glm BOTH -o /dev/null", - 0, - Collections.emptyList()); - executeTest("testHaplotype0Length", spec); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java deleted file mode 100644 index 2cdddd49f..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ /dev/null @@ -1,385 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import net.sf.samtools.util.BlockCompressedInputStream; -import org.broad.tribble.readers.AsciiLineReader; -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -// ********************************************************************************** // -// Note that this class also serves as an integration test for the VariantAnnotator! 
// -// ********************************************************************************** // - -public class UnifiedGenotyperIntegrationTest extends WalkerTest { - - private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam"; - - // -------------------------------------------------------------------------------------------------------------- - // - // testing parameters - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void testMinBaseQualityScore() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("30be17df00acc8a92223f51fe7c1bdf7")); - executeTest("test min_base_quality_score 26", spec); - } - - @Test - public void testSLOD() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("bc8a4e4ceb46776169b47146805c882a")); - executeTest("test SLOD", spec); - } - - @Test - public void testNDA() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - 
Arrays.asList("17f65eca1e6c1f06919a58f230b6d8d3")); - executeTest("test NDA", spec); - } - - @Test - public void testCompTrack() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("21185d9a7519356ba672757f5a522971")); - executeTest("test using comp track", spec); - } - - @Test(enabled = false) // EB: for some reason this test crashes whenever I run it on my local machine - public void testNoCmdLineHeaderStdout() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandNoCmdLineHeaderStdout + " -glm INDEL -L 1:67,225,396-67,288,518", 0, - Collections.emptyList()); - executeTest("testNoCmdLineHeaderStdout", spec); - } - - @Test - public void testOutputParameterSitesOnly() { - testOutputParameters("-sites_only", "48cd40d3994911a6f2609bfd375e1d2d"); - } - - @Test - public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "28f40ce47651f504158fc4e5bb58df4b"); - } - - @Test - public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "5259dafaa1b57d9489003b16a48e35f8"); - } - - private void testOutputParameters(final String args, final String md5) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 " + args, 1, - Arrays.asList(md5)); - executeTest(String.format("testParameter[%s]", args), spec); - } - - @Test - public void testConfidence() { - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 
", 1, - Arrays.asList("918109938ef355d759dafc3ebb47d8a5")); - executeTest("test confidence 1", spec1); - } - - @Test - public void testNoPrior() { - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.33333 -inputPrior 0.33333", 1, - Arrays.asList("7ac60bdc355d97c0939e644b58de47d7")); - executeTest("test no prior 1", spec1); - - } - @Test - public void testUserPrior() { - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.001 -inputPrior 0.495", 1, - Arrays.asList("04d05900849d5a3f6f3f98bd0f262369")); - executeTest("test user prior 1", spec1); - - } - - @Test - public void emitPLsAtAllSites() { - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --output_mode EMIT_ALL_SITES -allSitePLs", 1, - Arrays.asList("552aced1b1ef7e4a554223f4719f9560")); - // GDA: TODO: BCF encoder/decoder doesn't seem to support non-standard values in genotype fields. 
IE even if there is a field defined in FORMAT and in the header the BCF2 encoder will still fail - spec1.disableShadowBCF(); - - executeTest("test all site PLs 1", spec1); - - } - // -------------------------------------------------------------------------------------------------------------- - // - // testing heterozygosity - // - // -------------------------------------------------------------------------------------------------------------- - @Test - public void testHeterozyosity1() { - testHeterozosity( 0.01, "2f3051caa785c7c1e2a8b23fa4da90b1" ); - } - - @Test - public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "228df9e38580d8ffe1134da7449fa35e" ); - } - - private void testHeterozosity(final double arg, final String md5) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000 --heterozygosity " + arg, 1, - Arrays.asList(md5)); - executeTest(String.format("test heterozyosity[%s]", arg), spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing compressed output - // - // -------------------------------------------------------------------------------------------------------------- - - private final static String COMPRESSED_OUTPUT_MD5 = "eebec02fdde9937bffaf44902ace6207"; - - @Test - public void testCompressedOutput() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("gz"), Arrays.asList(COMPRESSED_OUTPUT_MD5)); - executeTest("test compressed output", spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing parallelization - // - // 
-------------------------------------------------------------------------------------------------------------- - - @Test - public void testParallelization() { - - // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - - String md5 = "1f3fad09a63269c36e871e7ee04ebfaa"; - final String myCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, - Arrays.asList(md5)); - executeTest("test parallelization (single thread)", spec1); - - GenomeAnalysisEngine.resetRandomGenerator(); - - WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( - myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 2", 1, - Arrays.asList(md5)); - executeTest("test parallelization (2 threads)", spec2); - - GenomeAnalysisEngine.resetRandomGenerator(); - - WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( - myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 4", 1, - Arrays.asList(md5)); - executeTest("test parallelization (4 threads)", spec3); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing calls with SLX, 454, and SOLID data - // - // -------------------------------------------------------------------------------------------------------------- - @Test - public void testMultiTechnologies() { - WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( - baseCommand + - " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + - " -o %s" + - " -L 1:10,000,000-10,100,000", - 1, - Arrays.asList("150b31ba05113ca1996b548be5170d6d")); - - executeTest(String.format("test multiple technologies"), spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing calls with BAQ - // - // -------------------------------------------------------------------------------------------------------------- - @Test - public void testCallingWithBAQ() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + - " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + - " -o %s" + - " -L 1:10,000,000-10,100,000" + - " -baq CALCULATE_AS_NECESSARY", - 1, - Arrays.asList("7d0ee85cd89f4addd84c5511daaaa5c5")); - - executeTest(String.format("test calling with BAQ"), spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing SnpEff - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void testSnpEffAnnotationRequestedWithoutRodBinding() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000 " + - "-A SnpEff", - 1, - UserException.class); - executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing Ns in CIGAR - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void testNsInCigar() { - final WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1, - UserException.UnsupportedCigarOperatorException.class); - - executeTest("test calling on reads with Ns in CIGAR", spec); - } - - @Test(enabled = true) - public void testCompressedVCFOutputWithNT() throws Exception { - WalkerTestSpec spec = new WalkerTestSpec("-T UnifiedGenotyper -R " + b37KGReference + " -I " - + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam" - + " -o %s -L 20:10,000,000-10,100,000 -nt 4", - 1, Arrays.asList("vcf.gz"), Arrays.asList("")); - final File vcf = executeTest("testCompressedVCFOutputWithNT", spec).first.get(0); - final AsciiLineReader reader = new AsciiLineReader(new BlockCompressedInputStream(vcf)); - int nLines = 0; - while ( reader.readLine() != null ) - nLines++; - Assert.assertTrue(nLines > 0); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing only emit samples - // - // -------------------------------------------------------------------------------------------------------------- - - @Test(enabled = true) - public void testOnlyEmitSample() throws Exception { - final String base = "-T UnifiedGenotyper -R " + b37KGReference + " -I " - + privateTestDir + "AFR.complex.variants.bam --disableDithering" - + " -o %s -L 20:10,000,000-10,100,000"; - final WalkerTestSpec specAllSamples = new WalkerTestSpec(base, 1, Arrays.asList("")); - specAllSamples.disableShadowBCF(); - final File allSamplesVCF = executeTest("testOnlyEmitSampleAllSamples", specAllSamples).first.get(0); - final List allSampleVCs = GATKVCFUtils.readVCF(allSamplesVCF).getSecond(); - - final WalkerTestSpec onlyHG01879 = new WalkerTestSpec(base + " -onlyEmitSamples HG01879", 1, Arrays.asList("")); - onlyHG01879.disableShadowBCF(); - final File onlyHG01879VCF = 
executeTest("testOnlyEmitSample", onlyHG01879).first.get(0); - final List onlyHG01879VCs = GATKVCFUtils.readVCF(onlyHG01879VCF).getSecond(); - - Assert.assertEquals(allSampleVCs.size(), onlyHG01879VCs.size()); - for ( int i = 0; i < allSampleVCs.size(); i++ ) { - final VariantContext allSampleVC = allSampleVCs.get(i); - final VariantContext onlyHG01879VC = onlyHG01879VCs.get(i); - - if ( allSampleVC == null ) { - Assert.assertNull(onlyHG01879VC); - } else { - Assert.assertNotNull(onlyHG01879VC); - - Assert.assertTrue(allSampleVC.getGenotypes().size() > 1, "All samples should have had more than 1 genotype, but didn't"); - Assert.assertEquals(onlyHG01879VC.getGenotypes().size(), 1, "Should have found a single sample genotype, but didn't"); - Assert.assertEquals(onlyHG01879VC.hasGenotype("HG01879"), true); - - Assert.assertEquals(allSampleVC.getStart(), onlyHG01879VC.getStart()); - Assert.assertEquals(allSampleVC.getChr(), onlyHG01879VC.getChr()); - Assert.assertEquals(allSampleVC.getEnd(), onlyHG01879VC.getEnd()); - Assert.assertEquals(allSampleVC.getFilters(), onlyHG01879VC.getFilters()); - Assert.assertEquals(allSampleVC.getAlleles(), onlyHG01879VC.getAlleles()); - Assert.assertEquals(allSampleVC.getAttributes(), onlyHG01879VC.getAttributes()); - Assert.assertEquals(allSampleVC.getPhredScaledQual(), onlyHG01879VC.getPhredScaledQual()); - - final Genotype allG = allSampleVC.getGenotype("HG01879"); - final Genotype onlyG = onlyHG01879VC.getGenotype("HG01879"); - Assert.assertEquals(allG.getAD(), onlyG.getAD()); - Assert.assertEquals(allG.getDP(), onlyG.getDP()); - Assert.assertEquals(allG.getAlleles(), onlyG.getAlleles()); - Assert.assertEquals(allG.getPL(), onlyG.getPL()); - Assert.assertEquals(allG.toString(), onlyG.toString()); - } - } - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java deleted file mode 100644 index 18554e157..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ /dev/null @@ -1,126 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ - - private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - - // -------------------------------------------------------------------------------------------------------------- - // - // testing normal calling - // - // -------------------------------------------------------------------------------------------------------------- - @Test - public void testMultiSamplePilot1() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("ec0977e3fd3e2ac29c9821f0ca830455")); - executeTest("test MultiSample Pilot1", spec); - } - - @Test - public void testWithAllelesPassedIn1() { - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("ebfcc3dd8c1788929cb50050c5d456df")); - executeTest("test MultiSample Pilot2 with alleles passed in", spec1); - } - - @Test - public void testWithAllelesPassedIn2() { - WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( - baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("3e646003c5b93da80c7d8e5d0ff2ee4e")); - 
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); - } - - @Test - public void testSingleSamplePilot2() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("02b521fe88a6606a29c12c0885c3debd")); - executeTest("test SingleSample Pilot2", spec); - } - - @Test - public void testMultipleSNPAlleles() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("dd5ad3beaa75319bb2ef1434d2dd9f73")); - executeTest("test Multiple SNP alleles", spec); - } - - @Test - public void testBadRead() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, - Arrays.asList("d915535c1458733f09f82670092fcab6")); - executeTest("test bad read", spec); - } - - @Test - public void testReverseTrim() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("a973298b2801b80057bea88507e2858d")); - executeTest("test reverse trim", spec); - } - - @Test - public void testMismatchedPLs() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper 
--contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("8d91d98c4e79897690d3c6918b6ac761")); - executeTest("test mismatched PLs", spec); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java deleted file mode 100644 index 3b5690046..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java +++ /dev/null @@ -1,87 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { - - // -------------------------------------------------------------------------------------------------------------- - // - // testing reduced reads - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void testReducedBam() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("ffde0d5e23523e4bd9e7e18f62d37d0f")); - executeTest("test calling on a ReducedRead BAM", spec); - } - - @Test - public void testReducedBamSNPs() { - testReducedCalling("SNP", "e8de8c523751ad2fa2ee20185ba5dea7"); - } - - @Test - public void testReducedBamINDELs() { - testReducedCalling("INDEL", "942930038cf7fc9a80b969461aaa9aa6"); - } - - - private void testReducedCalling(final String model, final String md5) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1, - Arrays.asList(md5)); - executeTest("test calling on a ReducedRead BAM with " + model, spec); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java deleted file 
mode 100644 index 550153be0..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java +++ /dev/null @@ -1,222 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. 
ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - - -// SEE private/R/pls.R if you want the truth output for these tests -public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest { - @DataProvider(name = "TestCombineGLs") - public Object[][] makeTestCombineGLs() { - List tests = new ArrayList(); - - tests.add(new Object[]{1, 1, makePL( 0, 10, 20), makePL( 0, 10, 20)}); - tests.add(new Object[]{1, 1, makePL(10, 0, 20), makePL(10, 0, 20)}); - tests.add(new Object[]{1, 1, makePL(20, 10, 0), makePL(20, 
10, 0)}); - - // AA AB BB AC BC CC => AA AB+BC CC - tests.add(new Object[]{1, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 10, 20)}); - tests.add(new Object[]{2, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 30, 50)}); - - tests.add(new Object[]{1, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)}); - tests.add(new Object[]{2, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)}); - - tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5)}); - tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9)}); - - tests.add(new Object[]{1, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)}); - tests.add(new Object[]{2, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)}); - - tests.add(new Object[]{1, 2, makePL( 50, 0, 50, 50, 50, 50), makePL(45, 0, 50)}); - tests.add(new Object[]{2, 2, makePL( 50, 0, 50, 50, 50, 50), makePL( 0, 47, 50)}); - - tests.add(new Object[]{1, 2, makePL( 50, 50, 0, 50, 50, 50), makePL(45, 47, 0)}); - tests.add(new Object[]{2, 2, makePL( 50, 50, 0, 50, 50, 50), makePL( 0, 47, 50)}); - - tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(0, 47, 50)}); - tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(45, 0, 50)}); - - tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)}); - tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)}); - - tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(0, 47, 50)}); - tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(45, 47, 0)}); - - return tests.toArray(new Object[][]{}); - } - - private Genotype makePL(final int ... 
PLs) { - return AFCalcUnitTest.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), PLs); - } - - @Test(enabled = true, dataProvider = "TestCombineGLs") - private void testCombineGLs(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) { - final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); - final Genotype combined = calc.combineGLs(testg, altIndex, nAlts); - - Assert.assertEquals(combined.getPL(), expected.getPL(), - "Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + Utils.join(",", expected.getPL())); - } - - - static Allele A = Allele.create("A", true); - static Allele C = Allele.create("C"); - static Allele G = Allele.create("G"); - - @DataProvider(name = "TestMakeAlleleConditionalContexts") - public Object[][] makeTestMakeAlleleConditionalContexts() { - List tests = new ArrayList(); - - final VariantContextBuilder root = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A)); - final VariantContextBuilder vcAC = new VariantContextBuilder(root).alleles(Arrays.asList(A, C)); - final VariantContextBuilder vcAG = new VariantContextBuilder(root).alleles(Arrays.asList(A, G)); - final VariantContextBuilder vcACG = new VariantContextBuilder(root).alleles(Arrays.asList(A, C, G)); - final VariantContextBuilder vcAGC = new VariantContextBuilder(root).alleles(Arrays.asList(A, G, C)); - - final Genotype gACG = makePL( 0, 1, 2, 3, 4, 5); - final Genotype gAGC = makePL( 0, 4, 5, 1, 3, 2); - final Genotype gACcombined = makePL(0, 2, 5); - final Genotype gACcombined2 = makePL(0, 1, 4); - final Genotype gAGcombined = makePL(0, 4, 9); - - // biallelic - tests.add(new Object[]{vcAC.genotypes(gACcombined).make(), Arrays.asList(vcAC.genotypes(gACcombined).make())}); - - // tri-allelic - tests.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), 
vcAG.genotypes(gAGcombined).make())}); - tests.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACcombined2).make())}); - - return tests.toArray(new Object[][]{}); - } - - - @Test(enabled = true, dataProvider = "TestMakeAlleleConditionalContexts") - private void testMakeAlleleConditionalContexts(final VariantContext vc, final List expectedVCs) { - final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); - final List biAllelicVCs = calc.makeAlleleConditionalContexts(vc); - - Assert.assertEquals(biAllelicVCs.size(), expectedVCs.size()); - - for ( int i = 0; i < biAllelicVCs.size(); i++ ) { - final VariantContext actual = biAllelicVCs.get(i); - final VariantContext expected = expectedVCs.get(i); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles()); - - for ( int j = 0; j < actual.getNSamples(); j++ ) - Assert.assertEquals(actual.getGenotype(j).getPL(), expected.getGenotype(j).getPL(), - "expected PLs " + Utils.join(",", expected.getGenotype(j).getPL()) + " not equal to actual " + Utils.join(",", actual.getGenotype(j).getPL())); - } - } - - - @DataProvider(name = "ThetaNTests") - public Object[][] makeThetaNTests() { - List tests = new ArrayList(); - - final List log10LAlleles = Arrays.asList(0.0, -1.0, -2.0, -3.0, -4.0); - - for ( final double log10pRef : Arrays.asList(-1, -2, -3) ) { - for ( final int ploidy : Arrays.asList(1, 2, 3, 4) ) { - for ( List permutations : Utils.makePermutations(log10LAlleles, ploidy, true)) { - tests.add(new Object[]{permutations, Math.pow(10, log10pRef)}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ThetaNTests") - public void testThetaNTests(final List log10LAlleles, final double pRef) { - // biallelic - final double[] rawPriors = MathUtils.toLog10(new double[]{pRef, 1-pRef}); - - final double log10pNonRef = 
Math.log10(1-pRef); - - final List originalPriors = new LinkedList(); - final List pNonRefN = new LinkedList(); - for ( int i = 0; i < log10LAlleles.size(); i++ ) { - final double log10LAllele1 = log10LAlleles.get(i); - final double[] L1 = MathUtils.normalizeFromLog10(new double[]{log10LAllele1, 0.0}, true); - final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, -10000.0)); - originalPriors.add(result1); - pNonRefN.add(log10pNonRef*(i+1)); - } - - final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 2); - final List thetaNPriors = calc.applyMultiAllelicPriors(originalPriors); - - double prevPosterior = 0.0; - for ( int i = 0; i < log10LAlleles.size(); i++ ) { - final AFCalcResult thetaN = thetaNPriors.get(i); - AFCalcResult orig = null; - for ( final AFCalcResult x : originalPriors ) - if ( x.getAllelesUsedInGenotyping().equals(thetaN.getAllelesUsedInGenotyping())) - orig = x; - - Assert.assertNotNull(orig, "couldn't find original AFCalc"); - - Assert.assertEquals(orig.getLog10PriorOfAFGT0(), log10pNonRef, 1e-6); - Assert.assertEquals(thetaN.getLog10PriorOfAFGT0(), pNonRefN.get(i), 1e-6); - - Assert.assertTrue(orig.getLog10PosteriorOfAFGT0() <= prevPosterior, "AFCalc results should be sorted but " + prevPosterior + " is > original posterior " + orig.getLog10PosteriorOfAFGT0()); - prevPosterior = orig.getLog10PosteriorOfAFGT0(); - } - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java deleted file mode 100644 index 564a475b0..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java +++ /dev/null @@ -1,249 +0,0 @@ -/* -* By downloading the PROGRAM 
you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.RandomDNA; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.pairhmm.ActiveRegionTestDataSet; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * Tests for {@link AssemblyResultSet}. 
- * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class AssemblyResultSetUnitTest extends BaseTest -{ - private GenomeLocParser genomeLocParser; - private SAMFileHeader header; - - @BeforeClass - public void init() { - header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); - } - - - @Test - public void testEmptyResultSet() { - final AssemblyResultSet subject = new AssemblyResultSet(); - - Assert.assertEquals(subject.getHaplotypeList().size(), 0); - Assert.assertEquals(subject.getHaplotypeCount(),0); - Assert.assertEquals(subject.getReferenceHaplotype(),null); - Assert.assertEquals(subject.getFullReferenceWithPadding(),null); - Assert.assertEquals(subject.getPaddedReferenceLoc(),null); - Assert.assertEquals(subject.getRegionForGenotyping(),null); - Assert.assertEquals(subject.getUniqueReadThreadingGraph(10),null); - Assert.assertFalse(subject.hasMultipleKmerSizes()); - } - - @Test - public void testAddReferenceHaplotype() { - - final Haplotype ref = new Haplotype("ACGT".getBytes(),true); - ref.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,ref.length() + 1 )); - final AssemblyResultSet subject = new AssemblyResultSet(); - - Assert.assertTrue(subject.add(ref)); - Assert.assertFalse(subject.add(ref)); - - Assert.assertEquals(subject.getReferenceHaplotype(),ref); - Assert.assertEquals(subject.getHaplotypeCount(),1); - Assert.assertEquals(subject.getHaplotypeList().size(),1); - } - - @Test(dataProvider="assemblyResults") - public void testAddManyHaplotypes(final java.util.List assemblyResults, - final java.util.List> haplotypes) { - final AssemblyResultSet subject = new AssemblyResultSet(); - for (int i = 0; i < haplotypes.size(); i++) { - final int haplotypeCountBefore = subject.getHaplotypeCount(); - final java.util.List haplos = haplotypes.get(i); - final AssemblyResult ar = assemblyResults.get(i); - for (final Haplotype h : haplos) { - 
Assert.assertTrue(subject.add(h, ar)); - Assert.assertFalse(subject.add(h,ar)); - if (h.isReference()) - Assert.assertEquals(subject.getReferenceHaplotype(),h); - } - final int haplotypeCountAfter = subject.getHaplotypeCount(); - Assert.assertEquals(haplos.size(),haplotypeCountAfter - haplotypeCountBefore); - Assert.assertTrue(subject.getMaximumKmerSize() >= ar.getKmerSize()); - Assert.assertTrue(subject.getMinimumKmerSize() <= ar.getKmerSize()); - Assert.assertEquals(subject.getUniqueReadThreadingGraph(ar.getKmerSize()), ar.getThreadingGraph()); - } - } - - @Test(dataProvider="trimmingData") - public void testTrimTo(final Map haplotypesAndResultSets, final ActiveRegion original) { - final AssemblyResultSet subject = new AssemblyResultSet(); - for (final Map.Entry entry : haplotypesAndResultSets.entrySet()) - subject.add(entry.getKey(),entry.getValue()); - subject.setRegionForGenotyping(original); - final GenomeLoc originalLocation = original.getExtendedLoc(); - final int length = originalLocation.size(); - final GenomeLoc newLocation = originalLocation.setStop(originalLocation.setStart(originalLocation,originalLocation.getStart() + length / 2),originalLocation.getStop() - length / 2); - final ActiveRegion newRegion = original.trim(newLocation); - - final Map originalHaplotypesByTrimmed = new HashMap<>(haplotypesAndResultSets.size()); - for (final Haplotype h : haplotypesAndResultSets.keySet()) - originalHaplotypesByTrimmed.put(h.trim(newRegion.getExtendedLoc()), h); - - final AssemblyResultSet trimmed = subject.trimTo(newRegion, originalHaplotypesByTrimmed); - - Assert.assertFalse(subject.wasTrimmed()); - Assert.assertTrue(trimmed.wasTrimmed()); - - for (final Haplotype h : trimmed.getHaplotypeList()) { - Assert.assertEquals(h.getGenomeLocation(),newLocation); - Assert.assertEquals(h.getBases().length,newLocation.size()); - } - } - - @DataProvider(name="trimmingData") - public Iterator trimmingData() { - final ActiveRegion activeRegion = new 
ActiveRegion(genomeLocParser.createGenomeLoc("chr1",1000,1100),genomeLocParser,25); - final int length = activeRegion.getExtendedLoc().size(); - final RandomDNA rnd = new RandomDNA(13); // keep it prepoducible by fixing the seed to lucky 13. - final ActiveRegionTestDataSet actd = new ActiveRegionTestDataSet(10,new String(rnd.nextBases(length)),new String[] { - "Civar:*1T*" }, new String[0], new byte[0], new byte[0], new byte[0]); - - final List haplotypes = actd.haplotypeList(); - for (final Haplotype h : haplotypes) - h.setGenomeLocation(activeRegion.getExtendedLoc()); - - final ReadThreadingGraph rtg = new ReadThreadingGraph(10); - for (final Haplotype h : haplotypes) - rtg.addSequence("seq-" + Math.abs(h.hashCode()), h.getBases(), null, h.isReference()); - final SeqGraph seqGraph = rtg.convertToSequenceGraph(); - final AssemblyResult ar = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,seqGraph); - ar.setThreadingGraph(rtg); - final Map result = - new HashMap<>(); - for (final Haplotype h : haplotypes) - result.put(h,ar); - return Collections.singleton(new Object[] {result,activeRegion}).iterator(); - - } - - - - - @DataProvider(name="assemblyResults") - public java.util.Iterator assemblyResults() { - final int size = THREE_KS_GRAPH_AND_HAPLOTYPES.length * (1 + TEN_KS_GRAPH_AND_HAPLOTYPES.length); - final Object[][] result = new Object[size][]; - - for (int i = 0; i < THREE_KS_GRAPH_AND_HAPLOTYPES.length; i++) { - final ReadThreadingGraph rtg = new ReadThreadingGraph((String) THREE_KS_GRAPH_AND_HAPLOTYPES[i][0]); - final AssemblyResult ar = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,rtg.convertToSequenceGraph()); - ar.setThreadingGraph(rtg); - final Object[] haplotypeStrings = (Object[]) THREE_KS_GRAPH_AND_HAPLOTYPES[i][1]; - final Haplotype[] haplotypes = new Haplotype[haplotypeStrings.length]; - for (int j = 0; j < haplotypeStrings.length; j++) { - haplotypes[j] = new Haplotype(((String)haplotypeStrings[j]).getBytes(),j 
== 0); - haplotypes[j].setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,haplotypes[j].length() + 1)); - } - result[i] = new Object[] { Collections.singletonList(ar),Arrays.asList(Arrays.asList(haplotypes))}; - for (int j = 0; j < TEN_KS_GRAPH_AND_HAPLOTYPES.length; j++) { - final ReadThreadingGraph rtg10 = new ReadThreadingGraph((String) TEN_KS_GRAPH_AND_HAPLOTYPES[j][0]); - final AssemblyResult ar10 = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,rtg10.convertToSequenceGraph()); - ar10.setThreadingGraph(rtg10); - final Object[] haplotypeStrings10 = (Object[]) TEN_KS_GRAPH_AND_HAPLOTYPES[j][1]; - final Haplotype[] haplotype10 = new Haplotype[haplotypeStrings10.length]; - for (int k = 0; k < haplotypeStrings10.length; k++) { - haplotype10[k] = new Haplotype(((String)haplotypeStrings10[k]).getBytes(),false); - haplotype10[k].setGenomeLocation(genomeLocParser.createGenomeLoc("chr1", 1, haplotype10[k].length() + 1)); - } - - result[THREE_KS_GRAPH_AND_HAPLOTYPES.length + i * TEN_KS_GRAPH_AND_HAPLOTYPES.length + j] = new Object[] { Arrays.asList(ar,ar10), - Arrays.asList( Arrays.asList(haplotypes), Arrays.asList(haplotype10)) }; - } - } - return Arrays.asList(result).iterator(); - } - - - private static final Object[][] THREE_KS_GRAPH_AND_HAPLOTYPES = new Object[][] { - {"[ks=3]{REF: ACT}",new Object[] {"ACT"}}, - {"[ks=3]{REF: ACT(3) -> T(1) -> G(2) -> A}" + - "{ (3) -> A -> G -> (2) }" + - "{ (1) -> A -> G -> (2) }",new Object[] {"ACTTGA","ACTAGGA","ACTTAGGA"}}, - {"[ks=3]{REF: ACT -> C(1) -> G}{ACT -> C(1) -> G}{ACT -> C(1) -> G}", new Object[] {"ACTCG"}} , - {"[ks=3]{REF: ACT -> A(1) -> G -> A(2) -> C -> G -> T }" + - "{A(1) -> T -> A(2) }", new Object[] {"ACTAGACGT","ACTATACGT"}} , - {"[ks=3]{REF: ACT -> A -> T(2) -> C -> A -> G -> T -> A -> C -> G -> T -> A(1) -> T}" + - "{ ACT -> A -> T(2) -> C -> T -> A -> C -> G -> T -> A(1) -> T}", - new Object[] {"ACTATCAGTACGTAT","ACTATCTACGTAT"}} , - {"[ks=3]{REF: ACT -> A -> T -> C -> A -> G -> 
T -> A -> C -> G -> T -> A -> T}", - new Object[] {"ACTATCAGTACGTAT"}}, - {"[ks=3]{REF: ACT -> A -> T(1) }" + - "{ ACT -> A -> T(1) }", new Object[] {"ACTAT"}}, - {"[ks=3]{REF: TTT -> A(1) -> C -> T(2)}{ A(1) -> T(2) } ", new Object[] {"TTTACT","TTTAT"}} - }; - - private static final Object[][] TEN_KS_GRAPH_AND_HAPLOTYPES = new Object[][] { - {"[ks=10]{ACTAGTAAAT -> A -> T -> A -> A -> T -> A", new Object[] {"ACTAGTAAATATAATA"}}, - {"[ks=10]{ATAGTAATAA(1) -> A -> C -> T -> A(2) -> C}{ (1) -> C -> C -> C -> A(2) -> C}", - new Object[] {"ATAGTAATAAACTAC","ATAGTAATAACCCAC"}}, - - }; - -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java deleted file mode 100644 index 8633a1d9d..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java +++ /dev/null @@ -1,305 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: 3/15/12 - */ - -import net.sf.picard.reference.ReferenceSequenceFile; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; - -/** - * Unit tests for GenotypingEngine - */ -public class GenotypingEngineUnitTest extends BaseTest { - - private static ReferenceSequenceFile seq; - private GenomeLocParser genomeLocParser; - - @BeforeClass - public void init() throws FileNotFoundException { - // sequence - seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); - genomeLocParser = new GenomeLocParser(seq); - } - - @Test - public void testFindHomVarEventAllelesInSample() { - final List eventAlleles = new ArrayList(); - eventAlleles.add( Allele.create("A", true) ); - eventAlleles.add( Allele.create("C", false) ); - final List haplotypeAlleles = new ArrayList(); - haplotypeAlleles.add( Allele.create("AATA", true) ); - haplotypeAlleles.add( Allele.create("AACA", false) ); - haplotypeAlleles.add( Allele.create("CATA", false) ); - haplotypeAlleles.add( Allele.create("CACA", false) ); - final List haplotypes = new ArrayList(); - haplotypes.add(new Haplotype("AATA".getBytes())); - haplotypes.add(new Haplotype("AACA".getBytes())); - haplotypes.add(new Haplotype("CATA".getBytes())); - haplotypes.add(new Haplotype("CACA".getBytes())); - final 
List haplotypeAllelesForSample = new ArrayList(); - haplotypeAllelesForSample.add( Allele.create("CATA", false) ); - haplotypeAllelesForSample.add( Allele.create("CACA", false) ); - final List> alleleMapper = new ArrayList>(); - List Aallele = new ArrayList(); - Aallele.add(haplotypes.get(0)); - Aallele.add(haplotypes.get(1)); - List Callele = new ArrayList(); - Callele.add(haplotypes.get(2)); - Callele.add(haplotypes.get(3)); - alleleMapper.add(Aallele); - alleleMapper.add(Callele); - final List eventAllelesForSample = new ArrayList(); - eventAllelesForSample.add( Allele.create("C", false) ); - eventAllelesForSample.add( Allele.create("C", false) ); - - if(!compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))) { - logger.warn("calc alleles = " + GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes)); - logger.warn("expected alleles = " + eventAllelesForSample); - } - Assert.assertTrue(compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))); - } - - @Test - public void testFindHetEventAllelesInSample() { - final List eventAlleles = new ArrayList(); - eventAlleles.add( Allele.create("A", true) ); - eventAlleles.add( Allele.create("C", false) ); - eventAlleles.add( Allele.create("T", false) ); - final List haplotypeAlleles = new ArrayList(); - haplotypeAlleles.add( Allele.create("AATA", true) ); - haplotypeAlleles.add( Allele.create("AACA", false) ); - haplotypeAlleles.add( Allele.create("CATA", false) ); - haplotypeAlleles.add( Allele.create("CACA", false) ); - haplotypeAlleles.add( Allele.create("TACA", false) ); - haplotypeAlleles.add( Allele.create("TTCA", false) ); - haplotypeAlleles.add( Allele.create("TTTA", false) ); - final List haplotypes = new ArrayList(); 
- haplotypes.add(new Haplotype("AATA".getBytes())); - haplotypes.add(new Haplotype("AACA".getBytes())); - haplotypes.add(new Haplotype("CATA".getBytes())); - haplotypes.add(new Haplotype("CACA".getBytes())); - haplotypes.add(new Haplotype("TACA".getBytes())); - haplotypes.add(new Haplotype("TTCA".getBytes())); - haplotypes.add(new Haplotype("TTTA".getBytes())); - final List haplotypeAllelesForSample = new ArrayList(); - haplotypeAllelesForSample.add( Allele.create("TTTA", false) ); - haplotypeAllelesForSample.add( Allele.create("AATA", true) ); - final List> alleleMapper = new ArrayList>(); - List Aallele = new ArrayList(); - Aallele.add(haplotypes.get(0)); - Aallele.add(haplotypes.get(1)); - List Callele = new ArrayList(); - Callele.add(haplotypes.get(2)); - Callele.add(haplotypes.get(3)); - List Tallele = new ArrayList(); - Tallele.add(haplotypes.get(4)); - Tallele.add(haplotypes.get(5)); - Tallele.add(haplotypes.get(6)); - alleleMapper.add(Aallele); - alleleMapper.add(Callele); - alleleMapper.add(Tallele); - final List eventAllelesForSample = new ArrayList(); - eventAllelesForSample.add( Allele.create("A", true) ); - eventAllelesForSample.add( Allele.create("T", false) ); - - if(!compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))) { - logger.warn("calc alleles = " + GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes)); - logger.warn("expected alleles = " + eventAllelesForSample); - } - Assert.assertTrue(compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))); - } - - private boolean compareAlleleLists(List l1, List l2) { - if( l1.size() != l2.size() ) { - return false; // sanity check - } - - for( int i=0; i < l1.size(); i++ ){ - if ( !l2.contains(l1.get(i)) 
) - return false; - } - return true; - } - - - private class BasicGenotypingTestProvider extends TestDataProvider { - byte[] ref; - byte[] hap; - Map expected; - - public BasicGenotypingTestProvider(String refString, String hapString, Map expected) { - super(BasicGenotypingTestProvider.class, String.format("Haplotype to VCF test: ref = %s, alignment = %s", refString,hapString)); - ref = refString.getBytes(); - hap = hapString.getBytes(); - this.expected = expected; - } - - public Map calcAlignment() { - final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap); - final Haplotype h = new Haplotype(hap, false, alignment.getAlignmentStart2wrt1(), alignment.getCigar()); - return GenotypingEngine.generateVCsFromAlignment( h, ref, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name"); - } - } - - @DataProvider(name = "BasicGenotypingTestProvider") - public Object[][] makeBasicGenotypingTests() { - - for( int contextSize : new int[]{0,1,5,9,24,36} ) { - Map map = new HashMap(); - map.put(1 + contextSize, (byte)'M'); - final String context = Utils.dupString('G', contextSize); - new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGACTAGCCGATAG", map); - } - - for( int contextSize : new int[]{0,1,5,9,24,36} ) { - Map map = new HashMap(); - map.put(2 + contextSize, (byte)'M'); - map.put(21 + contextSize, (byte)'M'); - final String context = Utils.dupString('G', contextSize); - new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG", "ATCTCGCATCGCGAGCATCGCCTAGCCGATAG", map); - } - - for( int contextSize : new int[]{0,1,5,9,24,36} ) { - Map map = new HashMap(); - map.put(1 + contextSize, (byte)'M'); - map.put(20 + contextSize, (byte)'I'); - final String context = Utils.dupString('G', contextSize); - new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGACACTAGCCGATAG", map); - } - - for( int contextSize : new 
int[]{0,1,5,9,24,36} ) { - Map map = new HashMap(); - map.put(1 + contextSize, (byte)'M'); - map.put(20 + contextSize, (byte)'D'); - final String context = Utils.dupString('G', contextSize); - new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCGATAG", map); - } - - for( int contextSize : new int[]{1,5,9,24,36} ) { - Map map = new HashMap(); - map.put(1, (byte)'M'); - map.put(20, (byte)'D'); - final String context = Utils.dupString('G', contextSize); - new BasicGenotypingTestProvider("AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCGATAG", map); - } - - for( int contextSize : new int[]{0,1,5,9,24,36} ) { - Map map = new HashMap(); - map.put(2 + contextSize, (byte)'M'); - map.put(20 + contextSize, (byte)'I'); - map.put(30 + contextSize, (byte)'D'); - final String context = Utils.dupString('G', contextSize); - new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "ACCTCGCATCGCGAGCATCGTTACTAGCCGATG", map); - } - - for( int contextSize : new int[]{0,1,5,9,24,36} ) { - Map map = new HashMap(); - map.put(1 + contextSize, (byte)'M'); - map.put(20 + contextSize, (byte)'D'); - map.put(28 + contextSize, (byte)'M'); - final String context = Utils.dupString('G', contextSize); - new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCCATAG", map); - } - - return BasicGenotypingTestProvider.getTests(BasicGenotypingTestProvider.class); - } - - @Test(dataProvider = "BasicGenotypingTestProvider", enabled = true) - public void testHaplotypeToVCF(BasicGenotypingTestProvider cfg) { - Map calculatedMap = cfg.calcAlignment(); - Map expectedMap = cfg.expected; - logger.warn(String.format("Test: %s", cfg.toString())); - if(!compareVCMaps(calculatedMap, expectedMap)) { - logger.warn("calc map = " + calculatedMap); - logger.warn("expected map = " + expectedMap); - } - Assert.assertTrue(compareVCMaps(calculatedMap, 
expectedMap)); - } - - /** - * Private function to compare Map of VCs, it only checks the types and start locations of the VariantContext - */ - private boolean compareVCMaps(Map calc, Map expected) { - if( !calc.keySet().equals(expected.keySet()) ) { return false; } // sanity check - for( Integer loc : expected.keySet() ) { - Byte type = expected.get(loc); - switch( type ) { - case 'I': - if( !calc.get(loc).isSimpleInsertion() ) { return false; } - break; - case 'D': - if( !calc.get(loc).isSimpleDeletion() ) { return false; } - break; - case 'M': - if( !(calc.get(loc).isMNP() || calc.get(loc).isSNP()) ) { return false; } - break; - default: - return false; - } - } - return true; - } -} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java deleted file mode 100644 index 3907ffbd6..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ /dev/null @@ -1,99 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.NA12878_CHR20_BAM; -import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.REF; - -public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest { - - private void HCTestComplexVariants(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; - final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); - executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec); - } - - @Test - public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "88c10027c21712b1fe475c06cadd503c"); - } - - private void HCTestSymbolicVariants(String 
bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; - final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); - executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec); - } - - // TODO -- need a better symbolic allele test - @Test - public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "e746a38765298acd716194aee4d93554"); - } - - private void HCTestComplexGGA(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); - executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec); - } - - @Test - public void testHaplotypeCallerMultiSampleGGAComplex() { - HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "b787be740423b950f8529ccc838fabdd"); - } - - @Test - public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { - HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "f74d68cbc1ecb66a7128258e111cd030"); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java deleted file mode 100644 index 97744f126..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ /dev/null @@ -1,131 +0,0 
@@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { - @DataProvider(name = "MyDataProvider") - public Object[][] makeMyDataProvider() { - List tests = new ArrayList<>(); - - final String PCRFreeIntervals = "-L 20:10,000,000-10,010,000"; - final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; - - // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ce9c42e7e97a45a82315523dbd77fcf"}); - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "c5a55196e10680a02c833a8a44733306"}); - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, 
"9b9923ef41bfc7346c905fdecf918f92"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "7cb1e431119df00ec243a6a115fa74b8"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "90e22230149e6c32d1115d0e2f03cab1"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "b39a4bc19a0acfbade22a011cd229262"}); - - - return tests.toArray(new Object[][]{}); - } - - /** - * Example testng test using MyDataProvider - */ - @Test(dataProvider = "MyDataProvider") - public void testHCWithGVCF(String bam, HaplotypeCaller.ReferenceConfidenceMode mode, String intervals, String md5) { - final String commandLine = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", - b37KGReference, bam, intervals, mode, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); - final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode; - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5)); - executeTest(name, spec); - } - - @Test - public void testERCRegionWithNoCalledHaplotypes() { - final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", - b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); - spec.disableShadowBCF(); - executeTest("testERCRegionWithNoCalledHaplotypes", spec); - } - - @Test() - public void testMissingGVCFIndexException() { - final String commandLine = String.format("-T HaplotypeCaller 
--pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF", - b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001"); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class); - spec.disableShadowBCF(); - executeTest("testMissingGVCFIndexingStrategyException", spec); - } - - @Test() - public void testWrongParameterGVCFIndexException() { - final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", - b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER + 1); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class); - spec.disableShadowBCF(); - executeTest("testMissingGVCFIndexingStrategyException", spec); - } - - @Test() - public void testWrongTypeGVCFIndexException() { - // ensure non-optimal, if optimal changes - GATKVCFIndexType type = GATKVCFIndexType.DYNAMIC_SEEK; - if (HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE == GATKVCFIndexType.DYNAMIC_SEEK) - type = GATKVCFIndexType.DYNAMIC_SIZE; - - final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", - b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", type, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class); - spec.disableShadowBCF(); - executeTest("testMissingGVCFIndexingStrategyException", spec); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java deleted file mode 100644 index c27208194..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ /dev/null @@ -1,280 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broad.tribble.readers.LineIterator; -import org.broad.tribble.readers.PositionalBufferedStream; -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFCodec; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.*; - -public class HaplotypeCallerIntegrationTest extends WalkerTest { - final static String REF = b37KGReference; - final static String NA12878_BAM = privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; - final static String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; - final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; - final static String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam"; - final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam"; - final static String CEUTRIO_MT_TEST_BAM = privateTestDir + "CEUTrio.HiSeq.b37.MT.1_50.bam"; - final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; - - private void HCTest(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; - final WalkerTestSpec spec = new 
WalkerTestSpec(base + " " + args, Arrays.asList(md5)); - executeTest("testHaplotypeCaller: args=" + args, spec); - } - - @Test - public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "c0b1b64c6005cd3640ffde5dbc10174b"); - } - - @Test - public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "439ce9024f04aad08eab1526d887e295"); - } - - @Test - public void testHaplotypeCallerGraphBasedSingleSample() { - HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "213df0bdaa78a695e9336128333e4407"); - } - - @Test - public void testHaplotypeCallerGraphBasedMultiSample() { - HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "ceee711cac50b4bb66a084acb9264941"); - } - - @Test(enabled = false) // can't annotate the rsID's yet - public void testHaplotypeCallerSingleSampleWithDbsnp() { - HCTest(NA12878_BAM, "-D " + b37dbSNP132, ""); - } - - @Test - public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "b09437f11db40abd49195110e50692c2"); - } - - @Test - public void testHaplotypeCallerInsertionOnEdgeOfContig() { - HCTest(CEUTRIO_MT_TEST_BAM, "-L MT:1-10", "7f1fb8f9587f64643f6612ef1dd6d4ae"); - } - - private void HCTestIndelQualityScores(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; - final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); - executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec); - } - - @Test - public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "c57c463542304fb7b2576e531faca89e"); - } - - private void HCTestNearbySmallIntervals(String bam, String args, String 
md5) { - try { - final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference)); - final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary()); - - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; - final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); - for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) { - if( containsDuplicateRecord(vcf, parser) ) { - throw new IllegalStateException("Duplicate records detected but there should be none."); - } - } - } catch( FileNotFoundException e ) { - throw new IllegalStateException("Could not find the b37 reference file."); - } - } - - private boolean containsDuplicateRecord( final File vcf, final GenomeLocParser parser ) { - final List> VCs = new ArrayList<>(); - try { - for( final VariantContext vc : GATKVCFUtils.readVCF(vcf).getSecond() ) { - VCs.add(new Pair<>(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc))); - } - } catch( IOException e ) { - throw new IllegalStateException("Somehow the temporary VCF from the integration test could not be read."); - } - - final Set> VCsAsSet = new HashSet<>(VCs); - return VCsAsSet.size() != VCs.size(); // The set will remove duplicate Events. - } - - - @Test - public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "75820a4558a559b3e1636fdd1b776ea2"); - } - - // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper - // was modifying the GATKSamRecord and that was screwing up the traversal engine from map call to - // map call. So the test is there for consistency but not for correctness. I'm not sure we can trust - // any of the calls in that region because it is so messy. 
- @Test - public void HCTestProblematicReadsModifiedInActiveRegions() { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("976463812534ac164a64c5d0c3ec988a")); - executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); - } - - @Test - public void HCTestStructuralIndels() { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("91717e5e271742c2c9b67223e58f1320")); - executeTest("HCTestStructuralIndels: ", spec); - } - - @Test - public void HCTestDoesNotFailOnBadRefBase() { - // don't care about the output - just want to make sure it doesn't fail - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Collections.emptyList()); - executeTest("HCTestDoesNotFailOnBadRefBase: ", spec); - } - - @Test - public void HCTestDanglingTailMergingForDeletions() throws IOException { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800"; - final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); - final File outputVCF = executeTest("HCTestDanglingTailMergingForDeletions", spec).getFirst().get(0); - - // confirm that the call is the correct one 
- final VCFCodec codec = new VCFCodec(); - final FileInputStream s = new FileInputStream(outputVCF); - final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); - codec.readHeader(lineIterator); - final String line = lineIterator.next(); - Assert.assertFalse(line == null); - final VariantContext vc = codec.decode(line); - Assert.assertTrue(vc.isBiallelic()); - Assert.assertTrue(vc.getReference().basesMatch("ATGTATG")); - Assert.assertTrue(vc.getAlternateAllele(0).basesMatch("A")); - } - - - // -------------------------------------------------------------------------------------------------------------- - // - // testing reduced reads - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void HCTestReducedBam() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("277aa95b01fa4d4e0086a2fabf7f3d7e")); - executeTest("HC calling on a ReducedRead BAM", spec); - } - - @Test - public void testReducedBamWithReadsNotFullySpanningDeletion() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("6a9222905c740b9208bf3c67478514eb")); - executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // test dbSNP annotation - // - // 
-------------------------------------------------------------------------------------------------------------- - - @Test - public void HCTestDBSNPAnnotationWGS() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("a43d6226a51eb525f0774f88e3778189")); - executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); - } - - @Test - public void HCTestDBSNPAnnotationWEx() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 - + " -L " + hg19Intervals + " -isr INTERSECTION", 1, - Arrays.asList("1352cbe1404aefc94eb8e044539a9882")); - executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // test PCR indel model - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void HCTestAggressivePcrIndelModelWGS() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, - Arrays.asList("19c2992541ede7407192660fdc1fadbf")); - executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec); - } - - @Test - public void HCTestConservativePcrIndelModelWGS() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + 
b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, - Arrays.asList("f4ab037915db3a40ba26e9ee30d40e16")); - executeTest("HC calling with conservative indel error modeling on WGS intervals", spec); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java deleted file mode 100644 index 21648b2b9..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java +++ /dev/null @@ -1,79 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { - @DataProvider(name = "NCTDataProvider") - public Object[][] makeNCTDataProvider() { - List tests = new ArrayList<>(); - - for ( final int nct : Arrays.asList(1, 2, 4) ) { - tests.add(new Object[]{nct, "29cb04cca87f42b4762c34dfea5d15b7"}); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "NCTDataProvider") - public void testHCNCT(final int nct, final String md5) { - WalkerTestSpec spec = new WalkerTestSpec( - "-T HaplotypeCaller --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " - + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o %s " + - " -L 20:10,000,000-10,100,000 -G none -A -contamination 0.0 -nct " + nct, 1, - Arrays.asList(md5)); - executeTest("HC test parallel HC with NCT with nct " + nct, spec); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java deleted file mode 100644 index 1b63f2971..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java +++ /dev/null @@ -1,591 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: 3/14/12 - */ - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.pairhmm.PairHMM; -import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate; -import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * Unit tests for PairHMMLikelihoodCalculationEngine - */ -public class PairHMMLikelihoodCalculationEngineUnitTest extends BaseTest { - - Allele Aref, T, C, G, Cref, ATC, ATCATC; - - @BeforeSuite - public void setup() { - // alleles - Aref = Allele.create("A", true); - Cref = Allele.create("C", true); - T = Allele.create("T"); - C = Allele.create("C"); - G = Allele.create("G"); - ATC = Allele.create("ATC"); - ATCATC = Allele.create("ATCATC"); - } - - @Test - public void 
testNormalizeDiploidLikelihoodMatrixFromLog10() { - double[][] likelihoodMatrix = { - {-90.2, 0, 0}, - {-190.1, -2.1, 0}, - {-7.0, -17.5, -35.9} - }; - double[][] normalizedMatrix = { - {-88.1, 0, 0}, - {-188.0, 0.0, 0}, - {-4.9, -15.4, -33.8} - }; - - - Assert.assertTrue(compareDoubleArrays(PairHMMLikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix), normalizedMatrix)); - - double[][] likelihoodMatrix2 = { - {-90.2, 0, 0, 0}, - {-190.1, -2.1, 0, 0}, - {-7.0, -17.5, -35.9, 0}, - {-7.0, -17.5, -35.9, -1000.0}, - }; - double[][] normalizedMatrix2 = { - {-88.1, 0, 0, 0}, - {-188.0, 0.0, 0, 0}, - {-4.9, -15.4, -33.8, 0}, - {-4.9, -15.4, -33.8, -997.9}, - }; - Assert.assertTrue(compareDoubleArrays(PairHMMLikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix2), normalizedMatrix2)); - } - - @DataProvider(name = "PcrErrorModelTestProvider") - public Object[][] createPcrErrorModelTestData() { - List tests = new ArrayList(); - - for ( final String repeat : Arrays.asList("A", "AC", "ACG", "ACGT") ) { - for ( final int repeatLength : Arrays.asList(1, 2, 3, 5, 10, 15) ) { - tests.add(new Object[]{repeat, repeatLength}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "PcrErrorModelTestProvider", enabled = true) - public void createPcrErrorModelTest(final String repeat, final int repeatLength) { - - final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine((byte)0, false, - PairHMM.HMM_IMPLEMENTATION.ORIGINAL, 0.0, true, - PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE); - - final String readString = Utils.dupString(repeat, repeatLength); - final byte[] insQuals = new byte[readString.length()]; - final byte[] delQuals = new byte[readString.length()]; - Arrays.fill(insQuals, (byte)PairHMMLikelihoodCalculationEngine.INITIAL_QSCORE); - Arrays.fill(delQuals, (byte)PairHMMLikelihoodCalculationEngine.INITIAL_QSCORE); - - 
engine.applyPCRErrorModel(readString.getBytes(), insQuals, delQuals); - - final RepeatCovariate repeatCovariate = new RepeatLengthCovariate(); - repeatCovariate.initialize(PairHMMLikelihoodCalculationEngine.MAX_STR_UNIT_LENGTH, PairHMMLikelihoodCalculationEngine.MAX_REPEAT_LENGTH); - - for ( int i = 1; i < insQuals.length; i++ ) { - - final int repeatLengthFromCovariate = repeatCovariate.findTandemRepeatUnits(readString.getBytes(), i-1).getSecond(); - final byte adjustedScore = PairHMMLikelihoodCalculationEngine.getErrorModelAdjustedQual(repeatLengthFromCovariate, 3.0); - - Assert.assertEquals(insQuals[i-1], adjustedScore); - Assert.assertEquals(delQuals[i-1], adjustedScore); - } - } - - /* - private class BasicLikelihoodTestProvider extends TestDataProvider { - public Double readLikelihoodForHaplotype1; - public Double readLikelihoodForHaplotype2; - public Double readLikelihoodForHaplotype3; - - public BasicLikelihoodTestProvider(double a, double b) { - super(BasicLikelihoodTestProvider.class, String.format("Diploid haplotype likelihoods for reads %f / %f",a,b)); - readLikelihoodForHaplotype1 = a; - readLikelihoodForHaplotype2 = b; - readLikelihoodForHaplotype3 = null; - } - - public BasicLikelihoodTestProvider(double a, double b, double c) { - super(BasicLikelihoodTestProvider.class, String.format("Diploid haplotype likelihoods for reads %f / %f / %f",a,b,c)); - readLikelihoodForHaplotype1 = a; - readLikelihoodForHaplotype2 = b; - readLikelihoodForHaplotype3 = c; - } - - public double[][] expectedDiploidHaplotypeMatrix() { - if( readLikelihoodForHaplotype3 == null ) { - double maxValue = Math.max(readLikelihoodForHaplotype1,readLikelihoodForHaplotype2); - double[][] normalizedMatrix = { - {readLikelihoodForHaplotype1 - maxValue, Double.NEGATIVE_INFINITY}, - {Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype1) + 0.5*Math.pow(10,readLikelihoodForHaplotype2)) - maxValue, readLikelihoodForHaplotype2 - maxValue} - }; - return normalizedMatrix; - } else { - double 
maxValue = MathUtils.max(readLikelihoodForHaplotype1,readLikelihoodForHaplotype2,readLikelihoodForHaplotype3); - double[][] normalizedMatrix = { - {readLikelihoodForHaplotype1 - maxValue, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY}, - {Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype1) + 0.5*Math.pow(10,readLikelihoodForHaplotype2)) - maxValue, readLikelihoodForHaplotype2 - maxValue, Double.NEGATIVE_INFINITY}, - {Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype1) + 0.5*Math.pow(10,readLikelihoodForHaplotype3)) - maxValue, - Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype2) + 0.5*Math.pow(10,readLikelihoodForHaplotype3)) - maxValue, readLikelihoodForHaplotype3 - maxValue} - }; - return normalizedMatrix; - } - } - - public double[][] calcDiploidHaplotypeMatrix() { - ArrayList haplotypes = new ArrayList(); - for( int iii = 1; iii <= 3; iii++) { - Double readLikelihood = ( iii == 1 ? readLikelihoodForHaplotype1 : ( iii == 2 ? readLikelihoodForHaplotype2 : readLikelihoodForHaplotype3) ); - int readCount = 1; - if( readLikelihood != null ) { - Haplotype haplotype = new Haplotype( (iii == 1 ? "AAAA" : (iii == 2 ? 
"CCCC" : "TTTT")).getBytes() ); - haplotype.addReadLikelihoods("myTestSample", new double[]{readLikelihood}, new int[]{readCount}); - haplotypes.add(haplotype); - } - } - final HashSet sampleSet = new HashSet(1); - sampleSet.add("myTestSample"); - return PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sampleSet, haplotypes); - } - } - - @DataProvider(name = "BasicLikelihoodTestProvider") - public Object[][] makeBasicLikelihoodTests() { - new BasicLikelihoodTestProvider(-1.1, -2.2); - new BasicLikelihoodTestProvider(-2.2, -1.1); - new BasicLikelihoodTestProvider(-1.1, -1.1); - new BasicLikelihoodTestProvider(-9.7, -15.0); - new BasicLikelihoodTestProvider(-1.1, -2000.2); - new BasicLikelihoodTestProvider(-1000.1, -2.2); - new BasicLikelihoodTestProvider(0, 0); - new BasicLikelihoodTestProvider(-1.1, 0); - new BasicLikelihoodTestProvider(0, -2.2); - new BasicLikelihoodTestProvider(-100.1, -200.2); - - new BasicLikelihoodTestProvider(-1.1, -2.2, 0); - new BasicLikelihoodTestProvider(-2.2, -1.1, 0); - new BasicLikelihoodTestProvider(-1.1, -1.1, 0); - new BasicLikelihoodTestProvider(-9.7, -15.0, 0); - new BasicLikelihoodTestProvider(-1.1, -2000.2, 0); - new BasicLikelihoodTestProvider(-1000.1, -2.2, 0); - new BasicLikelihoodTestProvider(0, 0, 0); - new BasicLikelihoodTestProvider(-1.1, 0, 0); - new BasicLikelihoodTestProvider(0, -2.2, 0); - new BasicLikelihoodTestProvider(-100.1, -200.2, 0); - - new BasicLikelihoodTestProvider(-1.1, -2.2, -12.121); - new BasicLikelihoodTestProvider(-2.2, -1.1, -12.121); - new BasicLikelihoodTestProvider(-1.1, -1.1, -12.121); - new BasicLikelihoodTestProvider(-9.7, -15.0, -12.121); - new BasicLikelihoodTestProvider(-1.1, -2000.2, -12.121); - new BasicLikelihoodTestProvider(-1000.1, -2.2, -12.121); - new BasicLikelihoodTestProvider(0, 0, -12.121); - new BasicLikelihoodTestProvider(-1.1, 0, -12.121); - new BasicLikelihoodTestProvider(0, -2.2, -12.121); - new BasicLikelihoodTestProvider(-100.1, -200.2, -12.121); - - 
return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); - } - - @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) - public void testOneReadWithTwoOrThreeHaplotypes(BasicLikelihoodTestProvider cfg) { - double[][] calculatedMatrix = cfg.calcDiploidHaplotypeMatrix(); - double[][] expectedMatrix = cfg.expectedDiploidHaplotypeMatrix(); - logger.warn(String.format("Test: %s", cfg.toString())); - Assert.assertTrue(compareDoubleArrays(calculatedMatrix, expectedMatrix)); - } - */ - - //Private function to compare 2d arrays - private boolean compareDoubleArrays(double[][] b1, double[][] b2) { - if( b1.length != b2.length ) { - return false; // sanity check - } - - for( int i=0; i < b1.length; i++ ){ - if( b1[i].length != b2[i].length) { - return false; // sanity check - } - for( int j=0; j < b1.length; j++ ){ - if ( MathUtils.compareDoubles(b1[i][j], b2[i][j]) != 0 && !Double.isInfinite(b1[i][j]) && !Double.isInfinite(b2[i][j])) - return false; - } - } - return true; - } - - - private String arraysEq(int[] a, int[] b) { - if ( a.length != b.length ) { - return String.format("NEQ: %s | %s",Arrays.toString(a),Arrays.toString(b)); - } - for ( int idx = 0; idx < a.length; idx++) { - if ( a[idx] - b[idx] > 1 || b[idx] - a[idx] > 1) { - return String.format("NEQ: %s | %s",Arrays.toString(a),Arrays.toString(b)); - } - } - - return ""; - } - - private int[] _mleparse(List s) { - int[] mle = new int[s.size()]; - for ( int idx = 0; idx < mle.length; idx ++) { - mle[idx] = s.get(idx); - } - - return mle; - } - - private Genotype makeGwithPLs(String sample, Allele a1, Allele a2, double[] pls) { - Genotype gt = new GenotypeBuilder(sample, Arrays.asList(a1, a2)).PL(pls).make(); - if ( pls != null && pls.length > 0 ) { - Assert.assertNotNull(gt.getPL()); - Assert.assertTrue(gt.getPL().length > 0); - for ( int i : gt.getPL() ) { - Assert.assertTrue(i >= 0); - } - Assert.assertNotEquals(Arrays.toString(gt.getPL()),"[0]"); - } - return gt; - } - - 
private Genotype makeG(String sample, Allele a1, Allele a2) { - return GenotypeBuilder.create(sample, Arrays.asList(a1, a2)); - } - - private Genotype makeG(String sample, Allele a1, Allele a2, int... pls) { - return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).PL(pls).make(); - } - - private VariantContext makeVC(String source, List alleles, Genotype... genotypes) { - int start = 10; - int stop = start; // alleles.contains(ATC) ? start + 3 : start; - return new VariantContextBuilder(source, "1", start, stop, alleles).genotypes(Arrays.asList(genotypes)).filters(null).make(); - } - - @Test - private void testCalculatePosteriorNoExternalData() { - VariantContext test1 = makeVC("1",Arrays.asList(Aref,T), makeG("s1",Aref,T,20,0,10), - makeG("s2",T,T,60,40,0), - makeG("s3",Aref,Aref,0,30,90)); - test1 = new VariantContextBuilder(test1).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,3).make(); - VariantContext test1result = PairHMMLikelihoodCalculationEngine.calculatePosteriorGLs(test1, new ArrayList(), 0, 0.001, true, false, false); - Genotype test1exp1 = makeGwithPLs("s1",Aref,T,new double[]{-2.20686, -0.03073215, -1.20686}); - Assert.assertTrue(test1exp1.hasPL()); - Genotype test1exp2 = makeGwithPLs("s2",T,T,new double[]{-6.000066, -3.823938, -6.557894e-05}); - Genotype test1exp3 = makeGwithPLs("s3",Aref,Aref,new double[]{-0.0006510083, -2.824524, -9.000651}); - Assert.assertEquals("java.util.ArrayList",test1result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY).getClass().getCanonicalName()); - Assert.assertEquals(arraysEq(test1exp1.getPL(), _mleparse((List)test1result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); - Assert.assertEquals(arraysEq(test1exp2.getPL(),_mleparse((List)test1result.getGenotype(1).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); - Assert.assertEquals(arraysEq(test1exp3.getPL(),_mleparse((List)test1result.getGenotype(2).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), 
""); - - // AA AB BB AC BC CC - // AA AC CC AT CT TT - VariantContext test2 = makeVC("2",Arrays.asList(Aref,C,T), - makeG("s1",Aref,T,30,10,60,0,15,90), - makeG("s2",Aref,C,40,0,10,30,40,80), - makeG("s3",Aref,Aref,0,5,8,15,20,40), - makeG("s4",C,T,80,40,12,20,0,10)); - test2 = new VariantContextBuilder(test2).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,new ArrayList(Arrays.asList(2,2))).make(); - VariantContext test2result = PairHMMLikelihoodCalculationEngine.calculatePosteriorGLs(test2,new ArrayList(),5,0.001,true,false,false); - Genotype test2exp1 = makeGwithPLs("s1",Aref,T,new double[]{-2.647372, -1.045139, -6.823193, -0.04513873, -2.198182, -9.823193}); - Genotype test2exp2 = makeGwithPLs("s2",Aref,C,new double[]{-3.609957, -0.007723248, -1.785778, -3.007723, -4.660767, -8.785778}); - Genotype test2exp3 = makeGwithPLs("s3",Aref,Aref,new double[] {-0.06094877, -0.9587151, -2.03677,-1.958715, -3.111759, -5.23677}); - Genotype test2exp4 = makeGwithPLs("s4",C,T,new double[]{-7.016534, -3.4143, -1.392355, -1.4143, -0.06734388, -1.192355}); - Assert.assertEquals(arraysEq(test2exp1.getPL(),(int[]) _mleparse((List)test2result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); - Assert.assertEquals(arraysEq(test2exp2.getPL(),(int[]) _mleparse((List)test2result.getGenotype(1).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); - Assert.assertEquals(arraysEq(test2exp3.getPL(),(int[]) _mleparse((List)test2result.getGenotype(2).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); - Assert.assertEquals(arraysEq(test2exp4.getPL(),(int[]) _mleparse((List)test2result.getGenotype(3).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); - } - - @Test - private void testCalculatePosteriorSamplePlusExternal() { - VariantContext testOverlappingBase = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,40,20,0), - makeG("s2",Aref,T,18,0,24), - makeG("s3",Aref,T,22,0,12)); - List supplTest1 = new ArrayList<>(3); - supplTest1.add(new 
VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,2).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); - supplTest1.add(new VariantContextBuilder(makeVC("3",Arrays.asList(Aref,T))).attribute(VCFConstants.ALLELE_COUNT_KEY,4).attribute(VCFConstants.ALLELE_NUMBER_KEY,22).make()); - supplTest1.add(makeVC("4",Arrays.asList(Aref,T), - makeG("s_1",T,T), - makeG("s_2",Aref,T))); - VariantContext test1result = PairHMMLikelihoodCalculationEngine.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); - // the counts here are ref=30, alt=14 - Genotype test1exp1 = makeGwithPLs("t1",T,T,new double[]{-3.370985, -1.415172, -0.01721766}); - Genotype test1exp2 = makeGwithPLs("t2",Aref,T,new double[]{-1.763792, -0.007978791, -3.010024}); - Genotype test1exp3 = makeGwithPLs("t3",Aref,T,new double[]{-2.165587, -0.009773643, -1.811819}); - Assert.assertEquals(arraysEq(test1exp1.getPL(),_mleparse((List) test1result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); - Assert.assertEquals(arraysEq(test1exp2.getPL(),_mleparse((List) test1result.getGenotype(1).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); - Assert.assertEquals(arraysEq(test1exp3.getPL(),_mleparse((List) test1result.getGenotype(2).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); - - VariantContext testNonOverlapping = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,3,1,0)); - List other = Arrays.asList(makeVC("2",Arrays.asList(Aref,C),makeG("s2",C,C,10,2,0))); - VariantContext test2result = PairHMMLikelihoodCalculationEngine.calculatePosteriorGLs(testNonOverlapping,other,0,0.001,true,false,false); - Genotype test2exp1 = makeGwithPLs("SGV",T,T,new double[]{-4.078345, -3.276502, -0.0002661066}); - Assert.assertEquals(arraysEq(test2exp1.getPL(),_mleparse((List) test2result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); - } - - private double[] pl2gl(int[] pl) { - 
double[] gl = new double[pl.length]; - for ( int idx = 0; idx < gl.length; idx++ ) { - gl[idx] = pl[idx]/(-10.0); - } - - return MathUtils.normalizeFromLog10(gl,true); - } - - @Test - private void testCalculatePosterior() { - int[][] likelihood_PLs = new int[][]{ - new int[]{3,0,3}, - new int[]{99,0,99}, - new int[]{50,20,0}, - new int[]{10,0,50}, - new int[]{80,60,0}, - new int[]{0,42,44}}; - - int[] altCounts = new int[]{10,40,90}; - int[] altAlleleNum = new int[]{100,500,1000}; - - double[] expected_post_10_100 = new double[] { - 9.250326e-03, 3.020208e-01, 6.887289e-01, - 7.693433e-12, 1.000000e+00, 5.728111e-10, - 1.340156e-07, 2.192982e-03, 9.978069e-01, - 6.073718e-03, 9.938811e-01, 4.522159e-05, - 1.343101e-10, 2.197802e-07, 9.999998e-01, - 9.960193e-01, 1.028366e-03, 2.952290e-03 - }; - - double[] expected_post_10_500 = new double[] { - 4.226647e-04, 7.513277e-02, 9.244446e-01, - 1.413080e-12, 1.000000e+00, 3.090662e-09, - 4.570232e-09, 4.071661e-04, 9.995928e-01, - 1.120916e-03, 9.986339e-01, 2.451646e-04, - 4.572093e-12, 4.073320e-08, 1.000000e+00, - 9.151689e-01, 5.144399e-03, 7.968675e-02 - }; - - double[] expected_post_10_1000 = new double[] { - 1.077685e-04, 3.870477e-02, 9.611875e-01, - 6.994030e-13, 1.000000e+00, 6.237975e-09, - 1.120976e-09, 2.017756e-04, 9.997982e-01, - 5.549722e-04, 9.989500e-01, 4.949797e-04, - 1.121202e-12, 2.018163e-08, 1.000000e+00, - 7.318346e-01, 8.311615e-03, 2.598538e-01 - }; - - double[] expected_post_40_100 = new double[] { - 1.102354e-01, 6.437516e-01, 2.460131e-01, - 4.301328e-11, 1.000000e+00, 9.599306e-11, - 4.422850e-06, 1.294493e-02, 9.870507e-01, - 3.303763e-02, 9.669550e-01, 7.373032e-06, - 4.480868e-09, 1.311474e-06, 9.999987e-01, - 9.997266e-01, 1.846199e-04, 8.882157e-05 - }; - - double[] expected_post_40_500 = new double[] { - 5.711785e-03, 2.557266e-01, 7.385617e-01, - 5.610428e-12, 1.000000e+00, 7.254558e-10, - 7.720262e-08, 1.732352e-03, 9.982676e-01, - 4.436495e-03, 9.955061e-01, 5.736604e-05, - 
7.733659e-11, 1.735358e-07, 9.999998e-01, - 9.934793e-01, 1.406575e-03, 5.114153e-03 - }; - - double[] expected_post_40_1000 = new double[] { - 1.522132e-03, 1.422229e-01, 8.562549e-01, - 2.688330e-12, 1.000000e+00, 1.512284e-09, - 1.776184e-08, 8.317737e-04, 9.991682e-01, - 2.130611e-03, 9.977495e-01, 1.198547e-04, - 1.777662e-11, 8.324661e-08, 9.999999e-01, - 9.752770e-01, 2.881677e-03, 2.184131e-02 - }; - - double[] expected_post_90_100 = new double[] { - 6.887289e-01, 3.020208e-01, 9.250326e-03, - 5.728111e-10, 1.000000e+00, 7.693433e-12, - 6.394346e-04, 1.405351e-01, 8.588255e-01, - 3.127146e-01, 6.872849e-01, 4.200075e-07, - 7.445327e-07, 1.636336e-05, 9.999829e-01, - 9.999856e-01, 1.386699e-05, 5.346906e-07 - }; - - double[] expected_post_90_500 = new double[] { - 2.528165e-02, 4.545461e-01, 5.201723e-01, - 1.397100e-11, 1.000000e+00, 2.874546e-10, - 4.839050e-07, 4.360463e-03, 9.956391e-01, - 1.097551e-02, 9.890019e-01, 2.258221e-05, - 4.860244e-10, 4.379560e-07, 9.999996e-01, - 9.986143e-01, 5.677671e-04, 8.179741e-04 - }; - - double[] expected_post_90_1000 = new double[] { - 7.035938e-03, 2.807708e-01, 7.121932e-01, - 6.294627e-12, 1.000000e+00, 6.371561e-10, - 9.859771e-08, 1.971954e-03, 9.980279e-01, - 4.974874e-03, 9.949748e-01, 5.035678e-05, - 9.879252e-11, 1.975850e-07, 9.999998e-01, - 9.947362e-01, 1.255272e-03, 4.008518e-03 - }; - - double[][] expectations = new double[][] { - expected_post_10_100, - expected_post_10_500, - expected_post_10_1000, - expected_post_40_100, - expected_post_40_500, - expected_post_40_1000, - expected_post_90_100, - expected_post_90_500, - expected_post_90_1000 - }; - - int testIndex = 0; - for ( int altCount : altCounts ) { - for ( int numAlt : altAlleleNum ) { - double[] knownCounts = new double[2]; - knownCounts[0] = altCount; - knownCounts[1] = numAlt-altCount; - int expected_index = 0; - for ( int gl_index = 0; gl_index < likelihood_PLs.length; gl_index++ ) { - double[] post = 
PairHMMLikelihoodCalculationEngine.calculatePosteriorGLs(pl2gl(likelihood_PLs[gl_index]), knownCounts, 2); - for ( int i = 0; i < post.length; i++ ) { - double expected = expectations[testIndex][expected_index++]; - double observed = Math.pow(10.0,post[i]); - double err = Math.abs( (expected-observed)/expected ); - Assert.assertTrue(err < 1e-4, String.format("Counts: %s | Expected: %e | Observed: %e | pre %s | prior %s | post %s", - Arrays.toString(knownCounts), expected,observed, Arrays.toString(pl2gl(likelihood_PLs[gl_index])), - Arrays.toString(PairHMMLikelihoodCalculationEngine.getDirichletPrior(knownCounts,2)),Arrays.toString(post))); - } - } - testIndex++; - } - } - } - - private boolean arraysApproxEqual(double[] a, double[] b, double tol) { - if ( a.length != b.length ) { - return false; - } - - for ( int idx = 0; idx < a.length; idx++ ) { - if ( Math.abs(a[idx]-b[idx]) > tol ) { - return false; - } - } - - return true; - } - - private String errMsgArray(double[] a, double[] b) { - return String.format("Expected %s, Observed %s", Arrays.toString(a), Arrays.toString(b)); - } - - @Test - private void testPosteriorMultiAllelic() { - // AA AB BB AC BC CC AD BD CD DD - int[] PL_one = new int[] {40,20,30,0,15,25}; - int[] PL_two = new int[] {0,20,10,99,99,99}; - int[] PL_three = new int[] {50,40,0,30,30,10,20,40,80,50}; - int[] PL_four = new int[] {99,90,85,10,5,30,40,20,40,30,0,12,20,14,5}; - int[] PL_five = new int[] {60,20,30,0,40,10,8,12,18,22,40,12,80,60,20}; - double[] counts_one = new double[]{100.001,40.001,2.001}; - double[] counts_two = new double[]{2504.001,16.001,218.001}; - double[] counts_three = new double[]{10000.001,500.001,25.001,0.001}; - double[] counts_four = new double[]{4140.001,812.001,32.001,104.001,12.001}; - double[] counts_five = new double[]{80.001,40.001,8970.001,200.001,1922.001}; - - double expected_one[] = new double[] { -2.684035, -0.7852596, -2.4735, -0.08608339, -1.984017, -4.409852 }; - double expected_two[] = new double[] { 
-5.736189e-05, -3.893688, -5.362878, -10.65938, -12.85386, -12.0186}; - double expected_three[] = new double[] {-2.403234, -2.403276, -0.004467802, -2.70429, -4.005319, -3.59033, -6.102247, -9.403276, -14.70429, -13.40284}; - double expected_four[] = new double[] {-7.828677, -7.335196, -7.843136, -0.7395892, -0.947033, -5.139092, -3.227715, - -1.935159, -5.339552, -4.124552, -0.1655353, -2.072979, -4.277372, -3.165498, -3.469589 }; - double expected_five[] = new double[] { -9.170334, -5.175724, -6.767055, -0.8250021, -5.126027, -0.07628661, -3.276762, - -3.977787, -2.227065, -4.57769, -5.494041, -2.995066, -7.444344, -7.096104, -2.414187}; - - double[] post1 = PairHMMLikelihoodCalculationEngine.calculatePosteriorGLs(pl2gl(PL_one),counts_one,2); - double[] post2 = PairHMMLikelihoodCalculationEngine.calculatePosteriorGLs(pl2gl(PL_two),counts_two,2); - double[] post3 = PairHMMLikelihoodCalculationEngine.calculatePosteriorGLs(pl2gl(PL_three),counts_three,2); - double[] post4 = PairHMMLikelihoodCalculationEngine.calculatePosteriorGLs(pl2gl(PL_four),counts_four,2); - double[] post5 = PairHMMLikelihoodCalculationEngine.calculatePosteriorGLs(pl2gl(PL_five),counts_five,2); - - double[] expecPrior5 = new double[] {-4.2878195, -4.2932090, -4.8845400, -1.9424874, -2.2435120, -0.1937719, -3.5942477, - -3.8952723, -1.5445506, -3.4951749, -2.6115263, -2.9125508, -0.5618292, -2.2135895, - -1.5316722}; - - Assert.assertTrue(arraysApproxEqual(expecPrior5, PairHMMLikelihoodCalculationEngine.getDirichletPrior(counts_five,2),1e-5),errMsgArray(expecPrior5,PairHMMLikelihoodCalculationEngine.getDirichletPrior(counts_five,2))); - - Assert.assertTrue(arraysApproxEqual(expected_one,post1,1e-6),errMsgArray(expected_one,post1)); - Assert.assertTrue(arraysApproxEqual(expected_two,post2,1e-5),errMsgArray(expected_two,post2)); - Assert.assertTrue(arraysApproxEqual(expected_three,post3,1e-5),errMsgArray(expected_three,post3)); - 
Assert.assertTrue(arraysApproxEqual(expected_four,post4,1e-5),errMsgArray(expected_four,post4)); - Assert.assertTrue(arraysApproxEqual(expected_five,post5,1e-5),errMsgArray(expected_five,post5)); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java deleted file mode 100644 index d163c0497..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java +++ /dev/null @@ -1,426 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypeLikelihoods; -import org.broadinstitute.variant.variantcontext.GenotypeType; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -public class ReferenceConfidenceModelUnitTest extends BaseTest { - GenomeLocParser parser; - final String RGID = "ID1"; - GATKSAMReadGroupRecord rg; - final String sample = "NA12878"; - final Set samples = Collections.singleton(sample); - SAMFileHeader header; - ReferenceConfidenceModel model; - - @BeforeClass - public void setUp() throws Exception { - header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - rg = new GATKSAMReadGroupRecord(RGID); - rg.setSample(sample); - header.addReadGroup(rg); - parser = new 
GenomeLocParser(header.getSequenceDictionary()); - } - - @BeforeMethod - public void setupModel() { - model = new ReferenceConfidenceModel(parser, samples, header, 10); - } - - @DataProvider(name = "CalcNIndelInformativeReadsData") - public Object[][] makeMyDataProvider() { - List tests = new ArrayList<>(); - - { // very basic testing - final String ref = "ACGT"; - final String read = "ACGT"; - tests.add(new Object[]{read, ref, 1, Arrays.asList(1, 1, 1, 0)}); - tests.add(new Object[]{read, ref, 2, Arrays.asList(1, 1, 0, 0)}); - tests.add(new Object[]{read, ref, 3, Arrays.asList(1, 0, 0, 0)}); - tests.add(new Object[]{read, ref, 4, Arrays.asList(0, 0, 0, 0)}); - } - - { // actually interesting case where some sites aren't informative - final String ref = "NNAAAANN"; - final String read1 = "NNA"; - final String read2 = "NNAA"; - final String read3 = "NNAAA"; - final String read4 = "NNAAAA"; - final String read5 = "NNAAAAN"; - tests.add(new Object[]{read1, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); - tests.add(new Object[]{read2, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); - tests.add(new Object[]{read3, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); - tests.add(new Object[]{read4, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); - tests.add(new Object[]{read5, ref, 1, Arrays.asList(1, 1, 1, 1, 1, 1, 0, 0)}); - } - - { - for ( final String repeatUnit : Arrays.asList("A", "CA", "TAG", "TAGC", "TCAGA")) { - final String anchor = Utils.dupString("N", repeatUnit.length()); - for ( int nUnits = 1; nUnits < 10; nUnits++ ) { - final String repeat = Utils.dupString(repeatUnit, nUnits); - final String ref = anchor + repeat + anchor; - for ( int readLen = repeatUnit.length(); readLen < repeat.length(); readLen++ ) { - final String read = anchor + repeat.substring(0, readLen); - final List expected = new LinkedList<>(); - for ( int i = 0; i < anchor.length(); i++ ) expected.add(1); - for ( int i = 0; i < repeat.length(); i++ ) expected.add(readLen == repeat.length() 
? 1 : 0); - for ( int i = 0; i < anchor.length(); i++ ) expected.add(0); - tests.add(new Object[]{read, ref, repeatUnit.length(), expected}); - - final List result = new ArrayList<>(Collections.nCopies(ref.length() - anchor.length(), 1)); - result.addAll(Collections.nCopies(anchor.length(), 0)); - tests.add(new Object[]{ref, ref, repeatUnit.length(), result}); - } - } - - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "CalcNIndelInformativeReadsData") - public void testCalcNIndelInformativeReads(final String readBases, final String ref, final int maxIndelSize, final List expected ) { - final byte qual = (byte)30; - final byte[] quals = Utils.dupBytes(qual, readBases.length()); - - for ( int i = 0; i < readBases.getBytes().length; i++ ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(), quals, readBases.length() + "M"); - final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, i, i); - final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc, Collections.singletonList(read), i); - final int actual = model.calcNIndelInformativeReads(pileup, i, ref.getBytes(), maxIndelSize); - Assert.assertEquals(actual, (int)expected.get(i), "failed at position " + i); - } - } - - @Test - public void testCalcNIndelInformativeReducedReads() { - final String bases = "ACGGGTTTGGAC"; - final byte[] quals = Utils.dupBytes((byte)30, bases.length()); - final int count = 10; - final int[] counts = new int[bases.length()]; - for ( int i = 0; i < counts.length; i++ ) - counts[i] = count; - final int position = 100; - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, position, counts.length, counts); - read.setReadString(bases); - read.setBaseQualities(quals); - read.setCigarString(bases.length() + "M"); - final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, position, position); - final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc, 
Collections.singletonList(read), 0); - final int actual = model.calcNIndelInformativeReads(pileup, 0, bases.getBytes(), 3); - Assert.assertEquals(actual, count); - } - - @Test - public void testClose() { - model.close(); - } - - @Test - public void testWorstGL() { - final GenotypeLikelihoods gq10 = GenotypeLikelihoods.fromPLField("0,10,100"); - final GenotypeLikelihoods gq20 = GenotypeLikelihoods.fromPLField("0,20,200"); - final GenotypeLikelihoods gq0 = GenotypeLikelihoods.fromPLField("20,0,200"); - - Assert.assertSame(model.getGLwithWorstGQ(gq10, gq20), gq10); - Assert.assertSame(model.getGLwithWorstGQ(gq20, gq10), gq10); - Assert.assertSame(model.getGLwithWorstGQ(gq10, gq0), gq0); - Assert.assertSame(model.getGLwithWorstGQ(gq0, gq10), gq0); - } - - @Test - public void testIndelLikelihoods() { - GenotypeLikelihoods prev = model.getIndelPLs(0); - Assert.assertEquals(prev.getAsPLs(), new int[]{0, 0, 0}); - Assert.assertEquals(-10 * prev.getLog10GQ(GenotypeType.HOM_REF), 0.0); - - for ( int i = 1; i <= ReferenceConfidenceModel.MAX_N_INDEL_INFORMATIVE_READS; i++ ) { - final GenotypeLikelihoods current = model.getIndelPLs(i); - final double prevGQ = -10 * prev.getLog10GQ(GenotypeType.HOM_REF); - final double currGQ = -10 * current.getLog10GQ(GenotypeType.HOM_REF); - Assert.assertTrue(prevGQ < currGQ, "GQ Failed with prev " + prev + " curr " + current + " at " + i); - Assert.assertTrue(prev.getAsPLs()[1] < current.getAsPLs()[1], "het PL failed with prev " + prev + " curr " + current + " at " + i); - Assert.assertTrue(prev.getAsPLs()[2] < current.getAsPLs()[2], "hom-var PL Failed with prev " + prev + " curr " + current + " at " + i); -// logger.warn("result at " + i + " is " + current); - prev = current; - } - } - - @Test - public void testOverlappingVariantContext() { - final VariantContext vc10 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 10, Arrays.asList("A", "C")); - final VariantContext vc13 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 
13, Arrays.asList("A", "C")); - final VariantContext vc12_15 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 12, Arrays.asList("ACAT", "A")); - final VariantContext vc18 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 18, Arrays.asList("A", "ACAT")); - - final List calls = Arrays.asList(vc13, vc12_15, vc18, vc10); - - checkOverlapping(8, calls, null); - checkOverlapping(9, calls, null); - checkOverlapping(10, calls, vc10); - checkOverlapping(11, calls, null); - checkOverlapping(12, calls, vc12_15); - checkOverlapping(13, calls, vc13); - checkOverlapping(14, calls, vc12_15); - checkOverlapping(15, calls, vc12_15); - checkOverlapping(16, calls, null); - checkOverlapping(17, calls, null); - checkOverlapping(18, calls, vc18); - checkOverlapping(19, calls, null); - checkOverlapping(20, calls, null); - } - - private void checkOverlapping(final int pos, Collection calls, final VariantContext expected) { - final GenomeLoc loc = parser.createGenomeLoc(parser.getContigs().getSequences().get(0).getSequenceName(), pos, pos); - final VariantContext actual = model.getOverlappingVariantContext(loc, calls); - Assert.assertEquals(actual, expected); - } - - // - // test reference calculation - // - private class RefConfData { - final String ref; - final int extension; - final Haplotype refHap; - final GenomeLoc refLoc, paddedRefLoc; - final ActiveRegion region; - int readCounter = 0; - - private RefConfData(String ref, int extension) { - this.ref = ref; - this.extension = extension; - - refLoc = parser.createGenomeLoc("chr1", getStart(), getEnd()); - paddedRefLoc = parser.createGenomeLoc("chr1", getStart() - extension, getEnd() + extension); - region = new ActiveRegion(getRefLoc(), parser, extension); - final String pad = Utils.dupString("N", extension); - refHap = ReferenceConfidenceModel.createReferenceHaplotype(getActiveRegion(), (pad + ref + pad).getBytes(), getPaddedRefLoc()); - } - - public GenomeLoc getRefLoc() { return refLoc; } - public GenomeLoc 
getPaddedRefLoc() { return paddedRefLoc; } - public ActiveRegion getActiveRegion() { return region; } - public Haplotype getRefHap() { return refHap; } - public int getStart() { return 100; } - public int getEnd() { return getStart() + getRefLength() - 1; } - public byte[] getRefBases() { return ref.getBytes(); } - public int getRefLength() { return ref.length(); } - - public GATKSAMRecord makeRead(final int start, final int length) { - final byte[] quals = Utils.dupBytes((byte)30, length); - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read " + readCounter++, 0, start + getStart(), ref.substring(start, start + length).getBytes(), quals, length + "M"); - read.setReadGroup(rg); - return read; - } - } - - - @DataProvider(name = "RefConfidenceData") - public Object[][] makeRefConfidenceData() { - List tests = new ArrayList<>(); - - for ( int i = 0; i < 10; i++ ) { - for ( final int extension : Arrays.asList(0, 10) ) { - tests.add(new Object[]{i, extension}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "RefConfidenceData") - public void testRefConfidenceBasic(final int nReads, final int extension) { - final RefConfData data = new RefConfData("ACGTAACCGGTT", extension); - final List haplotypes = Arrays.asList(data.getRefHap()); - final List calls = Collections.emptyList(); - - for ( int i = 0; i < nReads; i++ ) { - data.getActiveRegion().add(data.makeRead(0, data.getRefLength())); - } - - final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); - - final List expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads); - final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); - checkReferenceModelResult(data, contexts, expectedDPs, calls); - } - - @Test - public void testRefConfidencePartialReads() { - 
final String ref = "ACGTAACCGGTT"; - for ( int readLen = 3; readLen < ref.length(); readLen++ ) { - for ( int start = 0; start < ref.length() - readLen; start++ ) { - final RefConfData data = new RefConfData(ref, 0); - final List haplotypes = Arrays.asList(data.getRefHap()); - final List calls = Collections.emptyList(); - - data.getActiveRegion().add(data.makeRead(start, readLen)); - final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); - - final List expectedDPs = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), 0)); - for ( int i = start; i < readLen + start; i++ ) expectedDPs.set(i, 1); - final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); - checkReferenceModelResult(data, contexts, expectedDPs, calls); - } - } - } - - @Test - public void testRefConfidenceWithCalls() { - final RefConfData xxxdata = new RefConfData("ACGTAACCGGTT", 0); - final int start = xxxdata.getStart(); - final int stop = xxxdata.getEnd(); - - for ( int nReads = 0; nReads < 2; nReads++ ) { - - final VariantContext vcStart = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start, Arrays.asList("A", "C")); - final VariantContext vcEnd = GATKVariantContextUtils.makeFromAlleles("test", "chr1", stop, Arrays.asList("A", "C")); - final VariantContext vcMiddle = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 2, Arrays.asList("A", "C")); - final VariantContext vcDel = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 4, Arrays.asList("ACG", "A")); - final VariantContext vcIns = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 8, Arrays.asList("A", "ACG")); - - final List allCalls = Arrays.asList(vcStart, vcEnd, vcMiddle, vcDel, vcIns); - - for ( int n = 1; n <= allCalls.size(); n++ ) { - for ( final List calls : 
Utils.makePermutations(allCalls, n, false) ) { -// logger.warn("Executing " + n + " " + calls.size()); - final RefConfData data = new RefConfData("ACGTAACCGGTT", 0); - final List haplotypes = Arrays.asList(data.getRefHap()); - for ( int i = 0; i < nReads; i++ ) { - data.getActiveRegion().add(data.makeRead(0, data.getRefLength())); - } - - final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); - - final List expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads); - final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); - checkReferenceModelResult(data, contexts, expectedDPs, calls); - } - } - } - } - - private void checkReferenceModelResult(final RefConfData data, final List contexts, final List expectedDPs, final List calls) { - Assert.assertNotNull(contexts); - - final GenomeLoc loc = data.getActiveRegion().getExtendedLoc(); - final List seenBP = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), false)); - - for ( int i = 0; i < loc.size(); i++ ) { - final GenomeLoc curPos = parser.createGenomeLoc(loc.getContig(), loc.getStart() + i); - final VariantContext call = model.getOverlappingVariantContext(curPos, calls); - final VariantContext refModel = model.getOverlappingVariantContext(curPos, contexts); - - if ( ! 
data.getActiveRegion().getLocation().containsP(curPos) ) { - // part of the extended interval, but not the full interval - Assert.assertNull(refModel); - continue; - } - - if ( call != null ) { - Assert.assertEquals(refModel, call, "Should have found call " + call + " but found " + refModel + " instead"); - } else { - final int expectedDP = expectedDPs.get(curPos.getStart() - data.getActiveRegion().getLocation().getStart()); - Assert.assertEquals(refModel.getStart(), loc.getStart() + i); - Assert.assertEquals(refModel.getEnd(), loc.getStart() + i); - Assert.assertFalse(refModel.hasLog10PError()); - Assert.assertEquals(refModel.getAlternateAlleles().size(), 1); - Assert.assertEquals(refModel.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - Assert.assertTrue(refModel.hasGenotype(sample)); - - final Genotype g = refModel.getGenotype(sample); - Assert.assertTrue(g.hasAD()); - Assert.assertTrue(g.hasDP()); - Assert.assertEquals(g.getDP(), expectedDP); - Assert.assertTrue(g.hasGQ()); - Assert.assertTrue(g.hasPL()); - } - - final VariantContext vc = call == null ? 
refModel : call; - if ( curPos.getStart() == vc.getStart() ) { - for ( int pos = vc.getStart(); pos <= vc.getEnd(); pos++ ) { - final int j = pos - data.getActiveRegion().getLocation().getStart(); - Assert.assertFalse(seenBP.get(j)); - seenBP.set(j, true); - } - } - } - - for ( int i = 0; i < seenBP.size(); i++ ) { - Assert.assertEquals((boolean)seenBP.get(i), true); - } - } -} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java deleted file mode 100644 index 5e91ad4f7..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java +++ /dev/null @@ -1,310 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; - -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.*; - -public class BaseGraphUnitTest extends BaseTest { - SeqGraph graph; - SeqVertex v1, v2, v3, v4, v5; - - @BeforeMethod - public void setUp() throws Exception { - graph = new SeqGraph(11); - - v1 = new SeqVertex("A"); - v2 = new SeqVertex("C"); - v3 = new SeqVertex("C"); - v4 = new SeqVertex("C"); - v5 = new SeqVertex("C"); - - graph.addVertices(v1, v2, v3, v4, v5); - graph.addEdge(v1, v2); - graph.addEdge(v2, v4); - graph.addEdge(v3, v2); - graph.addEdge(v2, v3); - graph.addEdge(v4, v5); - } - - @Test - public void testIncomingAndOutgoingVertices() throws Exception { - assertVertexSetEquals(graph.outgoingVerticesOf(v1), v2); - assertVertexSetEquals(graph.incomingVerticesOf(v1)); - - assertVertexSetEquals(graph.outgoingVerticesOf(v2), v3, v4); - assertVertexSetEquals(graph.incomingVerticesOf(v2), v1, v3); - - assertVertexSetEquals(graph.outgoingVerticesOf(v3), v2); - assertVertexSetEquals(graph.incomingVerticesOf(v3), v2); - - assertVertexSetEquals(graph.outgoingVerticesOf(v4), v5); - assertVertexSetEquals(graph.incomingVerticesOf(v4), v2); - - assertVertexSetEquals(graph.outgoingVerticesOf(v5)); - assertVertexSetEquals(graph.incomingVerticesOf(v5), v4); - } - - @Test - public void testRemoveSingletonOrphanVertices() throws Exception { - // all vertices in graph are connected - final List kept = new LinkedList(graph.vertexSet()); - final SeqVertex rm1 = new SeqVertex("CAGT"); - final SeqVertex rm2 = new SeqVertex("AGTC"); - graph.addVertices(rm1, rm2); - Assert.assertEquals(graph.vertexSet().size(), kept.size() + 2); - final BaseEdge rm12e = new BaseEdge(false, 1); - graph.addEdge(rm1, rm2, rm12e); - - final SeqGraph original = (SeqGraph)graph.clone(); - graph.removeSingletonOrphanVertices(); 
- Assert.assertTrue(BaseGraph.graphEquals(original, graph), "Graph with disconnected component but edges between components shouldn't be modified"); - - graph.removeEdge(rm12e); // now we should be able to remove rm1 and rm2 - graph.removeSingletonOrphanVertices(); - Assert.assertTrue(graph.vertexSet().containsAll(kept)); - Assert.assertFalse(graph.containsVertex(rm1)); - Assert.assertFalse(graph.containsVertex(rm2)); - } - - @Test - public void testRemoveSingletonOrphanVerticesOnSingleRefNode() throws Exception { - final SeqGraph original = new SeqGraph(11); - original.addVertex(v1); - original.removeSingletonOrphanVertices(); - Assert.assertTrue(original.containsVertex(v1)); - Assert.assertEquals(original.vertexSet().size(), 1); - } - - @Test - public void testIsRefSourceAndSink() throws Exception { - - final SeqGraph g = new SeqGraph(11); - g.addVertex(v1); - Assert.assertTrue(g.isRefSource(v1)); - Assert.assertTrue(g.isRefSink(v1)); - Assert.assertTrue(g.isReferenceNode(v1)); - - g.addVertices(v2, v3, v4, v5); - g.addEdge(v1, v2); - g.addEdge(v2, v3); - final BaseEdge refEdge = new BaseEdge(true, 1); - g.addEdge(v3, v4, refEdge); - g.addEdge(v4, v5); - - Assert.assertFalse(g.isRefSource(v1)); - Assert.assertFalse(g.isRefSink(v1)); - Assert.assertFalse(g.isReferenceNode(v1)); - - Assert.assertFalse(g.isRefSource(v2)); - Assert.assertFalse(g.isRefSink(v2)); - Assert.assertFalse(g.isReferenceNode(v2)); - - Assert.assertTrue(g.isRefSource(v3)); - Assert.assertFalse(g.isRefSink(v3)); - Assert.assertTrue(g.isReferenceNode(v3)); - - Assert.assertFalse(g.isRefSource(v4)); - Assert.assertTrue(g.isRefSink(v4)); - Assert.assertTrue(g.isReferenceNode(v4)); - - Assert.assertFalse(g.isRefSource(v5)); - Assert.assertFalse(g.isRefSink(v5)); - Assert.assertFalse(g.isReferenceNode(v5)); - } - - @Test - public void testRemovePathsNotConnectedToRef() throws Exception { - final SeqGraph graph = new SeqGraph(11); - - SeqVertex src = new SeqVertex("A"); - SeqVertex end = new 
SeqVertex("A"); - SeqVertex g1 = new SeqVertex("C"); - SeqVertex g2 = new SeqVertex("G"); - SeqVertex g3 = new SeqVertex("T"); - SeqVertex g4 = new SeqVertex("AA"); - SeqVertex g5 = new SeqVertex("AA"); - SeqVertex g6 = new SeqVertex("AA"); - SeqVertex g8 = new SeqVertex("AA"); - SeqVertex g7 = new SeqVertex("AA"); - SeqVertex b1 = new SeqVertex("CC"); - SeqVertex b2 = new SeqVertex("GG"); - SeqVertex b3 = new SeqVertex("TT"); - SeqVertex b4 = new SeqVertex("AAA"); - SeqVertex b5 = new SeqVertex("CCC"); - SeqVertex b6 = new SeqVertex("GGG"); - SeqVertex b7 = new SeqVertex("AAAA"); - SeqVertex b8 = new SeqVertex("GGGG"); - SeqVertex b9 = new SeqVertex("CCCC"); - - graph.addVertices(src, end, g1, g2, g3, g4, g5, g6, g7, g8); - graph.addEdges(new BaseEdge(true, 1), src, g1, g2, g4, end); - graph.addEdges(src, g1, g5, g6, g7, end); - graph.addEdges(src, g1, g5, g8, g7, end); - graph.addEdges(src, g1, g3, end); - - // the current state of the graph is the good one - final SeqGraph good = (SeqGraph)graph.clone(); - - // now add the bads to the graph - graph.addVertices(b1, b2, b3, b4, b5, b6, b7, b8, b9); - graph.addEdges(src, b1); // source -> b1 is dead - graph.addEdges(b6, src); // x -> source is bad - graph.addEdges(g4, b2); // off random vertex is bad - graph.addEdges(g3, b3, b4); // two vertices that don't connect to end are bad - graph.addEdges(end, b5); // vertex off end is bad - graph.addEdges(g3, b7, b8, b7); // cycle is bad - graph.addEdges(g3, b9, b9); // self-cycle is bad - - final boolean debug = false; - if ( debug ) good.printGraph(new File("expected.dot"), 0); - if ( debug ) graph.printGraph(new File("bad.dot"), 0); - graph.removePathsNotConnectedToRef(); - if ( debug ) graph.printGraph(new File("actual.dot"), 0); - - Assert.assertTrue(BaseGraph.graphEquals(graph, good), "Failed to remove exactly the bad nodes"); - } - - @Test - public void testRemoveVerticesNotConnectedToRefRegardlessOfEdgeDirection() throws Exception { - final SeqGraph graph = new 
SeqGraph(11); - - SeqVertex src = new SeqVertex("A"); - SeqVertex end = new SeqVertex("A"); - SeqVertex g1 = new SeqVertex("C"); - SeqVertex g2 = new SeqVertex("G"); - SeqVertex g3 = new SeqVertex("T"); - SeqVertex g4 = new SeqVertex("AA"); - SeqVertex g5 = new SeqVertex("AA"); - SeqVertex g6 = new SeqVertex("AA"); - SeqVertex g8 = new SeqVertex("AA"); - SeqVertex g7 = new SeqVertex("AA"); - SeqVertex gPrev = new SeqVertex("AA"); - SeqVertex gPrev1 = new SeqVertex("AA"); - SeqVertex gPrev2 = new SeqVertex("AA"); - SeqVertex gAfter = new SeqVertex("AA"); - SeqVertex gAfter1 = new SeqVertex("AA"); - SeqVertex gAfter2 = new SeqVertex("AA"); - SeqVertex b1 = new SeqVertex("CC"); - SeqVertex b2 = new SeqVertex("GG"); - SeqVertex b3 = new SeqVertex("TT"); - SeqVertex b4 = new SeqVertex("AAA"); - SeqVertex b5 = new SeqVertex("CCC"); - SeqVertex b6 = new SeqVertex("GGG"); - - graph.addVertices(src, end, g1, g2, g3, g4, g5, g6, g7, g8, gPrev, gPrev1, gPrev2, gAfter, gAfter1, gAfter2); - graph.addEdges(new BaseEdge(true, 1), src, g1, g2, g4, end); - graph.addEdges(src, g1, g5, g6, g7, end); - graph.addEdges(src, g1, g5, g8, g7, end); - graph.addEdges(src, g1, g3, end); - - // these should be kept, but are in the wrong direction - graph.addEdges(gPrev, src); - graph.addEdges(gPrev1, gPrev2, src); - graph.addEdges(end, gAfter); - graph.addEdges(end, gAfter1, gAfter2); - - // the current state of the graph is the good one - final SeqGraph good = (SeqGraph)graph.clone(); - - // now add the bads to the graph - graph.addVertices(b1, b2, b3, b4, b5, b6); - graph.addEdges(b2, b3); // b2 -> b3 - graph.addEdges(b4, b5, b4); // cycle - graph.addEdges(b6, b6); // isolated self cycle - - final boolean debug = false; - if ( debug ) good.printGraph(new File("expected.dot"), 0); - if ( debug ) graph.printGraph(new File("bad.dot"), 0); - graph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); - if ( debug ) graph.printGraph(new File("actual.dot"), 0); - - 
Assert.assertTrue(BaseGraph.graphEquals(graph, good), "Failed to remove exactly the bad nodes"); - } - - @Test - public void testPrintEmptyGraph() throws Exception { - final File tmp = File.createTempFile("tmp", "dot"); - tmp.deleteOnExit(); - new SeqGraph(11).printGraph(tmp, 10); - new TestGraph().printGraph(tmp, 10); - } - - @Test - public void testComplexGraph() throws Exception { - final File tmp = File.createTempFile("tmp", "dot"); - tmp.deleteOnExit(); - graph.printGraph(tmp, 10); - } - - private void assertVertexSetEquals(final Collection actual, final SeqVertex ... expected) { - final Set actualSet = new HashSet(actual); - Assert.assertEquals(actualSet.size(), actual.size(), "Duplicate elements found in vertex list"); - final Set expectedSet = expected == null ? Collections.emptySet() : new HashSet(Arrays.asList(expected)); - Assert.assertEquals(actualSet, expectedSet); - } - - @Test(enabled = true) - public void testGetBases() { - - final int kmerSize = 4; - final String testString = "AATGGGGGCAATACTA"; - - final List vertexes = new ArrayList<>(); - for ( int i = 0; i <= testString.length() - kmerSize; i++ ) { - vertexes.add(new DeBruijnVertex(testString.substring(i, i + kmerSize))); - } - - final String result = new String(new TestGraph().getBasesForPath(vertexes)); - Assert.assertEquals(result, testString.substring(kmerSize - 1)); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java deleted file mode 100644 index 63fd21d8f..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java +++ /dev/null @@ -1,185 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* 
This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; - -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.*; - -public class CommonSuffixMergerUnitTest extends BaseTest { - private final static boolean PRINT_GRAPHS = true; - - @DataProvider(name = "CompleteCycleData") - public Object[][] makeCompleteCycleData() { - return makeSplitMergeData(-1); - } - - public static class SplitMergeData { - final SeqGraph graph; - final SeqVertex v; - final String commonSuffix; - - public SplitMergeData(SeqGraph graph, SeqVertex v, String commonSuffix) { - this.graph = graph; - this.v = v; - this.commonSuffix = commonSuffix; - } - - @Override - public String toString() { - return "SplitMergeData{" + - "graph=" + graph + - ", v=" + v + - ", commonSuffix='" + commonSuffix + '\'' + - '}'; - } - } - - public static Object[][] makeSplitMergeData(final int maxTests) { - List tests = new ArrayList(); - - final List bases = Arrays.asList("A", "C", "G", "T"); - for ( final String commonSuffix : Arrays.asList("", "A", "AT") ) { - for ( final int nBots : Arrays.asList(0, 1, 2) ) { - for ( final int nMids : Arrays.asList(1, 2, 3) ) { - for ( int 
nTops = 0; nTops < nMids; nTops++ ) { - for ( int nTopConnections = 1; nTopConnections <= nMids; nTopConnections++ ) { - int multi = 1; - final SeqGraph graph = new SeqGraph(11); - final SeqVertex v = new SeqVertex("GGGG"); - graph.addVertex(v); - - final LinkedList tops = new LinkedList(); - final LinkedList mids = new LinkedList(); - - for ( int i = 0; i < nMids; i++) { - final SeqVertex mid = new SeqVertex(bases.get(i) + commonSuffix); - graph.addVertex(mid); - graph.addEdge(mid, v, new BaseEdge(i == 0, multi++)); - mids.add(mid); - - tops.add(new SeqVertex(bases.get(i))); - } - - graph.addVertices(tops); - for ( final SeqVertex t : tops ) { - for ( int i = 0; i < nTopConnections; i++ ) { - graph.addEdge(t, mids.get(i), new BaseEdge(i == 0, multi++)); - } - } - - for ( int i = 0; i < nBots; i++ ) { - final SeqVertex bot = new SeqVertex(bases.get(i)); - graph.addVertex(bot); - graph.addEdge(v, bot, new BaseEdge(i == 0, multi++)); - - } - - tests.add(new Object[]{new SplitMergeData(graph, v, commonSuffix)}); - } - } - } - } - } - - final List toUse = maxTests == -1 ? tests : tests.subList(0, Math.min(tests.size(), maxTests)); - return toUse.toArray(new Object[][]{}); - } - - public static void assertSameHaplotypes(final String name, final SeqGraph actual, final SeqGraph original) { - try { - final Set haplotypes = new HashSet(); - final List> originalPaths = new KBestPaths().getKBestPaths(original); - for ( final Path path : originalPaths ) - haplotypes.add(new String(path.getBases())); - - final List> splitPaths = new KBestPaths().getKBestPaths(actual); - for ( final Path path : splitPaths ) { - final String h = new String(path.getBases()); - Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h); - } - - if ( splitPaths.size() == originalPaths.size() ) { - for ( int i = 0; i < originalPaths.size(); i++ ) { - Assert.assertTrue(splitPaths.get(i).equalSequence(originalPaths.get(i)), "Paths not equal " + splitPaths.get(i) + " vs. 
original " + originalPaths.get(i)); - } - } - } catch ( AssertionError e ) { - if ( PRINT_GRAPHS ) original.printGraph(new File(String.format("%s.original.dot", name, actual.vertexSet().size())), 0); - if ( PRINT_GRAPHS ) actual.printGraph(new File(String.format("%s.actual.dot", name, actual.vertexSet().size())), 0); - throw e; - } - } - - @Test(dataProvider = "CompleteCycleData") - public void testMerging(final SplitMergeData data) { - final SeqGraph original = (SeqGraph)data.graph.clone(); - final SharedSequenceMerger splitter = new SharedSequenceMerger(); - splitter.merge(data.graph, data.v); - assertSameHaplotypes(String.format("suffixMerge.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original); - } - - @Test - public void testDoesntMergeSourceNodes() { - final SeqGraph g = new SeqGraph(11); - final SeqVertex v1 = new SeqVertex("A"); - final SeqVertex v2 = new SeqVertex("A"); - final SeqVertex v3 = new SeqVertex("A"); - final SeqVertex top = new SeqVertex("T"); - final SeqVertex b = new SeqVertex("C"); - g.addVertices(top, v1, v2, v3, top, b); - g.addEdges(top, v1, b); - g.addEdges(v2, b); // v2 doesn't have previous node, cannot be merged - g.addEdges(top, v3, b); - final SharedSequenceMerger merger = new SharedSequenceMerger(); - Assert.assertFalse(merger.merge(g, b), "Shouldn't be able to merge shared vertices, when one is a source"); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java deleted file mode 100644 index fa7ad9a3d..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java +++ /dev/null @@ -1,560 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* 
-* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; - -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.TextCigarCodec; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * Created with IntelliJ IDEA. 
- * User: rpoplin - * Date: 1/31/13 - */ - -public class KBestPathsUnitTest extends BaseTest { - private final static boolean DEBUG = false; - - @DataProvider(name = "BasicPathFindingData") - public Object[][] makeBasicPathFindingData() { - List tests = new ArrayList(); - for ( final boolean allowCycles : Arrays.asList(false, true)) { - for ( final int nStartNodes : Arrays.asList(1, 2, 3) ) { - for ( final int nBranchesPerBubble : Arrays.asList(2, 3) ) { - for ( final int nEndNodes : Arrays.asList(1, 2, 3) ) { - for ( final boolean addCycle : Arrays.asList(true, false) ) { - tests.add(new Object[]{nStartNodes, nBranchesPerBubble, nEndNodes, addCycle, allowCycles}); - } - } - } - } - } - - return tests.toArray(new Object[][]{}); - } - - private static int weight = 1; - final Set createVertices(final SeqGraph graph, final int n, final SeqVertex source, final SeqVertex target) { - final List seqs = Arrays.asList("A", "C", "G", "T"); - final Set vertices = new LinkedHashSet(); - for ( int i = 0; i < n; i++ ) { - final SeqVertex v = new SeqVertex(seqs.get(i)); - graph.addVertex(v); - vertices.add(v); - if ( source != null ) graph.addEdge(source, v, new BaseEdge(false, weight++)); - if ( target != null ) graph.addEdge(v, target, new BaseEdge(false, weight++)); - } - return vertices; - } - - @Test(dataProvider = "BasicPathFindingData", enabled = !DEBUG) - public void testBasicPathFinding(final int nStartNodes, final int nBranchesPerBubble, final int nEndNodes, final boolean addCycle, final boolean allowCycles) { - SeqGraph graph = new SeqGraph(11); - - final SeqVertex middleTop = new SeqVertex("GTAC"); - final SeqVertex middleBottom = new SeqVertex("ACTG"); - graph.addVertices(middleTop, middleBottom); - final Set starts = createVertices(graph, nStartNodes, null, middleTop); - final Set bubbles = createVertices(graph, nBranchesPerBubble, middleTop, middleBottom); - final Set ends = createVertices(graph, nEndNodes, middleBottom, null); - - if ( addCycle ) 
graph.addEdge(middleBottom, middleBottom); - - // enumerate all possible paths - final List> paths = new KBestPaths(allowCycles).getKBestPaths(graph, starts, ends); - - final int expectedNumOfPaths = nStartNodes * nBranchesPerBubble * (addCycle && allowCycles ? 2 : 1) * nEndNodes; - Assert.assertEquals(paths.size(), expectedNumOfPaths, "Didn't find the expected number of paths"); - - int lastScore = Integer.MAX_VALUE; - for ( final Path path : paths ) { - Assert.assertTrue(path.getScore() <= lastScore, "Paths out of order. Path " + path + " has score above previous " + lastScore); - lastScore = path.getScore(); - } - - // get the best path, and make sure it's the same as our optimal path overall - final Path best = paths.get(0); - final List> justOne = new KBestPaths(allowCycles).getKBestPaths(graph, 1, starts, ends); - Assert.assertEquals(justOne.size(), 1); - Assert.assertTrue(justOne.get(0).pathsAreTheSame(best), "Best path from complete enumerate " + best + " not the same as from k = 1 search " + justOne.get(0)); - } - - @Test(enabled = !DEBUG) - public void testPathFindingComplexCycle() { - SeqGraph graph = new SeqGraph(11); - - final SeqVertex v1 = new SeqVertex("A"); - final SeqVertex v2 = new SeqVertex("C"); - final SeqVertex v3 = new SeqVertex("G"); - final SeqVertex v4 = new SeqVertex("T"); - final SeqVertex v5 = new SeqVertex("AA"); - graph.addVertices(v1, v2, v3, v4, v5); - graph.addEdges(v1, v2, v3, v4, v5); - graph.addEdges(v3, v3); - graph.addEdges(v4, v2); - - // enumerate all possible paths - final List> paths = new KBestPaths(false).getKBestPaths(graph, v1, v5); - - Assert.assertEquals(paths.size(), 1, "Didn't find the expected number of paths"); - } - - @Test(enabled = !DEBUG) - public void testPathFindingCycleLastNode() { - SeqGraph graph = new SeqGraph(11); - - final SeqVertex v1 = new SeqVertex("A"); - final SeqVertex v2 = new SeqVertex("C"); - final SeqVertex v3 = new SeqVertex("G"); - graph.addVertices(v1, v2, v3); - graph.addEdges(v1, v2, 
v3, v3); - - // enumerate all possible paths - final List> paths = new KBestPaths(false).getKBestPaths(graph, v1, v3); - - Assert.assertEquals(paths.size(), 1, "Didn't find the expected number of paths"); - } - - @DataProvider(name = "BasicBubbleDataProvider") - public Object[][] makeBasicBubbleDataProvider() { - List tests = new ArrayList(); - for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) { - for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) { - tests.add(new Object[]{refBubbleLength, altBubbleLength}); - } - } - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "BasicBubbleDataProvider", enabled = !DEBUG) - public void testBasicBubbleData(final int refBubbleLength, final int altBubbleLength) { - // Construct the assembly graph - SeqGraph graph = new SeqGraph(3); - final String preRef = "ATGG"; - final String postRef = "GGGGC"; - - SeqVertex v = new SeqVertex(preRef); - SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength)); - SeqVertex v2Alt = new SeqVertex(Utils.dupString('A', altBubbleLength-1) + "T"); - SeqVertex v3 = new SeqVertex(postRef); - - graph.addVertex(v); - graph.addVertex(v2Ref); - graph.addVertex(v2Alt); - graph.addVertex(v3); - graph.addEdge(v, v2Ref, new BaseEdge(true, 10)); - graph.addEdge(v2Ref, v3, new BaseEdge(true, 10)); - graph.addEdge(v, v2Alt, new BaseEdge(false, 5)); - graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); - - // Construct the test path - Path path = new Path(v, graph); - path = new Path(path, graph.getEdge(v, v2Alt)); - path = new Path(path, graph.getEdge(v2Alt, v3)); - - // Construct the actual cigar string implied by the test path - Cigar expectedCigar = new Cigar(); - expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M)); - if( refBubbleLength > altBubbleLength ) { - expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); - expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M)); - } else if ( 
refBubbleLength < altBubbleLength ) { - expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); - expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I)); - } else { - expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); - } - expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M)); - - final String ref = preRef + v2Ref.getSequenceString() + postRef; - Assert.assertEquals(path.calculateCigar(ref.getBytes()).toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch"); - } - - @DataProvider(name = "GetBasesData") - public Object[][] makeGetBasesData() { - List tests = new ArrayList(); - - final List frags = Arrays.asList("ACT", "GAC", "CAT"); - - for ( int n = 1; n <= frags.size(); n++ ) { - for ( final List comb : Utils.makePermutations(frags, n, false) ) { - tests.add(new Object[]{comb}); - } - } - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "GetBasesData", enabled = !DEBUG) - public void testGetBases(final List frags) { - // Construct the assembly graph - SeqGraph graph = new SeqGraph(3); - - SeqVertex prev = null; - for ( int i = 0; i < frags.size(); i++ ) { - SeqVertex v = new SeqVertex(frags.get(i)); - graph.addVertex(v); - if ( prev != null ) - graph.addEdge(prev, v); - prev = v; - } - - // enumerate all possible paths - final List> paths = new KBestPaths().getKBestPaths(graph); - Assert.assertEquals(paths.size(), 1); - final Path path = paths.get(0); - Assert.assertEquals(new String(path.getBases()), Utils.join("", frags), "Path doesn't have the expected sequence"); - } - - @DataProvider(name = "TripleBubbleDataProvider") - public Object[][] makeTripleBubbleDataProvider() { - List tests = new ArrayList(); - for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) { - for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) { - for ( final boolean offRefEnding : Arrays.asList(true, false) ) { - for ( final boolean 
offRefBeginning : Arrays.asList(false) ) { - tests.add(new Object[]{refBubbleLength, altBubbleLength, offRefBeginning, offRefEnding}); - } - } - } - } - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "TripleBubbleDataProvider", enabled = !DEBUG) - public void testTripleBubbleData(final int refBubbleLength, final int altBubbleLength, final boolean offRefBeginning, final boolean offRefEnding) { - // Construct the assembly graph - SeqGraph graph = new SeqGraph(11); - final String preAltOption = "ATCGATCGATCGATCGATCG"; - final String postAltOption = "CCCC"; - final String preRef = "ATGG"; - final String postRef = "GGCCG"; - final String midRef1 = "TTCCT"; - final String midRef2 = "CCCAAAAAAAAAAAA"; - - SeqVertex preV = new SeqVertex(preAltOption); - SeqVertex v = new SeqVertex(preRef); - SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength)); - SeqVertex v2Alt = new SeqVertex(Utils.dupString('A', altBubbleLength-1) + "T"); - SeqVertex v4Ref = new SeqVertex(Utils.dupString('C', refBubbleLength)); - SeqVertex v4Alt = new SeqVertex(Utils.dupString('C', altBubbleLength-1) + "T"); - SeqVertex v6Ref = new SeqVertex(Utils.dupString('G', refBubbleLength)); - SeqVertex v6Alt = new SeqVertex(Utils.dupString('G', altBubbleLength-1) + "T"); - SeqVertex v3 = new SeqVertex(midRef1); - SeqVertex v5 = new SeqVertex(midRef2); - SeqVertex v7 = new SeqVertex(postRef); - SeqVertex postV = new SeqVertex(postAltOption); - - final String ref = preRef + v2Ref.getSequenceString() + midRef1 + v4Ref.getSequenceString() + midRef2 + v6Ref.getSequenceString() + postRef; - - graph.addVertex(preV); - graph.addVertex(v); - graph.addVertex(v2Ref); - graph.addVertex(v2Alt); - graph.addVertex(v3); - graph.addVertex(v4Ref); - graph.addVertex(v4Alt); - graph.addVertex(v5); - graph.addVertex(v6Ref); - graph.addVertex(v6Alt); - graph.addVertex(v7); - graph.addVertex(postV); - graph.addEdge(preV, v, new BaseEdge(false, 1)); - graph.addEdge(v, v2Ref, new BaseEdge(true, 
10)); - graph.addEdge(v2Ref, v3, new BaseEdge(true, 10)); - graph.addEdge(v, v2Alt, new BaseEdge(false, 5)); - graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); - graph.addEdge(v3, v4Ref, new BaseEdge(true, 10)); - graph.addEdge(v4Ref, v5, new BaseEdge(true, 10)); - graph.addEdge(v3, v4Alt, new BaseEdge(false, 5)); - graph.addEdge(v4Alt, v5, new BaseEdge(false, 5)); - graph.addEdge(v5, v6Ref, new BaseEdge(true, 11)); - graph.addEdge(v6Ref, v7, new BaseEdge(true, 11)); - graph.addEdge(v5, v6Alt, new BaseEdge(false, 55)); - graph.addEdge(v6Alt, v7, new BaseEdge(false, 55)); - graph.addEdge(v7, postV, new BaseEdge(false, 1)); - - // Construct the test path - Path path = new Path( (offRefBeginning ? preV : v), graph); - if( offRefBeginning ) { - path = new Path(path, graph.getEdge(preV, v)); - } - path = new Path(path, graph.getEdge(v, v2Alt)); - path = new Path(path, graph.getEdge(v2Alt, v3)); - path = new Path(path, graph.getEdge(v3, v4Ref)); - path = new Path(path, graph.getEdge(v4Ref, v5)); - path = new Path(path, graph.getEdge(v5, v6Alt)); - path = new Path(path, graph.getEdge(v6Alt, v7)); - if( offRefEnding ) { - path = new Path(path, graph.getEdge(v7,postV)); - } - - // Construct the actual cigar string implied by the test path - Cigar expectedCigar = new Cigar(); - if( offRefBeginning ) { - expectedCigar.add(new CigarElement(preAltOption.length(), CigarOperator.I)); - } - expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M)); - // first bubble - if( refBubbleLength > altBubbleLength ) { - expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); - expectedCigar.add(new CigarElement(altBubbleLength,CigarOperator.M)); - } else if ( refBubbleLength < altBubbleLength ) { - expectedCigar.add(new CigarElement(refBubbleLength,CigarOperator.M)); - expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I)); - } else { - expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); - } - 
expectedCigar.add(new CigarElement(midRef1.length(), CigarOperator.M)); - // second bubble is ref path - expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); - expectedCigar.add(new CigarElement(midRef2.length(), CigarOperator.M)); - // third bubble - if( refBubbleLength > altBubbleLength ) { - expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); - expectedCigar.add(new CigarElement(altBubbleLength,CigarOperator.M)); - } else if ( refBubbleLength < altBubbleLength ) { - expectedCigar.add(new CigarElement(refBubbleLength,CigarOperator.M)); - expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I)); - } else { - expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); - } - expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M)); - if( offRefEnding ) { - expectedCigar.add(new CigarElement(postAltOption.length(), CigarOperator.I)); - } - - Assert.assertEquals(path.calculateCigar(ref.getBytes()).toString(), - AlignmentUtils.consolidateCigar(expectedCigar).toString(), - "Cigar string mismatch: ref = " + ref + " alt " + new String(path.getBases())); - } - - @Test(enabled = !DEBUG) - public void testIntraNodeInsertionDeletion() { - // Construct the assembly graph - SeqGraph graph = new SeqGraph(11); - final SeqVertex top = new SeqVertex("T"); - final SeqVertex bot = new SeqVertex("T"); - final SeqVertex alt = new SeqVertex("AAACCCCC"); - final SeqVertex ref = new SeqVertex("CCCCCGGG"); - - graph.addVertices(top, bot, alt, ref); - graph.addEdges(new BaseEdge(true, 1), top, ref, bot); - graph.addEdges(new BaseEdge(false, 1), top, alt, bot); - - final KBestPaths pathFinder = new KBestPaths(); - final List> paths = pathFinder.getKBestPaths(graph, top, bot); - - Assert.assertEquals(paths.size(), 2); - - final Path refPath = paths.get(0); - final Path altPath = paths.get(1); - - final String refString = top.getSequenceString() + ref.getSequenceString() + 
bot.getSequenceString(); - Assert.assertEquals(refPath.calculateCigar(refString.getBytes()).toString(), "10M"); - Assert.assertEquals(altPath.calculateCigar(refString.getBytes()).toString(), "1M3I5M3D1M"); - } - - @Test(enabled = !DEBUG) - public void testHardSWPath() { - // Construct the assembly graph - SeqGraph graph = new SeqGraph(11); - final SeqVertex top = new SeqVertex( "NNN" ); - final SeqVertex bot = new SeqVertex( "NNN" ); - final SeqVertex alt = new SeqVertex( "ACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" ); - final SeqVertex ref = new SeqVertex( "TGTGTGTGTGTGTGACAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" ); - graph.addVertices(top, bot, alt, ref); - graph.addEdges(new BaseEdge(true, 1), top, ref, bot); - graph.addEdges(new BaseEdge(false, 1), top, alt, bot); - - final KBestPaths pathFinder = new KBestPaths(); - final List> paths = pathFinder.getKBestPaths(graph, top, bot); - - Assert.assertEquals(paths.size(), 2); - - final Path refPath = paths.get(0); - final Path altPath = paths.get(1); - - final String refString = top.getSequenceString() + ref.getSequenceString() + bot.getSequenceString(); - - logger.warn("RefPath : " + refPath + " cigar " + refPath.calculateCigar(refString.getBytes())); - logger.warn("AltPath : " + altPath + " cigar " + altPath.calculateCigar(refString.getBytes())); - - Assert.assertEquals(refPath.calculateCigar(refString.getBytes()).toString(), "51M"); - Assert.assertEquals(altPath.calculateCigar(refString.getBytes()).toString(), "3M6I48M"); - } - - // ----------------------------------------------------------------- - // - // Systematic tests to ensure that we get the correct SW result for - // a variety of variants in the ref vs alt bubble - // - // ----------------------------------------------------------------- - - @DataProvider(name = "SystematicRefAltSWTestData") - public Object[][] makeSystematicRefAltSWTestData() { - List tests = new ArrayList(); - - final List> allDiffs = Arrays.asList( - Arrays.asList("G", "C", "1M"), - 
Arrays.asList("G", "", "1D"), - Arrays.asList("", "C", "1I"), - Arrays.asList("AAA", "CGT", "3M"), - Arrays.asList("TAT", "CAC", "3M"), - Arrays.asList("GCTG", "GTCG", "4M"), - Arrays.asList("AAAAA", "", "5D"), - Arrays.asList("", "AAAAA", "5I"), - Arrays.asList("AAAAACC", "CCGGGGGG", "5D2M6I") - ); - - for ( final String prefix : Arrays.asList("", "X", "XXXXXXXXXXXXX")) { - for ( final String end : Arrays.asList("", "X", "XXXXXXXXXXXXX")) { - for ( final List diffs : allDiffs ) - tests.add(new Object[]{prefix, end, diffs.get(0), diffs.get(1), diffs.get(2)}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "SystematicRefAltSWTestData", enabled = !DEBUG) - public void testRefAltSW(final String prefix, final String end, final String refMid, final String altMid, final String midCigar) { - // Construct the assembly graph - SeqGraph graph = new SeqGraph(11); - - final int padSize = 0; - SeqVertex top = new SeqVertex(Utils.dupString("N", padSize)); - SeqVertex ref = new SeqVertex(prefix + refMid + end); - SeqVertex alt = new SeqVertex(prefix + altMid + end); - SeqVertex bot = new SeqVertex(Utils.dupString("N", padSize)); - - graph.addVertices(top, ref, alt, bot); - graph.addEdges(new BaseEdge(true, 1), top, ref, bot); - graph.addEdges(new BaseEdge(false, 1), top, alt, bot); - - // Construct the test path - Path path = Path.makePath(Arrays.asList(top, alt, bot), graph); - - Cigar expected = new Cigar(); - expected.add(new CigarElement(padSize, CigarOperator.M)); - if ( ! prefix.equals("") ) expected.add(new CigarElement(prefix.length(), CigarOperator.M)); - for ( final CigarElement elt : TextCigarCodec.getSingleton().decode(midCigar).getCigarElements() ) expected.add(elt); - if ( ! 
end.equals("") ) expected.add(new CigarElement(end.length(), CigarOperator.M)); - expected.add(new CigarElement(padSize, CigarOperator.M)); - expected = AlignmentUtils.consolidateCigar(expected); - - final String refString = top.getSequenceString() + ref.getSequenceString() + bot.getSequenceString(); - final Cigar pathCigar = path.calculateCigar(refString.getBytes()); - - logger.warn("diffs: " + ref + " vs. " + alt + " cigar " + midCigar); - logger.warn("Path " + path + " with cigar " + pathCigar); - logger.warn("Expected cigar " + expected); - - Assert.assertEquals(pathCigar, expected, "Cigar mismatch: ref = " + refString + " vs alt = " + new String(path.getBases())); - } - - @Test(enabled = !DEBUG) - public void testLeftAlignCigarSequentially() { - String preRefString = "GATCGATCGATC"; - String postRefString = "TTT"; - String refString = "ATCGAGGAGAGCGCCCCG"; - String indelString1 = "X"; - String indelString2 = "YZ"; - int refIndel1 = 10; - int refIndel2 = 12; - - for ( final int indelSize1 : Arrays.asList(1, 2, 3, 4) ) { - for ( final int indelOp1 : Arrays.asList(1, -1) ) { - for ( final int indelSize2 : Arrays.asList(1, 2, 3, 4) ) { - for ( final int indelOp2 : Arrays.asList(1, -1) ) { - - Cigar expectedCigar = new Cigar(); - expectedCigar.add(new CigarElement(refString.length(), CigarOperator.M)); - expectedCigar.add(new CigarElement(indelSize1, (indelOp1 > 0 ? CigarOperator.I : CigarOperator.D))); - expectedCigar.add(new CigarElement((indelOp1 < 0 ? refIndel1 - indelSize1 : refIndel1), CigarOperator.M)); - expectedCigar.add(new CigarElement(refString.length(), CigarOperator.M)); - expectedCigar.add(new CigarElement(indelSize2 * 2, (indelOp2 > 0 ? CigarOperator.I : CigarOperator.D))); - expectedCigar.add(new CigarElement((indelOp2 < 0 ? 
(refIndel2 - indelSize2) * 2 : refIndel2 * 2), CigarOperator.M)); - expectedCigar.add(new CigarElement(refString.length(), CigarOperator.M)); - - Cigar givenCigar = new Cigar(); - givenCigar.add(new CigarElement(refString.length() + refIndel1/2, CigarOperator.M)); - givenCigar.add(new CigarElement(indelSize1, (indelOp1 > 0 ? CigarOperator.I : CigarOperator.D))); - givenCigar.add(new CigarElement((indelOp1 < 0 ? (refIndel1/2 - indelSize1) : refIndel1/2) + refString.length() + refIndel2/2 * 2, CigarOperator.M)); - givenCigar.add(new CigarElement(indelSize2 * 2, (indelOp2 > 0 ? CigarOperator.I : CigarOperator.D))); - givenCigar.add(new CigarElement((indelOp2 < 0 ? (refIndel2/2 - indelSize2) * 2 : refIndel2/2 * 2) + refString.length(), CigarOperator.M)); - - String theRef = preRefString + refString + Utils.dupString(indelString1, refIndel1) + refString + Utils.dupString(indelString2, refIndel2) + refString + postRefString; - String theRead = refString + Utils.dupString(indelString1, refIndel1 + indelOp1 * indelSize1) + refString + Utils.dupString(indelString2, refIndel2 + indelOp2 * indelSize2) + refString; - - Cigar calculatedCigar = Path.leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(givenCigar), theRef.getBytes(), theRead.getBytes(), preRefString.length(), 0); - Assert.assertEquals(AlignmentUtils.consolidateCigar(calculatedCigar).toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar strings do not match!"); - } - } - } - } - } - - @Test(enabled = true) - public void testLeftAlignCigarSequentiallyAdjacentID() { - final String ref = "GTCTCTCTCTCTCTCTCTATATATATATATATATTT"; - final String hap = "GTCTCTCTCTCTCTCTCTCTCTATATATATATATTT"; - final Cigar originalCigar = TextCigarCodec.getSingleton().decode("18M4I12M4D2M"); - - final Cigar result = Path.leftAlignCigarSequentially(originalCigar, ref.getBytes(), hap.getBytes(), 0, 0); - logger.warn("Result is " + result); - Assert.assertEquals(originalCigar.getReferenceLength(), 
result.getReferenceLength(), "Reference lengths are different"); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteUnitTest.java deleted file mode 100644 index 2918501b2..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteUnitTest.java +++ /dev/null @@ -1,261 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; - -import org.broadinstitute.sting.BaseTest; -import org.jgrapht.EdgeFactory; -import org.testng.Assert; -import org.testng.Reporter; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * Created with IntelliJ IDEA. - * User: valentin - * Date: 9/5/13 - * Time: 11:04 AM - * To change this template use File | Settings | File Templates. 
- */ -public class RouteUnitTest extends BaseTest { - - - @Test(dataProvider="isSuffixTestData") - public void testIsSuffix(final Route route, final Path path, final boolean expectedResult) { - Assert.assertEquals(route.isSuffix(path), expectedResult); - } - - @DataProvider(name="isSuffixTestData") - public Iterator isSuffixTestData() { - return TEST_DATA.iterator(); - } - - private static final int[] TEST_EDGE_PAIRS1 = new int[] { - 3 , 4, - 4 , 5, - 5, 7, - 7, 8, - 8, 9, - 4 , 6, - 6, 9, - 9, 11, - 11, 12, - }; - - - - private static final int[] TEST_EDGE_PAIRS = new int[] { - 1 , 2, - 2 , 3, - 3 , 4, - 4 , 5, - 5, 7, - 7, 8, - 8, 9, - 4 , 6, - 6, 9, - 9, 10, - 10, 11, - 11, 12, - 2, 5, - 5, 12, - - 3, 13, - 13, 14, - 14, 15 - }; - - public static final EdgeFactory TEST_GRAPH_EDGE_FACTORY = new EdgeFactory() { - @Override - public BaseEdge createEdge(final BaseVertex baseVertex, final BaseVertex baseVertex2) { - return new BaseEdge(false, 0); - } - }; - - - private static Map vertexByInteger = new HashMap<>(); - private static final BaseGraph TEST_GRAPH = new BaseGraph<>(1, TEST_GRAPH_EDGE_FACTORY); - private static final List TEST_DATA; - - - static { - for (int i = 0; i < TEST_EDGE_PAIRS.length; i += 2) { - final int sourceInteger = TEST_EDGE_PAIRS[i]; - final int targetInteger = TEST_EDGE_PAIRS[i + 1]; - final BaseVertex sourceVertex = resolveVertexByInteger(sourceInteger); - final BaseVertex targetVertex = resolveVertexByInteger(targetInteger); - TEST_GRAPH.addEdge(sourceVertex, targetVertex); - } - Assert.assertEquals(1,TEST_GRAPH.getSources().size()); - final Deque> pendingPaths = new LinkedList<>(); - final Deque> pendingRoutes = new LinkedList<>(); - final List> allPossiblePaths = new LinkedList<>(); - final List> allPossibleRoutes = new LinkedList<>(); - for (final BaseVertex vertex : TEST_GRAPH.vertexSet()) { - pendingPaths.add(new Path(vertex, TEST_GRAPH)); - pendingRoutes.add(new Route(vertex,TEST_GRAPH)); - } - while (!pendingPaths.isEmpty()) { // 
!pendingRoutes.isEmpty(); - final Path path = pendingPaths.remove(); - final Route route = pendingRoutes.remove(); - final BaseVertex lastVertex = path.getLastVertex(); - allPossiblePaths.add(path); - allPossibleRoutes.add(route); - - if (allPossiblePaths.size() % 100 == 0) - Reporter.log("" + allPossiblePaths.size(), true); - for (final BaseEdge edge : TEST_GRAPH.outgoingEdgesOf(lastVertex)) - pendingPaths.add(new Path<>(path,edge)); - for (final BaseEdge edge : TEST_GRAPH.outgoingEdgesOf(lastVertex)) - pendingRoutes.add(new Route<>(route,edge)); - } - - final int numberOfPaths = allPossiblePaths.size(); - final boolean[][] isSuffix = buildIsSuffixMatrix(allPossiblePaths, numberOfPaths); - TEST_DATA = createTestData(allPossiblePaths,allPossibleRoutes,isSuffix); - } - - private static boolean[][] buildIsSuffixMatrix(final List> allPossiblePaths, final int numberOfPaths) { - final boolean[][] isSuffix = new boolean[numberOfPaths][numberOfPaths]; - final ListIterator> iIterator = allPossiblePaths.listIterator(); - for (int i = 0; i < numberOfPaths; i++) { - isSuffix[i][i] = true; - final ListIterator> jIterator = allPossiblePaths.listIterator(i + 1); - final Path iPath = iIterator.next(); - for (int j = i + 1; j < numberOfPaths; j++) { - final Path jPath = jIterator.next(); - if (iPath.getLastVertex() != jPath.getLastVertex()) { - isSuffix[i][j] = isSuffix[j][i] = false; - } else { - isSuffix[i][j] = isSuffix[j][i] = true; // let assume they are suffix of each other by default. 
- final Path shortPath; - final Path longPath; - if (iPath.getEdges().size() <= jPath.getEdges().size()) { - shortPath = iPath; - longPath = jPath; - } else { - longPath = iPath; - shortPath = jPath; - } - final ListIterator longPathEdgesIterator = longPath.getEdges().listIterator(longPath.getEdges().size()); - final ListIterator shortPathEdgesIterator = shortPath.getEdges().listIterator(shortPath.getEdges().size()); - - while (shortPathEdgesIterator.hasPrevious()) { - final BaseEdge shortEdge = shortPathEdgesIterator.previous(); - final BaseEdge longEdge = longPathEdgesIterator.previous(); - if (shortEdge != longEdge) { - isSuffix[i][j] = isSuffix[j][i] = false; - break; - } - } - if (isSuffix[i][j]) { - if (longPathEdgesIterator.hasPrevious()) { - if (longPath == iPath) - isSuffix[j][i] = false; - else - isSuffix[i][j] = false; - } - } - } - - } - } - return isSuffix; - } - - private static List createTestData(final List> allPossiblePaths, final List> allPossibleRoutes, final boolean[][] isSuffix) { - final List result = new ArrayList<>(allPossiblePaths.size() * allPossiblePaths.size() * 2 ); - final Path[] allPaths = allPossiblePaths.toArray(new Path[allPossiblePaths.size()]); - final Route[] allRoutes = allPossibleRoutes.toArray(new Route[allPossibleRoutes.size()]); - final int numberOfPaths = allPaths.length; - for (int i = 0; i < numberOfPaths; i++) - for (int j = 0; j < numberOfPaths; j++) { - result.add(new Object[] { allRoutes[i], allPaths[j], isSuffix[i][j] }); - result.add(new Object[] { allRoutes[i], allRoutes[j], isSuffix[i][j] }); - result.add(new Object[] { allRoutes[i], inverseRebuild(allRoutes[j]), isSuffix[i][j]}); - } - - return result; - } - - private static Route inverseRebuild(final Route original) { - final ListIterator it = original.getEdges().listIterator(original.length()); - Route result = new Route<>(original.getLastVertex(),original.getGraph()); - while (it.hasPrevious()) { - result = new Route<>(it.previous(),result); - } - return 
result; - } - - private static BaseVertex resolveVertexByInteger(final int targetInteger) { - if (vertexByInteger.containsKey(targetInteger)) - return vertexByInteger.get(targetInteger); - else { - int value = targetInteger; - final StringBuffer stringBuffer = new StringBuffer(); - while (value > 0) { - int c = value % 4; - switch (c) { - case 0: stringBuffer.append('A'); break; - case 1: stringBuffer.append('C'); break; - case 2: stringBuffer.append('G'); break; - case 3: stringBuffer.append('T'); break; - } - value = value / 4; - } - if (stringBuffer.length() == 0) stringBuffer.append('A'); - final byte[] sequence = stringBuffer.reverse().toString().getBytes(); - final BaseVertex result = new BaseVertex(sequence); - vertexByInteger.put(targetInteger, result); - TEST_GRAPH.addVertex(result); - return result; - } - - } - - -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java deleted file mode 100644 index bb504b78c..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java +++ /dev/null @@ -1,294 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.*; - -public class SharedVertexSequenceSplitterUnitTest extends BaseTest { - private final static boolean PRINT_GRAPHS = false; - - @DataProvider(name = "PrefixSuffixData") - public Object[][] makePrefixSuffixData() { - List tests = new ArrayList(); - - tests.add(new Object[]{Arrays.asList("A", "C"), 0, 0}); - tests.add(new Object[]{Arrays.asList("C", "C"), 1, 0}); - tests.add(new Object[]{Arrays.asList("ACT", "AGT"), 1, 1}); - tests.add(new Object[]{Arrays.asList("ACCT", "AGT"), 1, 1}); - tests.add(new Object[]{Arrays.asList("ACT", "ACT"), 3, 0}); - tests.add(new Object[]{Arrays.asList("ACTA", "ACT"), 3, 0}); - tests.add(new Object[]{Arrays.asList("ACTA", "ACTG"), 3, 0}); - tests.add(new Object[]{Arrays.asList("ACTA", "ACTGA"), 3, 1}); - tests.add(new Object[]{Arrays.asList("GCTGA", "ACTGA"), 0, 4}); - - tests.add(new Object[]{Arrays.asList("A", "C", "A"), 0, 0}); - tests.add(new Object[]{Arrays.asList("A", "A", 
"A"), 1, 0}); - tests.add(new Object[]{Arrays.asList("A", "AA", "A"), 1, 0}); - tests.add(new Object[]{Arrays.asList("A", "ACA", "A"), 1, 0}); - tests.add(new Object[]{Arrays.asList("ACT", "ACAT", "ACT"), 2, 1}); - tests.add(new Object[]{Arrays.asList("ACT", "ACAT", "ACGT"), 2, 1}); - tests.add(new Object[]{Arrays.asList("AAAT", "AAA", "CAAA"), 0, 0}); - tests.add(new Object[]{Arrays.asList("AACTTT", "AAGTTT", "AAGCTTT"), 2, 3}); - tests.add(new Object[]{Arrays.asList("AAA", "AAA", "CAAA"), 0, 3}); - tests.add(new Object[]{Arrays.asList("AAA", "AAA", "AAA"), 3, 0}); - - tests.add(new Object[]{Arrays.asList("AC", "ACA", "AC"), 2, 0}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "PrefixSuffixData") - public void testPrefixSuffix(final List strings, int expectedPrefixLen, int expectedSuffixLen) { - final List bytes = new ArrayList(); - int min = Integer.MAX_VALUE; - for ( final String s : strings ) { - bytes.add(s.getBytes()); - min = Math.min(min, s.length()); - } - - final int actualPrefixLen = GraphUtils.compPrefixLen(bytes, min); - Assert.assertEquals(actualPrefixLen, expectedPrefixLen, "Failed prefix test"); - - final int actualSuffixLen = GraphUtils.compSuffixLen(bytes, min - actualPrefixLen); - Assert.assertEquals(actualSuffixLen, expectedSuffixLen, "Failed suffix test"); - } - - @Test(dataProvider = "PrefixSuffixData") - public void testPrefixSuffixVertices(final List strings, int expectedPrefixLen, int expectedSuffixLen) { - final List v = new ArrayList(); - for ( final String s : strings ) { - v.add(new SeqVertex(s)); - } - - final String expectedPrefix = strings.get(0).substring(0, expectedPrefixLen); - final String expectedSuffix = strings.get(0).substring(strings.get(0).length() - expectedSuffixLen); - - final Pair result = SharedVertexSequenceSplitter.commonPrefixAndSuffixOfVertices(v); - Assert.assertEquals(result.getFirst().getSequenceString(), expectedPrefix, "Failed suffix test"); - 
Assert.assertEquals(result.getSecond().getSequenceString(), expectedSuffix, "Failed suffix test"); - - Assert.assertEquals(result.getFirst().isEmpty(), expectedPrefix.isEmpty()); - Assert.assertEquals(result.getSecond().isEmpty(), expectedSuffix.isEmpty()); - } - - @Test(dataProvider = "PrefixSuffixData") - public void testSplitter(final List strings, int expectedPrefixLen, int expectedSuffixLen) { - final SeqGraph graph = new SeqGraph(11); - - final List v = new ArrayList(); - for ( final String s : strings ) { - v.add(new SeqVertex(s)); - } - - graph.addVertices(v.toArray(new SeqVertex[]{})); - - final String expectedPrefix = strings.get(0).substring(0, expectedPrefixLen); - final String expectedSuffix = strings.get(0).substring(strings.get(0).length() - expectedSuffixLen); - - final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); - splitter.split(); -// splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".dot"), 0); - - Assert.assertEquals(splitter.prefixV.getSequenceString(), expectedPrefix); - Assert.assertEquals(splitter.suffixV.getSequenceString(), expectedSuffix); - - Assert.assertTrue(splitter.splitGraph.outDegreeOf(splitter.prefixV) <= strings.size()); - Assert.assertEquals(splitter.splitGraph.inDegreeOf(splitter.prefixV), 0); - - Assert.assertTrue(splitter.splitGraph.inDegreeOf(splitter.suffixV) <= strings.size()); - Assert.assertEquals(splitter.splitGraph.outDegreeOf(splitter.suffixV), 0); - - for ( final SeqVertex mid : splitter.newMiddles ) { - Assert.assertNotNull(splitter.splitGraph.getEdge(splitter.prefixV, mid)); - Assert.assertNotNull(splitter.splitGraph.getEdge(mid, splitter.suffixV)); - } - } - - @DataProvider(name = "CompleteCycleData") - public Object[][] makeCompleteCycleData() { - List tests = new ArrayList(); - - for ( final boolean hasTop : Arrays.asList(true, false) ) { - for ( final boolean hasBot : Arrays.asList(true, false) ) { - if ( ! hasTop && ! 
hasBot ) continue; - tests.add(new Object[]{Arrays.asList("A", "A"), hasTop, hasBot}); - tests.add(new Object[]{Arrays.asList("A", "C"), hasTop, hasBot}); - tests.add(new Object[]{Arrays.asList("A", "AC"), hasTop, hasBot}); - tests.add(new Object[]{Arrays.asList("A", "CA"), hasTop, hasBot}); - tests.add(new Object[]{Arrays.asList("A", "ACA"), hasTop, hasBot}); - tests.add(new Object[]{Arrays.asList("AC", "ACA"), hasTop, hasBot}); - tests.add(new Object[]{Arrays.asList("AT", "ACA"), hasTop, hasBot}); - tests.add(new Object[]{Arrays.asList("ATA", "ACA"), hasTop, hasBot}); - tests.add(new Object[]{Arrays.asList("ATAA", "ACA"), hasTop, hasBot}); - tests.add(new Object[]{Arrays.asList("ATAACA", "ACA"), hasTop, hasBot}); - tests.add(new Object[]{Arrays.asList("CCCAAA", "AAA"), hasTop, hasBot}); - tests.add(new Object[]{Arrays.asList("CCCAAAAAA", "AAA"), hasTop, hasBot}); - tests.add(new Object[]{Arrays.asList("CCCAAAAAA", "CCCAAA"), hasTop, hasBot}); - - tests.add(new Object[]{Arrays.asList("A", "A", "A"), hasTop, hasBot}); - tests.add(new Object[]{Arrays.asList("A", "A", "C"), hasTop, hasBot}); - tests.add(new Object[]{Arrays.asList("A", "C", "C"), hasTop, hasBot}); - tests.add(new Object[]{Arrays.asList("AC", "C", "C"), hasTop, hasBot}); - tests.add(new Object[]{Arrays.asList("CA", "C", "C"), hasTop, hasBot}); - // all merged - tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGA"), hasTop, hasBot}); - // prefix and suffix - tests.add(new Object[]{Arrays.asList("AGA", "AGA", "ACA"), hasTop, hasBot}); - // 2 -> prefix, leave C - tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGAC"), hasTop, hasBot}); - // 2 -> prefix, leave CCC - tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGACCC"), hasTop, hasBot}); - // 2 -> suffix, leave A/T - tests.add(new Object[]{Arrays.asList("TAGA", "TAGA", "AAGA"), hasTop, hasBot}); - // 2 -> suffix, leave T, delete 1 - tests.add(new Object[]{Arrays.asList("TAGA", "TAGA", "AGA"), hasTop, hasBot}); - } - } - - return 
tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "CompleteCycleData") - public void testSplitterCompleteCycle(final List strings, final boolean hasTop, final boolean hasBot) { - final SeqGraph graph = new SeqGraph(11); - - int edgeWeight = 1; - final SeqVertex top = hasTop ? new SeqVertex("AAAAAAAA") : null; - final SeqVertex bot = hasBot ? new SeqVertex("GGGGGGGG") : null; - final List v = new ArrayList(); - for ( final String s : strings ) { - v.add(new SeqVertex(s)); - } - graph.addVertices(v.toArray(new SeqVertex[]{})); - final SeqVertex first = v.get(0); - - if ( hasTop ) { - graph.addVertex(top); - for ( final SeqVertex vi : v ) - graph.addEdge(top, vi, new BaseEdge(vi == first, edgeWeight++)); - } - - if ( hasBot ) { - graph.addVertex(bot); - for ( final SeqVertex vi : v ) - graph.addEdge(vi, bot, new BaseEdge(vi == first, edgeWeight++)); - } - - final Set haplotypes = new HashSet(); - final List> originalPaths = new KBestPaths().getKBestPaths((SeqGraph)graph.clone()); - for ( final Path path : originalPaths ) - haplotypes.add(new String(path.getBases())); - - final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); - splitter.split(); - if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".original.dot"), 0); - if ( PRINT_GRAPHS ) splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".split.dot"), 0); - splitter.updateGraph(top, bot); - if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".updated.dot"), 0); - - final List> splitPaths = new KBestPaths().getKBestPaths(graph); - for ( final Path path : splitPaths ) { - final String h = new String(path.getBases()); - Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h); - } - - if ( splitPaths.size() == originalPaths.size() ) { - for ( int i = 0; i < originalPaths.size(); i++ ) { - Assert.assertTrue(splitPaths.get(i).equalScoreAndSequence(originalPaths.get(i)), "Paths not equal " + 
splitPaths.get(i) + " vs. original " + originalPaths.get(i)); - } - } - } - - @DataProvider(name = "MeetsMinSequenceData") - public Object[][] makeMeetsMinSequenceData() { - List tests = new ArrayList(); - - final boolean prefixBiased = SharedVertexSequenceSplitter.prefersPrefixMerging(); - tests.add(new Object[]{Arrays.asList("AC", "AC"), 0, true, true}); - tests.add(new Object[]{Arrays.asList("AC", "AC"), 1, prefixBiased, ! prefixBiased}); - tests.add(new Object[]{Arrays.asList("AC", "AC"), 2, prefixBiased, ! prefixBiased}); - tests.add(new Object[]{Arrays.asList("AC", "AC"), 3, false, false}); - tests.add(new Object[]{Arrays.asList("A", "AC"), 1, true, false}); - tests.add(new Object[]{Arrays.asList("A", "AC"), 2, false, false}); - tests.add(new Object[]{Arrays.asList("AT", "AC"), 1, true, false}); - tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 1, true, false}); - tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 2, true, false}); - tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 3, false, false}); - tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 1, true, true}); - tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 2, true, true}); - tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 3, false, true}); - tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 4, false, false}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "MeetsMinSequenceData") - public void testSplitterCompleteCycle(final List mids, final int minSeqLength, final boolean prefixMeets, final boolean suffixMeets) { - final SeqGraph graph = new SeqGraph(11); - - final SeqVertex top = new SeqVertex("AAAAAAAA"); - final SeqVertex bot = new SeqVertex("GGGGGGGG"); - final List v = new ArrayList(); - for ( final String s : mids ) { v.add(new SeqVertex(s)); } - graph.addVertices(v.toArray(new SeqVertex[]{})); - graph.addVertices(top, bot); - for ( final SeqVertex vi : v ) { graph.addEdge(top, vi); graph.addEdge(vi, bot); } - - final 
SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); - Assert.assertEquals(splitter.meetsMinMergableSequenceForPrefix(minSeqLength), prefixMeets, "Prefix failed"); - Assert.assertEquals(splitter.meetsMinMergableSequenceForSuffix(minSeqLength), suffixMeets, "Suffix failed"); - Assert.assertEquals(splitter.meetsMinMergableSequenceForEitherPrefixOrSuffix(minSeqLength), suffixMeets || prefixMeets, "Either prefix or suffix failed"); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java deleted file mode 100644 index 9172b6454..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java +++ /dev/null @@ -1,214 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.*; - -public class ReadThreadingAssemblerUnitTest extends BaseTest { - private final static boolean DEBUG = false; - - private static class TestAssembler { - final ReadThreadingAssembler assembler; - - Haplotype refHaplotype; - final List reads = new LinkedList(); - - private TestAssembler(final int kmerSize) { - this.assembler = new ReadThreadingAssembler(100000, Arrays.asList(kmerSize)); - assembler.setJustReturnRawGraph(true); - assembler.setPruneFactor(0); - } - - public void addSequence(final byte[] bases, final boolean isRef) { - if ( isRef ) { - refHaplotype = new Haplotype(bases, true); - } else { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, Utils.dupBytes((byte)30,bases.length), bases.length + "M"); - reads.add(read); - } - } - - public SeqGraph assemble() { - assembler.removePathsNotConnectedToRef = false; // needed to pass some of the tests - assembler.setRecoverDanglingTails(false); // needed to pass some of the tests - assembler.setDebugGraphTransformations(true); - final SeqGraph graph = assembler.assemble(reads, refHaplotype, Collections.emptyList()).get(0).getGraph(); - if ( DEBUG ) graph.printGraph(new File("test.dot"), 0); - return graph; - } - } - - private void assertLinearGraph(final TestAssembler assembler, final String seq) { - final SeqGraph graph = assembler.assemble(); - graph.simplifyGraph(); - Assert.assertEquals(graph.vertexSet().size(), 1); - 
Assert.assertEquals(graph.vertexSet().iterator().next().getSequenceString(), seq); - } - - private void assertSingleBubble(final TestAssembler assembler, final String one, final String two) { - final SeqGraph graph = assembler.assemble(); - graph.simplifyGraph(); - List> paths = new KBestPaths().getKBestPaths(graph); - Assert.assertEquals(paths.size(), 2); - final Set expected = new HashSet(Arrays.asList(one, two)); - for ( final Path path : paths ) { - final String seq = new String(path.getBases()); - Assert.assertTrue(expected.contains(seq)); - expected.remove(seq); - } - } - - @Test(enabled = ! DEBUG) - public void testRefCreation() { - final String ref = "ACGTAACCGGTT"; - final TestAssembler assembler = new TestAssembler(3); - assembler.addSequence(ref.getBytes(), true); - assertLinearGraph(assembler, ref); - } - - @Test(enabled = ! DEBUG) - public void testRefNonUniqueCreation() { - final String ref = "GAAAAT"; - final TestAssembler assembler = new TestAssembler(3); - assembler.addSequence(ref.getBytes(), true); - assertLinearGraph(assembler, ref); - } - - @Test(enabled = ! DEBUG) - public void testRefAltCreation() { - final TestAssembler assembler = new TestAssembler(3); - final String ref = "ACAACTGA"; - final String alt = "ACAGCTGA"; - assembler.addSequence(ref.getBytes(), true); - assembler.addSequence(alt.getBytes(), false); - assertSingleBubble(assembler, ref, alt); - } - - @Test(enabled = ! DEBUG) - public void testPartialReadsCreation() { - final TestAssembler assembler = new TestAssembler(3); - final String ref = "ACAACTGA"; - final String alt1 = "ACAGCT"; - final String alt2 = "GCTGA"; - assembler.addSequence(ref.getBytes(), true); - assembler.addSequence(alt1.getBytes(), false); - assembler.addSequence(alt2.getBytes(), false); - assertSingleBubble(assembler, ref, "ACAGCTGA"); - } - - @Test(enabled = ! 
DEBUG) - public void testStartInMiddle() { - final TestAssembler assembler = new TestAssembler(3); - final String ref = "CAAAATG"; - final String read = "AAATG"; - assembler.addSequence(ref.getBytes(), true); - assembler.addSequence(read.getBytes(), false); - assertLinearGraph(assembler, ref); - } - - @Test(enabled = ! DEBUG) - public void testStartInMiddleWithBubble() { - final TestAssembler assembler = new TestAssembler(3); - final String ref = "CAAAATGGGG"; - final String read = "AAATCGGG"; - assembler.addSequence(ref.getBytes(), true); - assembler.addSequence(read.getBytes(), false); - assertSingleBubble(assembler, ref, "CAAAATCGGG"); - } - - @Test(enabled = ! DEBUG) - public void testNoGoodStarts() { - final TestAssembler assembler = new TestAssembler(3); - final String ref = "CAAAATGGGG"; - final String read = "AAATCGGG"; - assembler.addSequence(ref.getBytes(), true); - assembler.addSequence(read.getBytes(), false); - assertSingleBubble(assembler, ref, "CAAAATCGGG"); - } - - - @Test(enabled = !DEBUG) - public void testCreateWithBasesBeforeRefSource() { - final TestAssembler assembler = new TestAssembler(3); - final String ref = "ACTG"; - final String read = "CTGGGACT"; - assembler.addSequence(ReadThreadingGraphUnitTest.getBytes(ref), true); - assembler.addSequence(ReadThreadingGraphUnitTest.getBytes(read), false); - assertLinearGraph(assembler, "ACTGGGACT"); - } - - @Test(enabled = !DEBUG) - public void testSingleIndelAsDoubleIndel3Reads() { - final TestAssembler assembler = new TestAssembler(25); - // The single indel spans two repetitive structures - final String ref = "GTTTTTCCTAGGCAAATGGTTTCTATAAAATTATGTGTGTGTGTCTCTCTCTGTGTGTGTGTGTGTGTGTGTGTGTATACCTAATCTCACACTCTTTTTTCTGG"; - final String read1 = "GTTTTTCCTAGGCAAATGGTTTCTATAAAATTATGTGTGTGTGTCTCT----------GTGTGTGTGTGTGTGTGTATACCTAATCTCACACTCTTTTTTCTGG"; - final String read2 = "GTTTTTCCTAGGCAAATGGTTTCTATAAAATTATGTGTGTGTGTCTCT----------GTGTGTGTGTGTGTGTGTATACCTAATCTCACACTCTTTTTTCTGG"; - 
assembler.addSequence(ReadThreadingGraphUnitTest.getBytes(ref), true); - assembler.addSequence(ReadThreadingGraphUnitTest.getBytes(read1), false); - assembler.addSequence(ReadThreadingGraphUnitTest.getBytes(read2), false); - - final SeqGraph graph = assembler.assemble(); - final KBestPaths pathFinder = new KBestPaths(); - final List> paths = pathFinder.getKBestPaths(graph); - Assert.assertEquals(paths.size(), 2); - final byte[] refPath = paths.get(0).getBases().length == ref.length() ? paths.get(0).getBases() : paths.get(1).getBases(); - final byte[] altPath = paths.get(0).getBases().length == ref.length() ? paths.get(1).getBases() : paths.get(0).getBases(); - Assert.assertEquals(refPath, ReadThreadingGraphUnitTest.getBytes(ref)); - Assert.assertEquals(altPath, ReadThreadingGraphUnitTest.getBytes(read1)); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java deleted file mode 100644 index 0d9c07251..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java +++ /dev/null @@ -1,344 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.*; - -public class ReadThreadingGraphUnitTest extends BaseTest { - private final static boolean DEBUG = false; - - public static byte[] getBytes(final String alignment) { - return alignment.replace("-","").getBytes(); - } - - private void assertNonUniques(final ReadThreadingGraph assembler, String ... 
nonUniques) { - final Set actual = new HashSet<>(); - assembler.buildGraphIfNecessary(); - for ( final Kmer kmer : assembler.getNonUniqueKmers() ) actual.add(kmer.baseString()); - final Set expected = new HashSet<>(Arrays.asList(nonUniques)); - Assert.assertEquals(actual, expected); - } - - @Test - public void testSimpleHaplotypeRethreading() { - final ReadThreadingGraph assembler = new ReadThreadingGraph(11); - final String ref = "CATGCACTTTAAAACTTGCCTTTTTAACAAGACTTCCAGATG"; - final String alt = "CATGCACTTTAAAACTTGCCGTTTTAACAAGACTTCCAGATG"; - assembler.addSequence("anonymous", getBytes(ref), null, true); - assembler.addSequence("anonymous", getBytes(alt), null, false); - assembler.buildGraphIfNecessary(); - Assert.assertNotEquals(ref.length() - 11 + 1,assembler.vertexSet().size(),"the number of vertex in the graph is the same as if there was no alternative sequence"); - Assert.assertEquals(ref.length() - 11 + 1 + 11,assembler.vertexSet().size(),"the number of vertex in the graph is not the same as if there is an alternative sequence"); - MultiDeBruijnVertex startAlt = assembler.findKmer(new Kmer(alt.getBytes(),20,11)); - Assert.assertNotNull(startAlt); - } - - @Test(enabled = ! DEBUG) - public void testNonUniqueMiddle() { - final ReadThreadingGraph assembler = new ReadThreadingGraph(3); - final String ref = "GACACACAGTCA"; - final String read1 = "GACAC---GTCA"; - final String read2 = "CAC---GTCA"; - assembler.addSequence(getBytes(ref), true); - assembler.addSequence(getBytes(read1), false); - assembler.addSequence(getBytes(read2), false); - assertNonUniques(assembler, "ACA", "CAC"); - } - - @Test(enabled = ! 
DEBUG) - public void testReadsCreateNonUnique() { - final ReadThreadingGraph assembler = new ReadThreadingGraph(3); - final String ref = "GCAC--GTCA"; // CAC is unique - final String read1 = "GCACACGTCA"; // makes CAC non unique because it has a duplication - final String read2 = "CACGTCA"; // shouldn't be allowed to match CAC as start - assembler.addSequence(getBytes(ref), true); - assembler.addSequence(getBytes(read1), false); - assembler.addSequence(getBytes(read2), false); -// assembler.convertToSequenceGraph().printGraph(new File("test.dot"), 0); - - assertNonUniques(assembler, "CAC"); - //assertSingleBubble(assembler, ref, "CAAAATCGGG"); - } - - @Test(enabled = ! DEBUG) - public void testCountingOfStartEdges() { - final ReadThreadingGraph assembler = new ReadThreadingGraph(3); - final String ref = "NNNGTCAAA"; // ref has some bases before start - final String read1 = "GTCAAA"; // starts at first non N base - - assembler.addSequence(getBytes(ref), true); - assembler.addSequence(getBytes(read1), false); - assembler.buildGraphIfNecessary(); -// assembler.printGraph(new File("test.dot"), 0); - - for ( final MultiSampleEdge edge : assembler.edgeSet() ) { - final MultiDeBruijnVertex source = assembler.getEdgeSource(edge); - final MultiDeBruijnVertex target = assembler.getEdgeTarget(edge); - final boolean headerVertex = source.getSuffix() == 'N' || target.getSuffix() == 'N'; - if ( headerVertex ) { - Assert.assertEquals(edge.getMultiplicity(), 1, "Bases in the unique reference header should have multiplicity of 1"); - } else { - Assert.assertEquals(edge.getMultiplicity(), 2, "Should have multiplicity of 2 for any edge outside the ref header but got " + edge + " " + source + " -> " + target); - } - } - } - - @Test(enabled = !DEBUG) - public void testCountingOfStartEdgesWithMultiplePrefixes() { - final ReadThreadingGraph assembler = new ReadThreadingGraph(3); - assembler.increaseCountsThroughBranches = true; - final String ref = "NNNGTCAXX"; // ref has some bases 
before start - final String alt1 = "NNNCTCAXX"; // alt1 has SNP right after N - final String read = "TCAXX"; // starts right after SNP, but merges right before branch - - assembler.addSequence(getBytes(ref), true); - assembler.addSequence(getBytes(alt1), false); - assembler.addSequence(getBytes(read), false); - assembler.buildGraphIfNecessary(); - assembler.printGraph(new File("test.dot"), 0); - - final List oneCountVertices = Arrays.asList("NNN", "NNG", "NNC", "NGT", "NCT"); - final List threeCountVertices = Arrays.asList("CAX", "AXX"); - - for ( final MultiSampleEdge edge : assembler.edgeSet() ) { - final MultiDeBruijnVertex source = assembler.getEdgeSource(edge); - final MultiDeBruijnVertex target = assembler.getEdgeTarget(edge); - final int expected = oneCountVertices.contains(target.getSequenceString()) ? 1 : (threeCountVertices.contains(target.getSequenceString()) ? 3 : 2); - Assert.assertEquals(edge.getMultiplicity(), expected, "Bases at edge " + edge + " from " + source + " to " + target + " has bad multiplicity"); - } - } - - @Test(enabled = !DEBUG) - public void testCyclesInGraph() { - - // b37 20:12655200-12655850 - final String ref = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTATACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG"; - - // SNP at 20:12655528 creates a cycle for small kmers - final String alt = 
"CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTGTACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG"; - - final List reads = new ArrayList<>(); - for ( int index = 0; index < alt.length() - 100; index += 20 ) - reads.add(ArtificialSAMUtils.createArtificialRead(Arrays.copyOfRange(alt.getBytes(), index, index + 100), Utils.dupBytes((byte) 30, 100), 100 + "M")); - - // test that there are cycles detected for small kmer - final ReadThreadingGraph rtgraph25 = new ReadThreadingGraph(25); - rtgraph25.addSequence("ref", ref.getBytes(), null, true); - for ( final GATKSAMRecord read : reads ) - rtgraph25.addRead(read); - rtgraph25.buildGraphIfNecessary(); - Assert.assertTrue(rtgraph25.hasCycles()); - - // test that there are no cycles detected for large kmer - final ReadThreadingGraph rtgraph75 = new ReadThreadingGraph(75); - rtgraph75.addSequence("ref", ref.getBytes(), null, true); - for ( final GATKSAMRecord read : reads ) - rtgraph75.addRead(read); - rtgraph75.buildGraphIfNecessary(); - Assert.assertFalse(rtgraph75.hasCycles()); - } - - @Test(enabled = !DEBUG) - public void testNsInReadsAreNotUsedForGraph() { - - final int length = 100; - final byte[] ref = Utils.dupBytes((byte)'A', length); - - final ReadThreadingGraph rtgraph = new ReadThreadingGraph(25); - rtgraph.addSequence("ref", ref, null, true); - - // add reads with Ns at any position - for ( int i = 0; i < length; i++ ) { - final byte[] bases = ref.clone(); - bases[i] = 
'N'; - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, Utils.dupBytes((byte) 30, length), length + "M"); - rtgraph.addRead(read); - } - rtgraph.buildGraphIfNecessary(); - - final SeqGraph graph = rtgraph.convertToSequenceGraph(); - final KBestPaths pathFinder = new KBestPaths<>(false); - Assert.assertEquals(pathFinder.getKBestPaths(graph, length, graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex()).size(), 1); - } - - @DataProvider(name = "DanglingTails") - public Object[][] makeDanglingTailsData() { - List tests = new ArrayList(); - - // add 1M to the expected CIGAR because it includes the previous (common) base too - tests.add(new Object[]{"AAAAAAAAAA", "CAAA", "5M", true, 3}); // incomplete haplotype - tests.add(new Object[]{"AAAAAAAAAA", "CAAAAAAAAAA", "1M1I10M", true, 10}); // insertion - tests.add(new Object[]{"CCAAAAAAAAAA", "AAAAAAAAAA", "1M2D10M", true, 10}); // deletion - tests.add(new Object[]{"AAAAAAAA", "CAAAAAAA", "9M", true, 7}); // 1 snp - tests.add(new Object[]{"AAAAAAAA", "CAAGATAA", "9M", true, 2}); // several snps - tests.add(new Object[]{"AAAAA", "C", "1M4D1M", false, -1}); // funky SW alignment - tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", false, 1}); // very little data - tests.add(new Object[]{"AAAAAAA", "CAAAAAC", "8M", true, -1}); // ends in mismatch - tests.add(new Object[]{"AAAAAA", "CGAAAACGAA", "1M2I4M2I2M", false, 0}); // alignment is too complex - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "DanglingTails", enabled = !DEBUG) - public void testDanglingTails(final String refEnd, - final String altEnd, - final String cigar, - final boolean cigarIsGood, - final int mergePointDistanceFromSink) { - - final int kmerSize = 15; - - // construct the haplotypes - final String commonPrefix = "AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT"; - final String ref = commonPrefix + refEnd; - final String alt = commonPrefix + altEnd; - - // create the graph and populate it - final 
ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize); - rtgraph.addSequence("ref", ref.getBytes(), null, true); - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M"); - rtgraph.addRead(read); - rtgraph.buildGraphIfNecessary(); - - // confirm that we have just a single dangling tail - MultiDeBruijnVertex altSink = null; - for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) { - if ( rtgraph.isSink(v) && !rtgraph.isReferenceNode(v) ) { - Assert.assertTrue(altSink == null, "We found more than one non-reference sink"); - altSink = v; - } - } - - Assert.assertTrue(altSink != null, "We did not find a non-reference sink"); - - // confirm that the SW alignment agrees with our expectations - final ReadThreadingGraph.DanglingTailMergeResult result = rtgraph.generateCigarAgainstReferencePath(altSink, 0); - - if ( result == null ) { - Assert.assertFalse(cigarIsGood); - return; - } - - Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString()); - - // confirm that the goodness of the cigar agrees with our expectations - Assert.assertEquals(rtgraph.cigarIsOkayToMerge(result.cigar), cigarIsGood); - - // confirm that the tail merging works as expected - if ( cigarIsGood ) { - final int mergeResult = rtgraph.mergeDanglingTail(result); - Assert.assertTrue(mergeResult == 1 || mergePointDistanceFromSink == -1); - - // confirm that we created the appropriate edge - if ( mergePointDistanceFromSink >= 0 ) { - MultiDeBruijnVertex v = altSink; - for ( int i = 0; i < mergePointDistanceFromSink; i++ ) { - if ( rtgraph.inDegreeOf(v) != 1 ) - Assert.fail("Encountered vertex with multiple sources"); - v = rtgraph.getEdgeSource(rtgraph.incomingEdgeOf(v)); - } - Assert.assertTrue(rtgraph.outDegreeOf(v) > 1); - } - } - } - - -// TODO -- update to use determineKmerSizeAndNonUniques directly -// @DataProvider(name = "KmerSizeData") -// public 
Object[][] makeKmerSizeDataProvider() { -// List tests = new ArrayList(); -// -// // this functionality can be adapted to provide input data for whatever you might want in your data -// tests.add(new Object[]{3, 3, 3, Arrays.asList("ACG"), Arrays.asList()}); -// tests.add(new Object[]{3, 4, 3, Arrays.asList("CAGACG"), Arrays.asList()}); -// -// tests.add(new Object[]{3, 3, 3, Arrays.asList("AAAAC"), Arrays.asList("AAA")}); -// tests.add(new Object[]{3, 4, 4, Arrays.asList("AAAAC"), Arrays.asList()}); -// tests.add(new Object[]{3, 5, 4, Arrays.asList("AAAAC"), Arrays.asList()}); -// tests.add(new Object[]{3, 4, 3, Arrays.asList("CAAA"), Arrays.asList()}); -// tests.add(new Object[]{3, 4, 4, Arrays.asList("CAAAA"), Arrays.asList()}); -// tests.add(new Object[]{3, 5, 4, Arrays.asList("CAAAA"), Arrays.asList()}); -// tests.add(new Object[]{3, 5, 5, Arrays.asList("ACGAAAAACG"), Arrays.asList()}); -// -// for ( int maxSize = 3; maxSize < 20; maxSize++ ) { -// for ( int dupSize = 3; dupSize < 20; dupSize++ ) { -// final int expectedSize = Math.min(maxSize, dupSize); -// final String dup = Utils.dupString("C", dupSize); -// final List nonUnique = dupSize > maxSize ? 
Arrays.asList(Utils.dupString("C", maxSize)) : Collections.emptyList(); -// tests.add(new Object[]{3, maxSize, expectedSize, Arrays.asList("ACGT", "A" + dup + "GT"), nonUnique}); -// tests.add(new Object[]{3, maxSize, expectedSize, Arrays.asList("A" + dup + "GT", "ACGT"), nonUnique}); -// } -// } -// -// return tests.toArray(new Object[][]{}); -// } -// -// /** -// * Example testng test using MyDataProvider -// */ -// @Test(dataProvider = "KmerSizeData") -// public void testDynamicKmerSizing(final int min, final int max, final int expectKmer, final List seqs, final List expectedNonUniques) { -// final ReadThreadingGraph assembler = new ReadThreadingGraph(min, max); -// for ( String seq : seqs ) assembler.addSequence(seq.getBytes(), false); -// assembler.buildGraphIfNecessary(); -// Assert.assertEquals(assembler.getKmerSize(), expectKmer); -// assertNonUniques(assembler, expectedNonUniques.toArray(new String[]{})); -// } - - -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java deleted file mode 100644 index 7c3160c30..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java +++ /dev/null @@ -1,80 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.Utils; -import org.testng.Assert; -import org.testng.annotations.Test; - -public class SequenceForKmersUnitTest extends BaseTest { - @Test - public void testNoCount() { - final byte[] seq = "ACGT".getBytes(); - final SequenceForKmers sk = new SequenceForKmers("foo", seq, 0, seq.length, null, true); - Assert.assertEquals(sk.name, "foo"); - Assert.assertEquals(sk.sequence, seq); - Assert.assertEquals(sk.start, 0); - Assert.assertEquals(sk.stop, seq.length); - Assert.assertEquals(sk.isRef, true); - for ( int i = 0; i < seq.length; i++ ) - Assert.assertEquals(sk.getCount(i), 1); - } - - @Test - public void testWithCounts() { - final int len = 256; - final int[] counts = new int[len]; - for ( int i = 0; i < len; i++ ) counts[i] = i; - final byte[] seq = Utils.dupBytes((byte)'A', len); - - final SequenceForKmers sk = new SequenceForKmers("foo", seq, 0, seq.length, counts, true); - - for ( int i = 0; i < seq.length; i++ ) - Assert.assertEquals(sk.getCount(i), i); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java deleted file mode 100644 index 38c06c25f..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java +++ /dev/null @@ -1,133 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.indels; - - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; - -public class PairHMMIndelErrorModelUnitTest extends BaseTest { - - private SAMFileHeader header; - - @BeforeClass - public void setup() throws FileNotFoundException { - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); - header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); - } - - private static final int refWindowStart = 1000; - private static final int refWindowEnd = 1100; - - @DataProvider(name = "ClipUpstreamProvider") - public Object[][] ClipUpstreamTestData() { - List tests = new ArrayList(); - - for ( final int readStart : Arrays.asList(900, 950, 990, 1000) ) { - for ( final int readLength : Arrays.asList(10, 50, 100) ) { - for ( final int delLength : Arrays.asList(0, 5, 10) ) { - tests.add(new Object[]{readStart, readLength, delLength}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ClipUpstreamProvider", enabled = true) - public void clipUpstreamTest(final int readStart, final int readLength, final int delLength) { - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, readStart, readLength); - if ( delLength == 0 ) - read.setCigarString(readLength + "M"); - else - read.setCigarString((readLength / 2) + "M" + delLength + "D" + (readLength / 2) + "M"); - - final 
boolean result = PairHMMIndelErrorModel.mustClipUpstream(read, refWindowStart); - Assert.assertEquals(result, read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart); - } - - @DataProvider(name = "ClipDownstreamProvider") - public Object[][] ClipDownstreamTestData() { - List tests = new ArrayList(); - - for ( final int readStart : Arrays.asList(1000, 1050, 1090, 1100) ) { - for ( final int readLength : Arrays.asList(10, 50, 100) ) { - for ( final int delLength : Arrays.asList(0, 5, 10) ) { - tests.add(new Object[]{readStart, readLength, delLength}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ClipDownstreamProvider", enabled = true) - public void clipDownstreamTest(final int readStart, final int readLength, final int delLength) { - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, readStart, readLength); - if ( delLength == 0 ) - read.setCigarString(readLength + "M"); - else - read.setCigarString((readLength / 2) + "M" + delLength + "D" + (readLength / 2) + "M"); - - final boolean result = PairHMMIndelErrorModel.mustClipDownstream(read, refWindowEnd); - Assert.assertEquals(result, read.getSoftStart() < refWindowEnd && read.getSoftStart() + readLength > refWindowEnd); - } -} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java deleted file mode 100644 index 9759004a0..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java +++ /dev/null @@ -1,149 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class ReadBackedPhasingIntegrationTest extends WalkerTest { - - public static String baseTestString(String reference, String reads, String VCF, int cacheWindowSize, int maxPhaseSites, double phaseQualityThresh) { - return "-T ReadBackedPhasing" + - " -R " + reference + - " -I " + validationDataLocation + reads + - " --variant " + ( VCF.contains("phasing_test") ? 
privateTestDir : validationDataLocation) + VCF + - " --cacheWindowSize " + cacheWindowSize + - " --maxPhaseSites " + maxPhaseSites + - " --phaseQualityThresh " + phaseQualityThresh + - " -o %s" + - " --no_cmdline_in_header"; - } - - - @Test - public void test1() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) - + " -L chr20:332341-382503", - 1, - Arrays.asList("1c9a7fe4db41864cd85d16e5cf88986c")); - executeTest("MAX 10 het sites [TEST ONE]; require PQ >= 10", spec); - } - - @Test - public void test2() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) - + " -L chr20:1232503-1332503", - 1, - Arrays.asList("a3ca151145379e0d4bae64a91165ea0b")); - executeTest("MAX 10 het sites [TEST TWO]; require PQ >= 10", spec); - } - - @Test - public void test3() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 2, 30) - + " -L chr20:332341-382503", - 1, - Arrays.asList("f685803333123a102ce1851d984cbd10")); - executeTest("MAX 2 het sites [TEST THREE]; require PQ >= 30", spec); - } - - @Test - public void test4() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 5, 100) - + " -L chr20:332341-382503", - 1, - Arrays.asList("aaa7c25d118383639f273128d241e140")); - executeTest("MAX 5 het sites [TEST FOUR]; require PQ >= 100", spec); - } - - @Test - public void test5() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 1000, 7, 10) - + " -L chr20:332341-482503", - 1, - 
Arrays.asList("418e29400762972e77bae4f73e16befe")); - executeTest("MAX 7 het sites [TEST FIVE]; require PQ >= 10; cacheWindow = 1000", spec); - } - - @Test - public void test6() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) - + " -L chr20:652810-681757", - 1, - Arrays.asList("4c8f6190ecc86766baba3aba08542991")); - executeTest("MAX 10 het sites [TEST SIX]; require PQ >= 10; cacheWindow = 20000; has inconsistent sites", spec); - } - - @Test - public void test7() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10) - + " -L chr20:332341-802503", - 1, - Arrays.asList("44eb225ab3167651ec0a9e1fdcc83d34")); - executeTest("Use trio-phased VCF, but ignore its phasing [TEST SEVEN]", spec); - } - - @Test - public void test8() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10) - + " -L chr20:332341-802503" + " -respectPhaseInInput", - 1, - Arrays.asList("e3549b89d49092e73cc6eb21f233471c")); - executeTest("Use trio-phased VCF, and respect its phasing [TEST EIGHT]", spec); - } - -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java deleted file mode 100644 index 754fe30a2..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java +++ /dev/null @@ -1,145 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made 
between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.variantrecalibration; - -import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.sting.BaseTest; -import org.junit.Assert; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.List; - -/** - * Created with IntelliJ IDEA. 
- * User: rpoplin - * Date: 7/25/13 - */ - -public class VariantDataManagerUnitTest extends BaseTest { - - @Test - public final void testCalculateSortOrder() { - final double passingQual = 400.0; - final VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); - - VariantDataManager vdm = new VariantDataManager(new ArrayList(), VRAC); - - final List theData = new ArrayList<>(); - final VariantDatum datum1 = new VariantDatum(); - datum1.atTrainingSite = true; - datum1.failingSTDThreshold = false; - datum1.originalQual = passingQual; - datum1.annotations = new double[]{0.0,-10.0,10.0}; - datum1.isNull = new boolean[]{false, false, false}; - theData.add(datum1); - - final VariantDatum datum2 = new VariantDatum(); - datum2.atTrainingSite = true; - datum2.failingSTDThreshold = false; - datum2.originalQual = passingQual; - datum2.annotations = new double[]{0.0,-9.0,15.0}; - datum2.isNull = new boolean[]{false, false, false}; - theData.add(datum2); - - final VariantDatum datum3 = new VariantDatum(); - datum3.atTrainingSite = false; - datum3.failingSTDThreshold = false; - datum3.originalQual = passingQual; - datum3.annotations = new double[]{0.0,1.0,999.0}; - datum3.isNull = new boolean[]{false, false, false}; - theData.add(datum3); - - final VariantDatum datum4 = new VariantDatum(); - datum4.atTrainingSite = false; - datum4.failingSTDThreshold = false; - datum4.originalQual = passingQual; - datum4.annotations = new double[]{0.015,2.0,1001.11}; - datum4.isNull = new boolean[]{false, false, false}; - theData.add(datum4); - - vdm.setData(theData); - - final double[] meanVector = new double[3]; - for( int iii = 0; iii < meanVector.length; iii++ ) { - meanVector[iii] = vdm.mean(iii, true); - } - final List order = vdm.calculateSortOrder(meanVector); - Assert.assertArrayEquals(new int[]{2,1,0}, ArrayUtils.toPrimitive(order.toArray(new Integer[order.size()]))); - } - - @Test - public final void testDownSamplingTrainingData() { - final int 
MAX_NUM_TRAINING_DATA = 5000; - final double passingQual = 400.0; - final VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); - VRAC.MAX_NUM_TRAINING_DATA = MAX_NUM_TRAINING_DATA; - - VariantDataManager vdm = new VariantDataManager(new ArrayList(), VRAC); - final List theData = new ArrayList<>(); - for( int iii = 0; iii < MAX_NUM_TRAINING_DATA * 10; iii++) { - final VariantDatum datum = new VariantDatum(); - datum.atTrainingSite = true; - datum.failingSTDThreshold = false; - datum.originalQual = passingQual; - theData.add(datum); - } - - for( int iii = 0; iii < MAX_NUM_TRAINING_DATA * 2; iii++) { - final VariantDatum datum = new VariantDatum(); - datum.atTrainingSite = false; - datum.failingSTDThreshold = false; - datum.originalQual = passingQual; - theData.add(datum); - } - - vdm.setData(theData); - final List trainingData = vdm.getTrainingData(); - - Assert.assertTrue( trainingData.size() == MAX_NUM_TRAINING_DATA ); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java deleted file mode 100644 index f3e57b48a..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ /dev/null @@ -1,273 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.variantrecalibration; - -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFCodec; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; -import java.util.List; - -public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { - private static class VRTest { - String inVCF; - String tranchesMD5; - String recalMD5; - String cutVCFMD5; - public VRTest(String inVCF, String tranchesMD5, String recalMD5, String cutVCFMD5) { - this.inVCF = inVCF; - this.tranchesMD5 = tranchesMD5; - this.recalMD5 = recalMD5; - this.cutVCFMD5 = cutVCFMD5; - } - - @Override - public String toString() { - return "VRTest{inVCF='" + inVCF +"'}"; - } - } - - VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", - "6f029dc7d16e63e19c006613cd0a5cff", // tranches - "73c7897441622c9b37376eb4f071c560", // recal file - "11a28df79b92229bd317ac49a3ed0fa1"); // cut VCF - - 
@DataProvider(name = "VRTest") - public Object[][] createData1() { - return new Object[][]{ {lowPass} }; - //return new Object[][]{ {yriTrio}, {lowPass} }; // Add hg19 chr20 trio calls here - } - - @Test(dataProvider = "VRTest") - public void testVariantRecalibrator(VRTest params) { - //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b37KGReference + - " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + - " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + - " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + - " -T VariantRecalibrator" + - " -input " + params.inVCF + - " -L 20:1,000,000-40,000,000" + - " --no_cmdline_in_header" + - " -an QD -an HaplotypeScore -an HRun" + - " --trustAllPolymorphic" + // for speed - " -recalFile %s" + - " -tranchesFile %s", - Arrays.asList(params.recalMD5, params.tranchesMD5)); - executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst(); - } - - @Test(dataProvider = "VRTest",dependsOnMethods="testVariantRecalibrator") - public void testApplyRecalibration(VRTest params) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b37KGReference + - " -T ApplyRecalibration" + - " -L 20:12,000,000-30,000,000" + - " --no_cmdline_in_header" + - " -input " + params.inVCF + - " -U LENIENT_VCF_PROCESSING -o %s" + - " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + - " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), - Arrays.asList(params.cutVCFMD5)); - spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles - executeTest("testApplyRecalibration-"+params.inVCF, spec); - } - - VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf", - 
"3ad7f55fb3b072f373cbce0b32b66df4", // tranches - "e747c08131d58d9a4800720f6ca80e0c", // recal file - "e5808af3af0f2611ba5a3d172ab2557b"); // cut VCF - - @DataProvider(name = "VRBCFTest") - public Object[][] createVRBCFTest() { - return new Object[][]{ {bcfTest} }; - //return new Object[][]{ {yriTrio}, {lowPass} }; // Add hg19 chr20 trio calls here - } - - @Test(dataProvider = "VRBCFTest") - public void testVariantRecalibratorWithBCF(VRTest params) { - //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b37KGReference + - " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + - " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + - " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + - " -T VariantRecalibrator" + - " -input " + params.inVCF + - " -L 20:10,000,000-20,000,000" + - " --no_cmdline_in_header" + - " -an AC " + // integer value - " -an QD -an ReadPosRankSum -an FS -an InbreedingCoeff " + // floats value - " -mG 2 "+ - " -recalFile %s" + - " -tranchesFile %s", - 2, - Arrays.asList("bcf", "txt"), - Arrays.asList(params.recalMD5, params.tranchesMD5)); - executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst(); - } - - @Test(dataProvider = "VRBCFTest", dependsOnMethods="testVariantRecalibratorWithBCF") - public void testApplyRecalibrationWithBCF(VRTest params) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b37KGReference + - " -T ApplyRecalibration" + - " -L 20:10,000,000-20,000,000" + - " --no_cmdline_in_header" + - " -input " + params.inVCF + - " -U LENIENT_VCF_PROCESSING -o %s" + - " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + - " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), - 
Arrays.asList(params.cutVCFMD5)); - spec.disableShadowBCF(); - executeTest("testApplyRecalibration-"+params.inVCF, spec); - } - - - VRTest indelUnfiltered = new VRTest( - validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as . - "9a331328370889168a7aa3a625f73620", // tranches - "2cbbd146d68c40200b782e0226f71976", // recal file - "64dd98a5ab80cf5fd9a36eb66b38268e"); // cut VCF - - VRTest indelFiltered = new VRTest( - validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS - "9a331328370889168a7aa3a625f73620", // tranches - "2cbbd146d68c40200b782e0226f71976", // recal file - "c0ec662001e829f5779a9d13b1d77d80"); // cut VCF - - @DataProvider(name = "VRIndelTest") - public Object[][] createTestVariantRecalibratorIndel() { - return new Object[][]{ {indelUnfiltered}, {indelFiltered} }; - } - - @Test(dataProvider = "VRIndelTest") - public void testVariantRecalibratorIndel(VRTest params) { - //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b37KGReference + - " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + - " -resource:training=true,truth=true,prior=15.0 " + comparisonDataLocation + "Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf" + - " -T VariantRecalibrator" + - " -input " + params.inVCF + - " -L 20:1,000,000-40,000,000" + - " --no_cmdline_in_header" + - " -an QD -an ReadPosRankSum -an HaplotypeScore" + - " -mode INDEL -mG 3" + - " --trustAllPolymorphic" + // for speed - " -recalFile %s" + - " -tranchesFile %s", - Arrays.asList(params.recalMD5, params.tranchesMD5)); - executeTest("testVariantRecalibratorIndel-"+params.inVCF, spec).getFirst(); - } - - @Test(dataProvider = "VRIndelTest",dependsOnMethods="testVariantRecalibratorIndel") - public void testApplyRecalibrationIndel(VRTest 
params) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b37KGReference + - " -T ApplyRecalibration" + - " -L 20:12,000,000-30,000,000" + - " -mode INDEL" + - " -U LENIENT_VCF_PROCESSING --no_cmdline_in_header" + - " -input " + params.inVCF + - " -o %s" + - " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + - " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), - Arrays.asList(params.cutVCFMD5)); - spec.disableShadowBCF(); // has to be disabled because the input VCF is missing LowQual annotation - executeTest("testApplyRecalibrationIndel-" + params.inVCF, spec); - } - - @Test - public void testApplyRecalibrationSnpAndIndelTogether() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b37KGReference + - " -T ApplyRecalibration" + - " -L 20:1000100-1000500" + - " -mode BOTH" + - " --no_cmdline_in_header" + - " -input " + privateTestDir + "VQSR.mixedTest.input" + - " -o %s" + - " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" + - " -recalFile " + privateTestDir + "VQSR.mixedTest.recal", - Arrays.asList("03a0ed00af6aac76d39e569f90594a02")); - executeTest("testApplyRecalibrationSnpAndIndelTogether", spec); - } - - @Test(enabled = true) - public void testApplyRecalibrationSnpAndIndelTogetherExcludeFiltered() throws Exception { - final String base = "-R " + b37KGReference + - " -T ApplyRecalibration" + - " -L 20:1000100-1000500" + - " -mode BOTH" + - " --excludeFiltered -ts_filter_level 90.0" + - " --no_cmdline_in_header" + - " -input " + privateTestDir + "VQSR.mixedTest.input" + - " -o %s" + - " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" + - " -recalFile " + privateTestDir + "VQSR.mixedTest.recal"; - - final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); - spec.disableShadowBCF(); - final File VCF = executeTest("testApplyRecalibrationSnpAndIndelTogether", spec).first.get(0); - - for( final VariantContext VC : 
GATKVCFUtils.readAllVCs(VCF, new VCFCodec()).getSecond() ) { - if( VC != null ) { - Assert.assertTrue(VC.isNotFiltered()); // there should only be unfiltered records in the output VCF file - } - } - } -} - diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java deleted file mode 100644 index 2eeb9221e..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ /dev/null @@ -1,220 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; - -/** - * Tests CombineVariants - */ -public class CombineVariantsIntegrationTest extends WalkerTest { - // - // TODO TODO TODO TODO TODO TODO TODO TODO - // TODO TODO TODO TODO TODO TODO TODO TODO - // - // TODO WHEN THE HC EMITS VALID VCF HEADERS ENABLE BCF AND REMOVE lenientVCFProcessing ARGUMENTS - // - // TODO TODO TODO TODO TODO TODO TODO TODO - // TODO TODO TODO TODO TODO TODO TODO TODO - // TODO TODO TODO TODO TODO TODO TODO TODO - // - private static String baseTestString(String args) { - return baseTestString(args, b36KGReference); - } - - private static String baseTestString(String args, String ref) { - return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -R " + ref + args; - //return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -U LENIENT_VCF_PROCESSING -R " + b36KGReference + args; - } - - private void cvExecuteTest(final String name, final WalkerTestSpec spec, final boolean parallel) { - spec.disableShadowBCF(); - if ( parallel ) - executeTestParallel(name, spec); - else - executeTest(name, spec); - } - - public void test1InOut(String file, String md5) { - test1InOut(file, md5, ""); - } - - public void test1InOut(String file, String md5, String args) { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -priority v1 -V:v1 " + validationDataLocation + file + args), - 1, - Arrays.asList(md5)); - cvExecuteTest("testInOut1--" + file, spec, true); - } - - public void combine2(String file1, String file2, String args, String md5) { - combine2(file1, file2, args, md5, true); - } - - public void combine2(String file1, String file2, String args, String md5, final boolean parallel) { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -priority v1,v2 -V:v1 " + validationDataLocation + file1 + " -V:v2 
"+ validationDataLocation + file2 + args), - 1, - Arrays.asList(md5)); - cvExecuteTest("combine2 1:" + new File(file1).getName() + " 2:" + new File(file2).getName(), spec, parallel); - } - - public void combineSites(String args, String md5) { - String file1 = "1000G_omni2.5.b37.sites.vcf"; - String file2 = "hapmap_3.3.b37.sites.vcf"; - WalkerTestSpec spec = new WalkerTestSpec( - "-T CombineVariants --no_cmdline_in_header -o %s -R " + b37KGReference - + " -L 1:1-10,000,000 -V:omni " + validationDataLocation + file1 - + " -V:hm3 " + validationDataLocation + file2 + args, - 1, - Arrays.asList(md5)); - cvExecuteTest("combineSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec, true); - } - - public void combinePLs(String file1, String file2, String md5) { - WalkerTestSpec spec = new WalkerTestSpec( - "-T CombineVariants --no_cmdline_in_header -o %s -R " + b36KGReference + " -priority v1,v2 -V:v1 " + privateTestDir + file1 + " -V:v2 " + privateTestDir + file2, - 1, - Arrays.asList(md5)); - cvExecuteTest("combine PLs 1:" + new File(file1).getName() + " 2:" + new File(file2).getName(), spec, true); - } - - @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "6469fce8a5cd5a0f77e5ac5d9e9e192b", " -U LENIENT_VCF_PROCESSING"); } - @Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "a4cedaa83d54e34cafc3ac4b80acf5b4", " -setKey foo -U LENIENT_VCF_PROCESSING"); } - @Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "ac58a5fde17661e2a19004ca954d9781", " -setKey null -U LENIENT_VCF_PROCESSING"); } - @Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "67a8076e30b4bca0ea5acdc9cd26a4e0"); } // official project VCF files in tabix format - - @Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "909c6dc74eeb5ab86f8e74073eb0c1d6"); } - @Test public void test1Indel2() { 
test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "381875b3280ba56eef0152e56f64f68d"); } - - @Test public void combineWithPLs() { combinePLs("combine.3.vcf", "combine.4.vcf", "f0ce3fb83d4ad9ba402d7cb11cd000c3"); } - - @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "4efdf983918db822e4ac13d911509576"); } // official project VCF files in tabix format - @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "848d4408ee953053d2307cefebc6bd6d"); } // official project VCF files in tabix format - @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "629656bfef7713c23f3a593523503b2f"); } - - @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e54d0dcf14f90d5c8e58b45191dd0219"); } - - @Test public void uniqueSNPs() { - // parallelism must be disabled because the input VCF is malformed (DB=0) and parallelism actually fixes this which breaks the md5s - combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "e5ea6ac3905bd9eeea1a2ef5d2cb5af7", true); - } - - @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "def52bcd3942bbe39cd7ebe845c4f206"); } - @Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "5f61145949180bf2a0cd342d8e064860"); } - - @Test public void threeWayWithRefs() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -V:NA19240_BGI "+validationDataLocation+"NA19240.BGI.RG.vcf" + - " -V:NA19240_ILLUMINA "+validationDataLocation+"NA19240.ILLUMINA.RG.vcf" + - " -V:NA19240_WUGSC 
"+validationDataLocation+"NA19240.WUGSC.RG.vcf" + - " -V:denovoInfo "+validationDataLocation+"yri_merged_validation_data_240610.annotated.b36.vcf" + - " -setKey centerSet" + - " -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED" + - " -U LENIENT_VCF_PROCESSING" + - " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + - " -genotypeMergeOptions UNIQUIFY -L 1"), - 1, - Arrays.asList("58e6281df108c361e99673a501ee4749")); - cvExecuteTest("threeWayWithRefs", spec, true); - } - - // complex examples with filtering, indels, and multiple alleles - public void combineComplexSites(String args, String md5) { - String file1 = "combine.1.vcf"; - String file2 = "combine.2.vcf"; - WalkerTestSpec spec = new WalkerTestSpec( - "-T CombineVariants --no_cmdline_in_header -o %s -R " + b37KGReference - + " -V:one " + privateTestDir + file1 - + " -V:two " + privateTestDir + file2 + args, - 1, - Arrays.asList(md5)); - cvExecuteTest("combineComplexSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec, true); - } - - @Test public void complexTestFull() { combineComplexSites("", "9d989053826ffe5bef7c4e05ac51bcca"); } - @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "4f38d9fd30a7ae83e2a7dec265a28772"); } - @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); } - @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); } - - @Test public void combineSingleSamplePipelineGVCF() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + - " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + - " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + - " -multipleAllelesMergeType MIX_TYPES" + - " --excludeNonVariants -combineAnnotations -setKey null" + - 
" -L 20:10,000,000-10,001,000", b37KGReference), - 1, - Arrays.asList("0413f0725fc5ec3a4f1ee246f6cb3a2a")); - cvExecuteTest("combineSingleSamplePipelineGVCF", spec, true); - } - - @Test - public void combineDBSNPDuplicateSites() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T CombineVariants --no_cmdline_in_header -L 1:902000-903000 -o %s -R " + b37KGReference + " -V:v1 " + b37dbSNP132, - 1, - Arrays.asList("aa926eae333208dc1f41fe69dc95d7a6")); - cvExecuteTest("combineDBSNPDuplicateSites:", spec, true); - } - - @Test - public void combineLeavesUnfilteredRecordsUnfiltered() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T CombineVariants --no_cmdline_in_header -o %s " - + " -R " + b37KGReference - + " -V " + privateTestDir + "combineVariantsLeavesRecordsUnfiltered.vcf", - 1, - Arrays.asList("f8c014d0af7e014475a2a448dc1f9cef")); - cvExecuteTest("combineLeavesUnfilteredRecordsUnfiltered: ", spec, false); - } -} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java deleted file mode 100755 index 50c896450..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java +++ /dev/null @@ -1,740 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import net.sf.picard.reference.ReferenceSequenceFile; -import org.broad.tribble.readers.PositionalBufferedStream; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFCodec; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.StringBufferInputStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -public class ConcordanceMetricsUnitTest extends BaseTest { - - private static ReferenceSequenceFile seq; - private GenomeLocParser genomeLocParser; - - @BeforeClass - public void init() throws FileNotFoundException { - // sequence - seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); - genomeLocParser = new 
GenomeLocParser(seq); - } - public static String HEADER_BASE = "##fileformat=VCFv4.0\n" + - "##filedate=2010-06-21\n"+ - "##reference=NCBI36\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##FILTER=\n"+ - "##FORMAT=\n"+ - "##FORMAT=\n"+ - "##FORMAT=\n" + - "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t"; - public static String TEST_1_HEADER = HEADER_BASE + "test1_sample1\ttest1_sample2\ttest1_sample3\n"; - public static String TEST_2_HEADER = HEADER_BASE + "test2_sample1\ttest2_sample2\n"; - public static String TEST_3_HEADER_1 = HEADER_BASE + "test3_sample1\ttest3_sample2\ttest3_sample3\ttest3_sample4\ttest3_sample5\n"; - public static String TEST_3_HEADER_2 = HEADER_BASE + "test3_sample6\ttest3_sample7\ttest3_sample8\ttest3_sample9\ttest3_sample10\n"; - public static String TEST_3_HEADER_3 = HEADER_BASE + "test3_sample3\ttest3_sample6\ttest3_sample7\ttest3_sample8\ttest3_sample9\ttest3_sample10\n"; - - - private Pair getData1() { - - Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); - Allele alt_C = Allele.create(BaseUtils.Base.C.base); - - Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); - Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_C)); - Genotype sam_1_3_eval = GenotypeBuilder.create("test1_sample3", Arrays.asList(reference_A,alt_C)); - - Genotype sam_1_1_truth = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); - Genotype sam_1_2_truth = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,reference_A)); - Genotype sam_1_3_truth = GenotypeBuilder.create("test1_sample3", Arrays.asList(alt_C,alt_C)); - - GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 3, 3); - VariantContextBuilder eval_1_builder = new VariantContextBuilder(); - VariantContextBuilder truth_1_builder = new VariantContextBuilder(); - - eval_1_builder.alleles(Arrays.asList(reference_A,alt_C)); - 
truth_1_builder.alleles(Arrays.asList(reference_A,alt_C)); - eval_1_builder.genotypes(Arrays.asList(sam_1_1_eval,sam_1_2_eval,sam_1_3_eval)); - truth_1_builder.genotypes(Arrays.asList(sam_1_1_truth,sam_1_2_truth,sam_1_3_truth)); - - eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - - Pair testData = new Pair(eval_1_builder.make(),truth_1_builder.make()); - - return testData; - } - - @Test(enabled=true) - public void testSimpleComparison() { - Pair data = getData1(); - VariantContext eval = data.getFirst(); - VariantContext truth = data.getSecond(); - VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); - metrics.update(eval,truth); - Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2); - Assert.assertEquals(truth.getGenotype("test1_sample2").getType().ordinal(),1); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[2][1],1); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][1],0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][3],1); - Assert.assertEquals(metrics.getOverallGenotypeConcordance().getTable()[1][1],1); - } - - private Pair getData2() { - - Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); - Allele alt_C = Allele.create(BaseUtils.Base.C.base); - Allele alt_T = Allele.create(BaseUtils.Base.T.base); - - Genotype sam_1_1_eval = 
GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); - Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_T)); - Genotype sam_1_3_eval = GenotypeBuilder.create("test1_sample3", Arrays.asList(reference_A,alt_C)); - - Genotype sam_1_1_truth = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); - Genotype sam_1_2_truth = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_C)); - Genotype sam_1_3_truth = GenotypeBuilder.create("test1_sample3", Arrays.asList(alt_C,alt_C)); - - GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 3, 3); - VariantContextBuilder eval_1_builder = new VariantContextBuilder(); - VariantContextBuilder truth_1_builder = new VariantContextBuilder(); - - eval_1_builder.alleles(Arrays.asList(reference_A,alt_C,alt_T)); - truth_1_builder.alleles(Arrays.asList(reference_A,alt_C)); - eval_1_builder.genotypes(Arrays.asList(sam_1_1_eval,sam_1_2_eval,sam_1_3_eval)); - truth_1_builder.genotypes(Arrays.asList(sam_1_1_truth,sam_1_2_truth,sam_1_3_truth)); - - eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - - Pair testData = new Pair(eval_1_builder.make(),truth_1_builder.make()); - - return testData; - } - - @Test(enabled=true) - public void testMismatchingAlleleInAlleleSubset() { - Pair data = getData2(); - VariantContext eval = data.getFirst(); - VariantContext truth = data.getSecond(); - VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); - metrics.update(eval,truth); - 
Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2); - Assert.assertEquals(truth.getGenotype("test1_sample2").getType().ordinal(),2); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),1); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[2][1],0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][1],0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][3],1); - Assert.assertEquals(metrics.getOverallGenotypeConcordance().getTable()[1][1],1); - Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.EVAL_SUPERSET_TRUTH.ordinal()],1); - Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_DO_NOT_MATCH.ordinal()],0); - Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_MATCH.ordinal()],0); - - // now flip them around - - eval = data.getSecond(); - truth = data.getFirst(); - codec = new VCFCodec(); - evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - metrics = new ConcordanceMetrics(evalHeader,compHeader); - metrics.update(eval,truth); - Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2); - Assert.assertEquals(truth.getGenotype("test1_sample2").getType().ordinal(),2); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),1); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[1][2],0); - 
Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[1][2],0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[3][2],1); - Assert.assertEquals(metrics.getOverallGenotypeConcordance().getTable()[1][1],1); - Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.EVAL_SUPERSET_TRUTH.ordinal()],0); - Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.EVAL_SUBSET_TRUTH.ordinal()],1); - Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_DO_NOT_MATCH.ordinal()],0); - Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_MATCH.ordinal()],0); - } - - private Pair getData3() { - - Allele reference_ACT = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.C.base,BaseUtils.Base.T.base},true); - Allele alt_AC = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.C.base}); - Allele alt_A = Allele.create(BaseUtils.Base.A.base); - Allele alt_ATT = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.T.base,BaseUtils.Base.T.base}); - - Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_ACT,alt_ATT)); - Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(alt_A,alt_A)); - Genotype sam_1_3_eval = GenotypeBuilder.create("test1_sample3", Arrays.asList(reference_ACT,alt_A)); - - Genotype sam_1_1_truth = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_ACT,alt_AC)); - Genotype sam_1_2_truth = GenotypeBuilder.create("test1_sample2", Arrays.asList(alt_A,alt_A)); - Genotype sam_1_3_truth = GenotypeBuilder.create("test1_sample3", Arrays.asList(reference_ACT,alt_A)); - - GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 3, 5); - VariantContextBuilder 
eval_1_builder = new VariantContextBuilder(); - VariantContextBuilder truth_1_builder = new VariantContextBuilder(); - - eval_1_builder.alleles(Arrays.asList(reference_ACT,alt_ATT,alt_A)); - truth_1_builder.alleles(Arrays.asList(reference_ACT,alt_AC,alt_A)); - eval_1_builder.genotypes(Arrays.asList(sam_1_1_eval,sam_1_2_eval,sam_1_3_eval)); - truth_1_builder.genotypes(Arrays.asList(sam_1_1_truth,sam_1_2_truth,sam_1_3_truth)); - - eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - - Pair testData = new Pair(eval_1_builder.make(),truth_1_builder.make()); - - return testData; - } - - @Test(enabled=true) - public void testComplex() { - Pair data = getData3(); - VariantContext eval = data.getFirst(); - VariantContext truth = data.getSecond(); - VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); - metrics.update(eval,truth); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample1").getnMismatchingAlt(),1); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[2][1],0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[3][3],1); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[1][1],0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][1],0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][2],1); - Assert.assertEquals(metrics.getOverallGenotypeConcordance().getTable()[3][3],1); - 
Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.EVAL_SUPERSET_TRUTH.ordinal()],0); - Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_DO_NOT_MATCH.ordinal()],1); - Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_MATCH.ordinal()],0); - } - - private Pair getData4() { - - Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); - Allele alt_C = Allele.create(BaseUtils.Base.C.base); - Allele alt_T = Allele.create(BaseUtils.Base.T.base); - - Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); - Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(Allele.NO_CALL,Allele.NO_CALL)); - Genotype sam_1_3_eval = GenotypeBuilder.create("test1_sample3", Arrays.asList(reference_A,alt_C)); - - Genotype sam_1_1_truth = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); - Genotype sam_1_2_truth = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_C)); - Genotype sam_1_3_truth = GenotypeBuilder.create("test1_sample3", Arrays.asList(Allele.NO_CALL,Allele.NO_CALL)); - - GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 3, 3); - VariantContextBuilder eval_1_builder = new VariantContextBuilder(); - VariantContextBuilder truth_1_builder = new VariantContextBuilder(); - - eval_1_builder.alleles(Arrays.asList(reference_A,alt_C,alt_T)); - truth_1_builder.alleles(Arrays.asList(reference_A,alt_C)); - eval_1_builder.genotypes(Arrays.asList(sam_1_1_eval,sam_1_2_eval,sam_1_3_eval)); - truth_1_builder.genotypes(Arrays.asList(sam_1_1_truth,sam_1_2_truth,sam_1_3_truth)); - - eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - - Pair testData = new 
Pair(eval_1_builder.make(),truth_1_builder.make()); - - return testData; - } - - @Test(enabled=true) - public void testNoCalls() { - Pair data = getData4(); - VariantContext eval = data.getFirst(); - VariantContext truth = data.getSecond(); - VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); - metrics.update(eval,truth); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[2][1],0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[0][2],1); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][1],0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][3],0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][0],1); - } - - private Pair getData5() { - - Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); - Allele alt_C = Allele.create(BaseUtils.Base.C.base); - Allele alt_T = Allele.create(BaseUtils.Base.T.base); - - Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); - Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", new ArrayList(0)); - Genotype sam_1_3_eval = GenotypeBuilder.create("test1_sample3", Arrays.asList(reference_A,alt_C)); - - Genotype sam_1_1_truth = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); - Genotype sam_1_2_truth = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_C)); - Genotype 
sam_1_3_truth = GenotypeBuilder.create("test1_sample3", new ArrayList(0)); - - GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 3, 3); - VariantContextBuilder eval_1_builder = new VariantContextBuilder(); - VariantContextBuilder truth_1_builder = new VariantContextBuilder(); - - eval_1_builder.alleles(Arrays.asList(reference_A,alt_C,alt_T)); - truth_1_builder.alleles(Arrays.asList(reference_A,alt_C)); - eval_1_builder.genotypes(Arrays.asList(sam_1_1_eval,sam_1_2_eval,sam_1_3_eval)); - truth_1_builder.genotypes(Arrays.asList(sam_1_1_truth,sam_1_2_truth,sam_1_3_truth)); - - eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - - Pair testData = new Pair(eval_1_builder.make(),truth_1_builder.make()); - - return testData; - } - - @Test(enabled=true) - public void testMissing() { - Pair data = getData5(); - VariantContext eval = data.getFirst(); - VariantContext truth = data.getSecond(); - VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); - metrics.update(eval,truth); - Assert.assertTrue(eval.getGenotype("test1_sample2").getType().equals(GenotypeType.UNAVAILABLE)); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[2][1],0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[0][2],0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[4][2],1); - 
Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][1],0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][3],0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][0],0); - Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[2][4],1); - } - - private List> getData6() { - - Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); - Allele alt_C = Allele.create(BaseUtils.Base.C.base); - - - // site 1 - - // sample 1: hom-ref/hom-ref - // sample 2: het/hom-ref - - Genotype sam_2_1_1_eval = GenotypeBuilder.create("test2_sample1", Arrays.asList(reference_A,reference_A)); - Genotype sam_2_2_1_eval = GenotypeBuilder.create("test2_sample2", Arrays.asList(reference_A,alt_C)); - - Genotype sam_2_1_1_truth = GenotypeBuilder.create("test2_sample1", Arrays.asList(reference_A,reference_A)); - Genotype sam_2_2_1_truth = GenotypeBuilder.create("test2_sample2", Arrays.asList(reference_A,reference_A)); - - GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 3, 3); - VariantContextBuilder eval_1_builder = new VariantContextBuilder(); - VariantContextBuilder truth_1_builder = new VariantContextBuilder(); - - eval_1_builder.alleles(Arrays.asList(reference_A,alt_C)); - truth_1_builder.alleles(Arrays.asList(reference_A,alt_C)); - eval_1_builder.genotypes(Arrays.asList(sam_2_1_1_eval,sam_2_2_1_eval)); - truth_1_builder.genotypes(Arrays.asList(sam_2_1_1_truth,sam_2_2_1_truth)); - - eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - - Pair testDataSite1 = new Pair(eval_1_builder.make(),truth_1_builder.make()); - - reference_A = Allele.create(BaseUtils.Base.A.base,true); - Allele alt_T = Allele.create(BaseUtils.Base.T.base); - - // site 2 - - // sample 1: no-call/hom-ref - // sample 2: hom-var/hom-var - - Genotype sam_2_1_2_eval = 
GenotypeBuilder.create("test2_sample1",Arrays.asList(Allele.NO_CALL,Allele.NO_CALL)); - Genotype sam_2_2_2_eval = GenotypeBuilder.create("test2_sample2",Arrays.asList(alt_T,alt_T)); - Genotype sam_2_1_2_truth = GenotypeBuilder.create("test2_sample1",Arrays.asList(reference_A,reference_A)); - Genotype sam_2_2_2_truth = GenotypeBuilder.create("test2_sample2",Arrays.asList(alt_T,alt_T)); - - loc = genomeLocParser.createGenomeLoc("chr1", 4, 4); - eval_1_builder = new VariantContextBuilder(); - truth_1_builder = new VariantContextBuilder(); - - eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - eval_1_builder.alleles(Arrays.asList(reference_A,alt_T)); - truth_1_builder.alleles(Arrays.asList(reference_A,alt_T)); - eval_1_builder.genotypes(Arrays.asList(sam_2_1_2_eval,sam_2_2_2_eval)); - truth_1_builder.genotypes(Arrays.asList(sam_2_1_2_truth,sam_2_2_2_truth)); - - Pair testDataSite2 = new Pair(eval_1_builder.make(),truth_1_builder.make()); - - Allele alt_G = Allele.create(BaseUtils.Base.G.base); - - // site 3 - - // sample 1: alleles do not match - // sample 2: het/het - Genotype sam_2_1_3_eval = GenotypeBuilder.create("test2_sample1",Arrays.asList(alt_G,alt_T)); - Genotype sam_2_2_3_eval = GenotypeBuilder.create("test2_sample2",Arrays.asList(reference_A,alt_T)); - Genotype sam_2_1_3_truth = GenotypeBuilder.create("test2_sample1",Arrays.asList(alt_T,alt_T)); - Genotype sam_2_2_3_truth = GenotypeBuilder.create("test2_sample2",Arrays.asList(reference_A,alt_T)); - - loc = genomeLocParser.createGenomeLoc("chr1",5,5); - eval_1_builder = new VariantContextBuilder(); - truth_1_builder = new VariantContextBuilder(); - eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - eval_1_builder.alleles(Arrays.asList(reference_A,alt_T,alt_G)); - truth_1_builder.alleles(Arrays.asList(reference_A,alt_T)); - 
eval_1_builder.genotypes(Arrays.asList(sam_2_1_3_eval,sam_2_2_3_eval)); - truth_1_builder.genotypes(Arrays.asList(sam_2_1_3_truth,sam_2_2_3_truth)); - - Pair testDataSite3 = new Pair(eval_1_builder.make(),truth_1_builder.make()); - - // site 4 - - // sample 1: unavailable/het - // sample 2: unavailable/ref - Genotype sam_2_1_4_eval = GenotypeBuilder.create("test2_sample1",new ArrayList(0)); - Genotype sam_2_2_4_eval = GenotypeBuilder.create("test2_sample2",new ArrayList(0)); - Genotype sam_2_1_4_truth = GenotypeBuilder.create("test2_sample1",Arrays.asList(reference_A,alt_T)); - Genotype sam_2_2_4_truth = GenotypeBuilder.create("test2_sample2",Arrays.asList(reference_A,reference_A)); - - loc = genomeLocParser.createGenomeLoc("chr1",6,6); - eval_1_builder = new VariantContextBuilder(); - truth_1_builder = new VariantContextBuilder(); - eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - eval_1_builder.alleles(Arrays.asList(reference_A,alt_T)); - truth_1_builder.alleles(Arrays.asList(reference_A,alt_T)); - eval_1_builder.genotypes(Arrays.asList(sam_2_1_4_eval,sam_2_2_4_eval)); - truth_1_builder.genotypes(Arrays.asList(sam_2_1_4_truth,sam_2_2_4_truth)); - - Pair testDataSite4 = new Pair(eval_1_builder.make(),truth_1_builder.make()); - - // site 5 - - // sample 1: hom-var/no-call - // sample 2: het/het - Genotype sam_2_1_5_eval = GenotypeBuilder.create("test2_sample1",Arrays.asList(alt_C,alt_C)); - Genotype sam_2_2_5_eval = GenotypeBuilder.create("test2_sample2",Arrays.asList(reference_A,alt_C)); - Genotype sam_2_1_5_truth = GenotypeBuilder.create("test2_sample1",Arrays.asList(Allele.NO_CALL,Allele.NO_CALL)); - Genotype sam_2_2_5_truth = GenotypeBuilder.create("test2_sample2",Arrays.asList(reference_A,alt_C)); - - loc = genomeLocParser.createGenomeLoc("chr1",7,7); - eval_1_builder = new VariantContextBuilder(); - truth_1_builder = new VariantContextBuilder(); - 
eval_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - truth_1_builder.loc(loc.getContig(),loc.getStart(),loc.getStop()); - eval_1_builder.alleles(Arrays.asList(reference_A,alt_C)); - truth_1_builder.alleles(Arrays.asList(reference_A,alt_C)); - eval_1_builder.genotypes(Arrays.asList(sam_2_1_5_eval,sam_2_2_5_eval)); - truth_1_builder.genotypes(Arrays.asList(sam_2_1_5_truth,sam_2_2_5_truth)); - - Pair testDataSite5 = new Pair(eval_1_builder.make(),truth_1_builder.make()); - - return Arrays.asList(testDataSite1,testDataSite2,testDataSite3,testDataSite4,testDataSite5); - } - - @Test(enabled=true) - public void testMultiSite() { - int[][] sample1_expected = new int[GenotypeType.values().length][GenotypeType.values().length]; - int[][] sample2_expected = new int[GenotypeType.values().length][GenotypeType.values().length]; - // order: no-call,ref,het,hom-var,unavailable,mixed - sample1_expected[0] = new int[]{0,1,0,0,0,0}; - sample2_expected[0] = new int[]{0,0,0,0,0,0}; - sample1_expected[1] = new int[]{0,1,0,0,0,0}; - sample2_expected[1] = new int[]{0,0,0,0,0,0}; - sample1_expected[2] = new int[]{0,0,0,0,0,0}; - sample2_expected[2] = new int[]{0,1,2,0,0,0}; - sample1_expected[3] = new int[]{1,0,0,0,0,0}; - sample2_expected[3] = new int[]{0,0,0,1,0,0}; - sample1_expected[4] = new int[]{0,0,1,0,0,0}; - sample2_expected[4] = new int[]{0,1,0,0,0,0}; - - List> data = getData6(); - - VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); - VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); - ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); - - for ( Pair contextPair : data ) { - VariantContext eval = contextPair.getFirst(); - VariantContext comp = contextPair.getSecond(); - 
logger.warn(eval.toString()); - logger.warn(comp.toString()); - Assert.assertTrue(eval != null); - Assert.assertTrue(comp != null); - Assert.assertTrue(eval.getGenotype("test2_sample1") != null); - Assert.assertTrue(comp.getGenotype("test2_sample1") != null); - Assert.assertTrue(eval.getGenotype("test2_sample2") != null); - Assert.assertTrue(comp.getGenotype("test2_sample2") != null); - metrics.update(eval,comp); - } - - int[][] sample1_observed = metrics.getGenotypeConcordance("test2_sample1").getTable(); - int[][] sample2_observed = metrics.getGenotypeConcordance("test2_sample2").getTable(); - for ( GenotypeType eType : GenotypeType.values() ) { - for ( GenotypeType cType : GenotypeType.values() ) { - Assert.assertEquals(sample1_expected[eType.ordinal()][cType.ordinal()],sample1_observed[eType.ordinal()][cType.ordinal()]); - Assert.assertEquals(sample2_expected[eType.ordinal()][cType.ordinal()],sample2_observed[eType.ordinal()][cType.ordinal()]); - } - } - } - - @Test(enabled=true) - public void testNRD_testNRS_testMargins() { - Pair data = getData3(); - VariantContext eval = data.getFirst(); - VariantContext truth = data.getSecond(); - VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); - int[][] table = metrics.getOverallGenotypeConcordance().getTable(); - // set up the table - table[0] = new int[] {30, 12, 7, 5, 6, 0}; - table[1] = new int[] {10, 100, 5, 1, 7, 1}; - table[2] = new int[] {5, 7, 150, 3, 3, 1}; - table[3] = new int[] {3, 2, 6, 50, 1, 0}; - table[4] = new int[] {10, 6, 3, 3, 2, 0}; - table[5] = new int[] {12, 0, 34, 20, 10, 0}; - double EXPEC_NRS = 0.8969957; - double 
EXPEC_NRD = 0.1071429; - double EXPEC_OGC = 0.92592592; // (100+150+50)/(100+5+1+150+7+3+50+2+6) - Assert.assertEquals(EXPEC_NRS,metrics.getOverallNRS(),1e-7); - Assert.assertEquals(EXPEC_NRD,metrics.getOverallNRD(),1e-7); - Assert.assertEquals(EXPEC_OGC,metrics.getOverallOGC(),1e-7); - int EXPEC_EVAL_REF = 124; - int EXPEC_EVAL_HET = 169; - int EXPEC_EVAL_VAR = 62; - int EXPEC_COMP_REF = 127; - int EXPEC_COMP_HET = 205; - int EXPEC_COMP_VAR = 82; - Assert.assertEquals(metrics.getOverallGenotypeConcordance().getnEvalGenotypes(GenotypeType.HOM_REF),EXPEC_EVAL_REF); - Assert.assertEquals(metrics.getOverallGenotypeConcordance().getnEvalGenotypes(GenotypeType.HET),EXPEC_EVAL_HET); - Assert.assertEquals(metrics.getOverallGenotypeConcordance().getnEvalGenotypes(GenotypeType.HOM_VAR),EXPEC_EVAL_VAR); - Assert.assertEquals(metrics.getOverallGenotypeConcordance().getnCompGenotypes(GenotypeType.HOM_REF),EXPEC_COMP_REF); - Assert.assertEquals(metrics.getOverallGenotypeConcordance().getnCompGenotypes(GenotypeType.HET),EXPEC_COMP_HET); - Assert.assertEquals(metrics.getOverallGenotypeConcordance().getnCompGenotypes(GenotypeType.HOM_VAR),EXPEC_COMP_VAR); - } - - @Test(enabled=true) - public void testRobustness() { - VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_1)))); - VCFHeader disjointCompHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_2)))); - VCFHeader overlapCompHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_3)))); - ConcordanceMetrics disjointMetrics = new ConcordanceMetrics(evalHeader,disjointCompHeader); - ConcordanceMetrics overlapMetrics = new ConcordanceMetrics(evalHeader,overlapCompHeader); - - // test what happens if you put in disjoint sets 
and start making requests - Assert.assertEquals(0,disjointMetrics.getPerSampleGenotypeConcordance().size()); - String msg = "No Exception Thrown"; - try { - disjointMetrics.getGenotypeConcordance("test3_sample4"); - } catch ( Exception e) { - msg = e.getMessage(); - } - Assert.assertEquals("Attempted to request the concordance table for sample test3_sample4 on which it was not calculated",msg); - - // test that the overlapping sample is in the overlapping table (basically do this without throwing an exception) - overlapMetrics.getGenotypeConcordance("test3_sample3"); - - String msg2 = "No Exception Thrown"; - try { - disjointMetrics.getGenotypeConcordance("test3_sample4"); - } catch ( Exception e) { - msg2 = e.getMessage(); - } - Assert.assertEquals("Attempted to request the concordance table for sample test3_sample4 on which it was not calculated",msg2); - - // test what happens if you try to calculate NRS and NRD on an empty table - Assert.assertEquals(disjointMetrics.getOverallNRD(), 1.0, 1e-16); - Assert.assertEquals(disjointMetrics.getOverallNRS(), 0.0, 1e-16); - } - - public List> getData7() { - - Allele ref1 = Allele.create(BaseUtils.Base.T.base,true); - Allele alt1 = Allele.create(BaseUtils.Base.C.base); - Allele alt2 = Allele.create(BaseUtils.Base.G.base); - Allele alt3 = Allele.create(BaseUtils.Base.A.base); - - GenomeLoc loc1 = genomeLocParser.createGenomeLoc("chr1",1,1); - VariantContextBuilder site1Eval = new VariantContextBuilder(); - VariantContextBuilder site1Comp = new VariantContextBuilder(); - - - // site 1: eval superset comp - site1Eval.loc(loc1.getContig(),loc1.getStart(),loc1.getStop()); - site1Comp.loc(loc1.getContig(),loc1.getStart(),loc1.getStop()); - site1Eval.alleles(Arrays.asList(ref1,alt1,alt2)); - site1Comp.alleles(Arrays.asList(ref1,alt2)); - site1Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt2))); - 
site1Comp.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt2)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt2))); - - // site 2: eval subset comp - GenomeLoc loc2 = genomeLocParser.createGenomeLoc("chr1",2,2); - VariantContextBuilder site2Eval = new VariantContextBuilder(); - VariantContextBuilder site2Comp = new VariantContextBuilder(); - site2Eval.loc(loc2.getContig(),loc2.getStart(),loc2.getStop()); - site2Comp.loc(loc2.getContig(),loc2.getStart(),loc2.getStop()); - site2Eval.alleles(Arrays.asList(ref1,alt1)); - site2Comp.alleles(Arrays.asList(ref1,alt1,alt3)); - site2Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt1))); - site2Comp.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt3)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt1))); - - // site 3: eval only - GenomeLoc loc3 = genomeLocParser.createGenomeLoc("chr1",3,3); - VariantContextBuilder site3Eval = new VariantContextBuilder(); - VariantContextBuilder site3Comp = new VariantContextBuilder(); - site3Eval.loc(loc3.getContig(),loc3.getStart(),loc3.getStop()); - site3Comp.loc(loc3.getContig(),loc3.getStart(),loc3.getStop()); - site3Eval.alleles(Arrays.asList(ref1,alt1)); - site3Comp.alleles(Arrays.asList(ref1,alt1)); - site3Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt1))); - site3Comp.genotypes(GenotypeBuilder.create("test2_sample1",new ArrayList(0)),GenotypeBuilder.create("test2_sample2",new ArrayList(0))); - - // site 4: comp only - monomorphic - GenomeLoc loc4 = genomeLocParser.createGenomeLoc("chr1",4,4); - VariantContextBuilder site4Eval = new VariantContextBuilder(); - VariantContextBuilder site4Comp = new VariantContextBuilder(); - site4Eval.loc(loc4.getContig(),loc4.getStart(),loc4.getStop()); - 
site4Comp.loc(loc4.getContig(),loc4.getStart(),loc4.getStop()); - site4Eval.alleles(Arrays.asList(ref1,alt1)); - site4Comp.alleles(Arrays.asList(ref1,alt1)); - site4Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,ref1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,ref1))); - site4Comp.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt1))); - - // site 5: overlapping - GenomeLoc loc5 = genomeLocParser.createGenomeLoc("chr1",5,5); - VariantContextBuilder site5Eval = new VariantContextBuilder(); - VariantContextBuilder site5Comp = new VariantContextBuilder(); - site5Eval.loc(loc5.getContig(),loc5.getStart(),loc5.getStop()); - site5Comp.loc(loc5.getContig(),loc5.getStart(),loc5.getStop()); - site5Eval.alleles(Arrays.asList(ref1,alt1,alt3)); - site5Comp.alleles(Arrays.asList(ref1,alt1,alt3)); - site5Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(alt1,alt3))); - site5Comp.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(alt1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(alt3,alt3))); - - // site 6: some non-matching alts - GenomeLoc loc6 = genomeLocParser.createGenomeLoc("chr1",6,6); - VariantContextBuilder site6Eval = new VariantContextBuilder(); - VariantContextBuilder site6Comp = new VariantContextBuilder(); - site6Eval.loc(loc6.getContig(),loc6.getStart(),loc6.getStop()); - site6Comp.loc(loc6.getContig(),loc6.getStart(),loc6.getStop()); - site6Eval.alleles(Arrays.asList(ref1,alt1,alt2)); - site6Comp.alleles(Arrays.asList(ref1,alt1,alt3)); - site6Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt2))); - 
site6Comp.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt3))); - - // site 7: matching with no-calls - GenomeLoc loc7 = genomeLocParser.createGenomeLoc("chr1",7,7); - VariantContextBuilder site7Eval = new VariantContextBuilder(); - VariantContextBuilder site7Comp = new VariantContextBuilder(); - site7Eval.loc(loc7.getContig(),loc7.getStart(),loc7.getStop()); - site7Comp.loc(loc7.getContig(),loc7.getStart(),loc7.getStop()); - site7Eval.alleles(Arrays.asList(ref1,alt1)); - site7Comp.alleles(Arrays.asList(ref1,alt1)); - site7Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(Allele.NO_CALL,Allele.NO_CALL))); - site7Comp.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(ref1,alt1)),GenotypeBuilder.create("test2_sample2",Arrays.asList(ref1,alt1))); - - Pair site1 = new Pair(site1Eval.make(),site1Comp.make()); - Pair site2 = new Pair(site2Eval.make(),site2Comp.make()); - Pair site3 = new Pair(site3Eval.make(),site3Comp.make()); - Pair site4 = new Pair(site4Eval.make(),site4Comp.make()); - Pair site5 = new Pair(site5Eval.make(),site5Comp.make()); - Pair site6 = new Pair(site6Eval.make(),site6Comp.make()); - Pair site7 = new Pair(site7Eval.make(),site7Comp.make()); - - return Arrays.asList(site1,site2,site3,site4,site5,site6,site7); - } - - @Test(enabled = true) - public void testSites() { - VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); - VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); - ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); - - List> data = getData7(); - - int idx = 0; - int[] expecNotMatch = new 
int[]{0,0,0,0,0,1,1}; - for ( Pair varPair : data ) { - metrics.update(varPair.getFirst(),varPair.getSecond()); - Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.ALLELES_DO_NOT_MATCH),expecNotMatch[idx]); - logger.info(idx); - idx++; - } - - Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.ALLELES_DO_NOT_MATCH),1); - Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.ALLELES_MATCH),2); - Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.EVAL_ONLY),1); - Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.TRUTH_ONLY),1); - Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.EVAL_SUBSET_TRUTH),1); - Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.EVAL_SUPERSET_TRUTH),1); - - } -} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java deleted file mode 100644 index a7d32d43b..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java +++ /dev/null @@ -1,77 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -/** - * Tests LeftAlignAndTrimVariants - */ -public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest { - - @Test - public void testLeftAlignment() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forLeftAlignVariantsTest.vcf --no_cmdline_in_header", - 1, - Arrays.asList("bcf05f56adbb32a47b6d6b27b327d5c2")); - executeTest("test left alignment", spec); - } - - @Test - public void testLeftAlignmentWithTrimmingAndMultialleliecs() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --no_cmdline_in_header -trim -split", - 1, - Arrays.asList("4ae03954f8bd66e73fd005c49ea301db")); - executeTest("test left alignment with trimming and hard multiple alleles", spec); - - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java deleted file mode 100644 index 884b46692..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ /dev/null @@ -1,347 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class SelectVariantsIntegrationTest extends WalkerTest { - public static String baseTestString(String args) { - return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s --no_cmdline_in_header" + args; - } - - @Test - public void testDiscordanceNoSampleSpecified() { - String testFile = privateTestDir + "NA12878.hg19.example1.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + hg19Reference + " -L 20:1012700-1020000 --variant " - + b37hapmapGenotypes + " -disc " + testFile - + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", - 1, - Arrays.asList("954415f84996d27b07d00855e96d33a2") - ); - spec.disableShadowBCF(); - - executeTest("testDiscordanceNoSampleSpecified--" + testFile, spec); - } - - @Test - public void testRepeatedLineSelection() { - String testfile = privateTestDir + "test.dup.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -sn B -sn C --variant " + testfile), - 1, - Arrays.asList("125d1c9fa111cd38dfa2ff3900f16b57") - ); - - executeTest("testRepeatedLineSelection--" + testfile, spec); - } - - @Test - public void testDiscordance() { - String testFile = privateTestDir + "NA12878.hg19.example1.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " - + b37hapmapGenotypes + " -disc " + testFile - + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", - 1, - Arrays.asList("ca1b5226eaeaffb78d4abd9d2ee10c43") - ); - spec.disableShadowBCF(); - - executeTest("testDiscordance--" + testFile, spec); - } - - @Test - public void testComplexSelection() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = 
validationDataLocation + "SelectVariants.samples.txt"; - - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), - 1, - Arrays.asList("4386fbb258dcef4437495a37f5a83c53") - ); - spec.disableShadowBCF(); - executeTest("testComplexSelection--" + testfile, spec); - } - - @Test - public void testComplexSelectionWithNonExistingSamples() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; - - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES -sn A -se '[CDH]' -sn Z -sn T -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), - 1, - Arrays.asList("4386fbb258dcef4437495a37f5a83c53") - ); - spec.disableShadowBCF(); - executeTest("testComplexSelectionWithNonExistingSamples--" + testfile, spec); - } - - @Test - public void testNonExistingFieldSelection() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -env -ef -select 'foo!=0||DP>0' --variant " + testfile), - 1, - Arrays.asList("44e77cea624cfff2b8acc3a4b30485cb") // should yield empty vcf because the foo!=0 will yield complete expression false - ); - spec.disableShadowBCF(); - executeTest("testNonExistingSelection--" + testfile, spec); - } - - @Test - public void testSampleExclusionFromFileAndSeparateSample() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sn A -xl_sf " + samplesFile + " --variant " + testfile, - 1, - Arrays.asList("1f5c72951a35667c4bdf1be153787e27") - ); - spec.disableShadowBCF(); - 
- executeTest("testSampleExclusion--" + testfile, spec); - } - - @Test - public void testSampleExclusionJustFromFile() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sf " + samplesFile + " --variant " + testfile, - 1, - Arrays.asList("875d7e00ac8081e87ab9fb1b20c83677") - ); - spec.disableShadowBCF(); - - executeTest("testSampleExclusion--" + testfile, spec); - } - - @Test - public void testSampleInclusionWithNonexistingSamples() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -sn A -sn Z -sn Q -sf " + samplesFile + " --variant " + testfile, - 1, - UserException.BadInput.class - ); - spec.disableShadowBCF(); - - executeTest("testSampleInclusionWithNonexistingSamples--" + testfile, spec); - } - - - @Test - public void testConcordance() { - String testFile = privateTestDir + "NA12878.hg19.example1.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc " - + b37hapmapGenotypes + " --variant " + testFile - + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", - 1, - Arrays.asList("946e7f2e0ae08dc0e931c1634360fc46") - ); - spec.disableShadowBCF(); - - executeTest("testConcordance--" + testFile, spec); - } - - @Test - public void testVariantTypeSelection() { - String testFile = privateTestDir + "complexExample1.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s 
--no_cmdline_in_header", - 1, - Arrays.asList("ca2b70e3171420b08b0a2659bfe2a794") - ); - - executeTest("testVariantTypeSelection--" + testFile, spec); - } - - @Test - public void testIndelLengthSelection() { - String testFile = privateTestDir + "complexExample1.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --maxIndelSize 3", - 1, - Arrays.asList("004589868ca5dc887e2dff876b4cc797") - ); - - executeTest("testIndelLengthSelection--" + testFile, spec); - } - - @Test - public void testUsingDbsnpName() { - String testFile = privateTestDir + "combine.3.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + testFile + " -o %s --no_cmdline_in_header", - 1, - Arrays.asList("a554459c9ccafb9812ff6d8c06c11726") - ); - - executeTest("testUsingDbsnpName--" + testFile, spec); - } - - @Test - public void testRemoveMLE() { - String testFile = privateTestDir + "vcfexample.withMLE.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", - 1, - Arrays.asList("a554459c9ccafb9812ff6d8c06c11726") - ); - - executeTest("testRemoveMLE--" + testFile, spec); - } - - @Test - public void testKeepOriginalAC() { - String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants --keepOriginalAC -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", - 1, - Arrays.asList("ad7e8b25e431a3229a78cec063876559") - ); - - executeTest("testKeepOriginalAC--" + testFile, spec); - } - - @Test - public void testKeepOriginalACAndENV() { - String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants 
--keepOriginalAC -env -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", - 1, - Arrays.asList("e9b8292212545684cdb163423329ee7e") - ); - - executeTest("testKeepOriginalACAndENV--" + testFile, spec); - } - - @Test - public void testMultipleRecordsAtOnePosition() { - String testFile = privateTestDir + "selectVariants.onePosition.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -select 'KG_FREQ < 0.5' --variant " + testFile + " -o %s --no_cmdline_in_header", - 1, - Arrays.asList("44f7c47395ca5b2afef5313f592c8cea") - ); - - executeTest("testMultipleRecordsAtOnePosition--" + testFile, spec); - } - - @Test - public void testNoGTs() { - String testFile = privateTestDir + "vcf4.1.example.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b37KGReference + " --variant " + testFile + " -o %s --no_cmdline_in_header", - 1, - Arrays.asList("ef3c5f75074a5dd2b2cd2715856a2542") - ); - - executeTest("testNoGTs--" + testFile, spec); - } - - @Test - public void testSelectFromMultiAllelic() { - String testfile = privateTestDir + "multi-allelic.bi-allelicInGIH.vcf"; - String samplesFile = privateTestDir + "GIH.samples.list"; - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile, - 1, - Arrays.asList("69862fb97e8e895fe65c7abb14b03cee") - ); - executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec); - } - - @Test() - public void testFileWithoutInfoLineInHeader() { - testFileWithoutInfoLineInHeader("testFileWithoutInfoLineInHeader", IllegalStateException.class); - } - - @Test() - public void testFileWithoutInfoLineInHeaderWithOverride() { - testFileWithoutInfoLineInHeader("testFileWithoutInfoLineInHeaderWithOverride", null); - } - - private void testFileWithoutInfoLineInHeader(final String name, final 
Class expectedException) { - final String testFile = privateTestDir + "missingHeaderLine.vcf"; - final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " - + testFile + " -o %s --no_cmdline_in_header" - + (expectedException == null ? " -U LENIENT_VCF_PROCESSING" : ""); - WalkerTestSpec spec = - expectedException != null - ? new WalkerTestSpec(cmd, 1, expectedException) - : new WalkerTestSpec(cmd, 1, Arrays.asList("")); - spec.disableShadowBCF(); - - executeTest(name, spec); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java deleted file mode 100644 index 4d7fa28ad..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java +++ /dev/null @@ -1,105 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class SelectVariantsParallelIntegrationTest extends WalkerTest { - - private class ParallelSelectTestProvider extends TestDataProvider { - final String reference; - final String args; - final String md5; - final int nt; - - private ParallelSelectTestProvider(final String reference, final String args, final String md5, final int nt) { - super(ParallelSelectTestProvider.class); - this.reference = reference; - this.args = args; - this.md5 = md5; - this.nt = nt; - } - - public final String getCmdLine() { - return "-T SelectVariants -R " + reference + " -o %s -L 1 --no_cmdline_in_header -nt " + nt + " " + args; - } - - public String toString() { - return String.format("ParallelSelectVariants nt=%d args=%s", nt, args); - } - } - - @DataProvider(name = "ParallelSelectTest") - public Object[][] makeParallelSelectTestProvider() { - for ( int nt : Arrays.asList(1, 2, 4) ) { - { // original MAF test - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; - String args = " -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile; - new ParallelSelectTestProvider(b36KGReference, args, "4386fbb258dcef4437495a37f5a83c53", nt); - } - { // new tests on b37 using testdir VCF - final String testfile = privateTestDir + "NA12878.hg19.example1.vcf"; - final String args = "-select 'DP > 30' -V " + testfile; - new ParallelSelectTestProvider(b37KGReference, args, "c64b45a14d41b1e5cddbe036b47e7519", nt); - } - } - - return ParallelSelectTestProvider.getTests(ParallelSelectTestProvider.class); - } - - @Test(dataProvider = "ParallelSelectTest") - public void 
testParallelSelectTestProvider(final ParallelSelectTestProvider cfg) { - final WalkerTestSpec spec = new WalkerTestSpec( cfg.getCmdLine(), 1, Arrays.asList(cfg.md5) ); - executeTest(cfg.toString(), spec); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/RandomDNA.java b/protected/java/test/org/broadinstitute/sting/utils/RandomDNA.java deleted file mode 100644 index 88f5910f7..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/RandomDNA.java +++ /dev/null @@ -1,127 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ -package org.broadinstitute.sting.utils; - -import com.sun.istack.internal.NotNull; - -import java.util.Random; - -/** - * Random DNA sequence generator. - * - *

- * Returned bases are always in upper case and one of the valid four nocleotides 'A', 'C', 'G' and 'T'. - *

- * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class RandomDNA { - - private Random random; - - /** - * Constructs a new random DNA generator. - * - *

- * The seed would be the default which would depend on system properties and the current time as - * described in {@link Random} documentation. - *

- */ - public RandomDNA() { - random = new Random(); - } - - /** - * Constructs a new random DNA generator providing a seed. - * - * @param seed the random number generator seed. - */ - public RandomDNA(final long seed) { - random = new Random(seed); - } - - /** - * Updates the content of a byte array with a random base sequence. - * - *

- * The whole array will be filled with new base values. - *

- * - * @param destination the array to update. - * - * @throws NullPointerException if {@code destination} is {@code null}. - */ - public void nextBases(final byte[] destination) { - random.nextBytes(destination); - for (int i = 0; i < destination.length; i++) { - final int ord = destination[i] & 0x03; - switch (ord) { - case 0: destination[i] = 'A'; break; - case 1: destination[i] = 'C'; break; - case 2: destination[i] = 'G'; break; - case 3: destination[i] = 'T'; break; - default: throw new IllegalStateException("this cannot be happening!!!"); - } - } - } - - /** - * Returns a random RNA sequence of bases. - * @param size the length of the sequence. - * - * @throws IllegalArgumentException if {@code size} is negative. - * @return never {@code null}. - */ - @NotNull - public byte[] nextBases(final int size) { - if (size < 0) throw new IllegalArgumentException("the size cannot be negative"); - final byte[] result = new byte[size]; - nextBases(result); - return result; - } - - -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java deleted file mode 100644 index 5c14c490e..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java +++ /dev/null @@ -1,362 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.gvcf; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.ReferenceConfidenceModel; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.testng.Assert; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -public class GVCFWriterUnitTest extends BaseTest { - private static class MockWriter implements VariantContextWriter { - final List emitted = new ArrayList<>(); - boolean headerWritten = false; - boolean closed = false; - - @Override - public void writeHeader(VCFHeader header) { - headerWritten = true; - } - - @Override - public void close() { - closed = true; - } - - @Override - public void add(VariantContext vc) { - emitted.add(vc); - } - } - - private MockWriter mockWriter; - private List 
standardPartition = Arrays.asList(1, 10, 20); - private Allele REF = Allele.create("N", true); - private Allele ALT = Allele.create("A"); - private List ALLELES = Arrays.asList(REF, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - private final String SAMPLE_NAME = "XXYYZZ"; - - @BeforeMethod - public void setUp() throws Exception { - mockWriter = new MockWriter(); - } - - @Test - public void testHeaderWriting() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - writer.writeHeader(new VCFHeader()); - Assert.assertTrue(mockWriter.headerWritten); - } - - @Test - public void testClose() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - writer.close(); - Assert.assertTrue(mockWriter.closed); - } - - @Test - public void testCloseWithoutClosingUnderlyingWriter() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - writer.close(false); - Assert.assertFalse(mockWriter.closed); - } - - private VariantContext makeHomRef(final String contig, final int start, final int GQ) { - final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, ALLELES); - final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, REF)); - gb.GQ(GQ); - gb.DP(10); - gb.AD(new int[]{1, 2}); - gb.PL(new int[]{0, 10, 100}); - return vcb.genotypes(gb.make()).make(); - } - - private VariantContext makeHomRefAlt(final String contig, final int start, final int GQ) { - final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, Arrays.asList(REF, ALT)); - final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, REF)); - gb.GQ(GQ); - gb.DP(10); - gb.AD(new int[]{1, 2}); - gb.PL(new int[]{0, 10, 100}); - return vcb.genotypes(gb.make()).make(); - } - - private VariantContext makeNonRef(final String contig, final int start, final int GQ) { - final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, 
Arrays.asList(REF, ALT)); - final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, ALT)); - gb.GQ(GQ); - gb.DP(10); - gb.AD(new int[]{1, 2}); - gb.PL(new int[]{0, 10, 100}); - return vcb.genotypes(gb.make()).make(); - } - - private VariantContext makeDeletion(final String contig, final int start, final int size) { - final String del = Utils.dupString("A", size); - final String alt = del.substring(0, 1); - final VariantContext vc = GATKVariantContextUtils.makeFromAlleles("test", contig, start, Arrays.asList(del, alt)); - final VariantContextBuilder vcb = new VariantContextBuilder(vc); - final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(vc.getReference(), vc.getAlternateAllele(0))); - gb.GQ(50); - gb.DP(10); - gb.AD(new int[]{1, 2}); - gb.PL(new int[]{0, 10, 100}); - return vcb.genotypes(gb.make()).make(); - } - - @Test - public void testCloseEmitsLastVariant() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeHomRef("20", 1, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 0); - - writer.close(); - Assert.assertTrue(mockWriter.closed); - Assert.assertEquals(mockWriter.emitted.size(), 1); - } - - @Test - public void testCloseDoesntEmitsLastVariantWhenNonRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeNonRef("20", 1, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 1); - - writer.close(); - Assert.assertTrue(mockWriter.closed); - Assert.assertEquals(mockWriter.emitted.size(), 1); - } - - @Test - public void testCrossingContigBoundaryRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeHomRef("20", 1, 30)); - writer.add(makeHomRef("20", 2, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 0); - writer.add(makeHomRef("21", 3, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 1); - assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); - - 
writer.close(); - Assert.assertEquals(mockWriter.emitted.size(), 2); - assertGoodVC(mockWriter.emitted.get(1), "21", 3, 3, false); - } - - @Test - public void testCrossingContigBoundaryNonRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeHomRef("20", 1, 30)); - writer.add(makeHomRef("20", 2, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 0); - writer.add(makeNonRef("21", 3, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 2); - assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); - assertGoodVC(mockWriter.emitted.get(1), "21", 3, 3, true); - } - - @Test - public void testCrossingContigBoundaryNonRefThenNonRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeNonRef("20", 1, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 1); - writer.add(makeNonRef("21", 1, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 2); - assertGoodVC(mockWriter.emitted.get(0), "20", 1, 1, true); - assertGoodVC(mockWriter.emitted.get(1), "21", 1, 1, true); - } - - private void assertGoodVC(final VariantContext vc, final String contig, final int start, final int stop, final boolean nonRef) { - Assert.assertEquals(vc.getChr(), contig); - Assert.assertEquals(vc.getStart(), start); - Assert.assertEquals(vc.getEnd(), stop); - if ( nonRef ) { - Assert.assertNotEquals(vc.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - } else { - Assert.assertEquals(vc.getNAlleles(), 2); - Assert.assertEquals(vc.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - Assert.assertEquals(vc.getAttributeAsInt(GVCFWriter.BLOCK_SIZE_INFO_FIELD, -1), stop - start + 1); - Assert.assertEquals(vc.getAttributeAsInt(VCFConstants.END_KEY, -1), stop); - Assert.assertTrue(vc.hasGenotypes()); - Assert.assertTrue(vc.hasGenotype(SAMPLE_NAME)); - Assert.assertEquals(vc.getGenotypes().size(), 1); - final Genotype g = vc.getGenotype(SAMPLE_NAME); - 
Assert.assertEquals(g.hasAD(), false); - Assert.assertEquals(g.hasLikelihoods(), true); - Assert.assertEquals(g.hasPL(), true); - Assert.assertEquals(g.getPL().length == 3, true); - Assert.assertEquals(g.hasDP(), true); - Assert.assertEquals(g.hasGQ(), true); - } - } - - @Test - public void testVariantForcesNonRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeHomRef("20", 1, 30)); - writer.add(makeHomRef("20", 2, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 0); - writer.add(makeNonRef("20", 3, 30)); - writer.add(makeHomRef("20", 4, 30)); - writer.add(makeHomRef("20", 5, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 2); - assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); - assertGoodVC(mockWriter.emitted.get(1), "20", 3, 3, true); - writer.close(); - assertGoodVC(mockWriter.emitted.get(2), "20", 4, 5, false); - } - - @Test - public void testEmittingTwoBands() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeHomRef("20", 1, 0)); - writer.add(makeHomRef("20", 2, 0)); - Assert.assertEquals(mockWriter.emitted.size(), 0); - writer.add(makeHomRef("20", 3, 50)); - writer.add(makeHomRef("20", 4, 50)); - writer.close(); - Assert.assertEquals(mockWriter.emitted.size(), 2); - assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); - assertGoodVC(mockWriter.emitted.get(1), "20", 3, 4, false); - } - - @Test - public void testNonContiguousBlocks() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeHomRef("20", 1, 0)); - writer.add(makeHomRef("20", 2, 0)); - writer.add(makeHomRef("20", 10, 0)); - writer.add(makeHomRef("20", 11, 0)); - writer.close(); - Assert.assertEquals(mockWriter.emitted.size(), 2); - assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); - assertGoodVC(mockWriter.emitted.get(1), "20", 10, 11, false); - } - - @Test - public void testDeletion() { - final GVCFWriter writer = new 
GVCFWriter(mockWriter, standardPartition); - - writer.add(makeHomRef("20", 1, 0)); - writer.add(makeHomRef("20", 2, 0)); - writer.add(makeDeletion("20", 3, 3)); - writer.add(makeHomRef("20", 4, 0)); - writer.add(makeHomRef("20", 5, 0)); - writer.add(makeHomRef("20", 6, 0)); - writer.add(makeHomRef("20", 7, 0)); - writer.close(); - Assert.assertEquals(mockWriter.emitted.size(), 3); - assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); - assertGoodVC(mockWriter.emitted.get(1), "20", 3, 5, true); - assertGoodVC(mockWriter.emitted.get(2), "20", 6, 7, false); - } - - @Test - public void testHomRefAlt() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeHomRef("20", 1, 0)); - writer.add(makeHomRef("20", 2, 0)); - writer.add(makeHomRefAlt("20", 3, 0)); - writer.add(makeHomRef("20", 4, 0)); - writer.add(makeHomRef("20", 5, 0)); - writer.add(makeHomRef("20", 6, 0)); - writer.add(makeHomRef("20", 7, 0)); - writer.close(); - Assert.assertEquals(mockWriter.emitted.size(), 3); - assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); - Assert.assertFalse(mockWriter.emitted.get(1).hasAttribute("END")); - Assert.assertFalse(mockWriter.emitted.get(1).hasAttribute("BLOCK_SIZE")); - assertGoodVC(mockWriter.emitted.get(2), "20", 4, 7, false); - } - - @DataProvider(name = "BandPartitionData") - public Object[][] makeBandPartitionData() { - List tests = new ArrayList<>(); - - tests.add(new Object[]{null, false}); - tests.add(new Object[]{Collections.emptyList(), false}); - tests.add(new Object[]{Arrays.asList(1), true}); - tests.add(new Object[]{Arrays.asList(1, 10), true}); - tests.add(new Object[]{Arrays.asList(1, 10, 30), true}); - tests.add(new Object[]{Arrays.asList(10, 1, 30), false}); - tests.add(new Object[]{Arrays.asList(-1, 1), false}); - tests.add(new Object[]{Arrays.asList(1, null, 10), false}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "BandPartitionData") - public void testMyData(final 
List partitions, final boolean expectedGood) { - try { - GVCFWriter.parsePartitions(partitions); - Assert.assertTrue(expectedGood, "Expected to fail but didn't"); - } catch ( Exception e ) { - Assert.assertTrue(! expectedGood, "Expected to succeed but failed with message " + e.getMessage()); - } - } -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java deleted file mode 100644 index 337f23afe..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java +++ /dev/null @@ -1,99 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.nanoScheduler; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -// ********************************************************************************** // -// Note that this class also serves as an integration test for the VariantAnnotator! 
// -// ********************************************************************************** // - -public class NanoSchedulerIntegrationTest extends WalkerTest { - @DataProvider(name = "NanoSchedulerUGTest") - public Object[][] createNanoSchedulerUGTest() { - List tests = new ArrayList(); - - for ( final int nt : Arrays.asList(1, 2) ) - for ( final int nct : Arrays.asList(1, 2) ) { -// tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); -//// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); - tests.add(new Object[]{ "BOTH", "a80925b58735828158491f77ae64998b", nt, nct }); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true, dataProvider = "NanoSchedulerUGTest") - private void testNanoSchedulerUGTest(final String glm, final String md5, final int nt, final int nct ) { - WalkerTestSpec spec = new WalkerTestSpec( - buildCommandLine( - "-T UnifiedGenotyper -R " + b37KGReference, - "--no_cmdline_in_header -G", - //"--dbsnp " + b37dbSNP132, - "-I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", - "-L 20:10,000,000-10,100,000", - "-glm " + glm, - "--contamination_fraction_to_filter 0.0", - "-nt " + nt, - "-nct " + nct, - "-o %s" - ), - 1, - Arrays.asList(md5) - ); - executeTest(String.format("testUG-glm:%s-nt%d-nct%d", glm, nt, nct), spec); - } - - - -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java deleted file mode 100644 index 84b995749..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java +++ /dev/null @@ -1,588 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.pairhmm; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; -import org.apache.commons.math.distribution.ExponentialDistribution; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.AssemblyResult; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.AssemblyResultSet; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Civar; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** -* Mock-up active region data used in testing. 
-* -* @author Valentin Ruano-Rubio <valentin@broadinstitute.org> -*/ -public class ActiveRegionTestDataSet { - - private final byte[] referenceBytes; - protected String reference; - protected String[] haplotypeCigars; - protected List haplotypeStrings; - protected String[] readCigars; - protected byte[] bq; - protected byte[] dq; - protected byte[] iq; - protected int kmerSize; - private List haplotypeList; - private List readList; - private AssemblyResultSet assemblyResultSet; - private Map readBySequence; - private String stringRepresentation; - private List> readEventOffsetList; - private GenomeLocParser genomeLocParser; - - /** Create a new active region data test set */ - public ActiveRegionTestDataSet(final int kmerSize, final String reference, final String[] haplotypes, - final String[] readCigars, final byte[] bq, final byte[] dq, final byte[] iq) { - this.reference = reference; - this.referenceBytes = reference.getBytes(); - this.haplotypeCigars = haplotypes; - this.readCigars = readCigars; - this.bq = bq; - this.dq = dq; - this.iq = iq; - this.kmerSize = kmerSize; - this.genomeLocParser = new GenomeLocParser(ArtificialSAMUtils.createArtificialSamHeader(1,1,reference.length()).getSequenceDictionary()); - } - - public String getReference() { - return reference; - } - - public String toString() { - if (stringRepresentation == null) - return super.toString(); - else return stringRepresentation; - } - - public AssemblyResultSet assemblyResultSet() { - if (assemblyResultSet == null) { - final ReadThreadingGraph rtg = new ReadThreadingGraph(kmerSize); - rtg.addSequence("anonymous", this.getReference().getBytes(), null, true); - for (final String haplotype : this.haplotypesStrings()) { - rtg.addSequence("anonymous", haplotype.getBytes(), null, false); - } - rtg.buildGraphIfNecessary(); - if (rtg.hasCycles()) - throw new RuntimeException("there is cycles in the reference with kmer size " + kmerSize + ". 
Don't use this size for the benchmark or change the reference"); - - List haplotypeList = this.haplotypeList(); - - assemblyResultSet = new AssemblyResultSet(); - final AssemblyResult ar = new AssemblyResult((haplotypeList.size() > 1 ? - AssemblyResult.Status.ASSEMBLED_SOME_VARIATION : AssemblyResult.Status.JUST_ASSEMBLED_REFERENCE),rtg.convertToSequenceGraph()); - ar.setThreadingGraph(rtg); - - for (final Haplotype h : haplotypeList) - assemblyResultSet.add(h, ar); - } - return assemblyResultSet; - } - - public List haplotypesStrings() { - if (haplotypeStrings != null) { - return haplotypeStrings; - } - final List result = new ArrayList<>(haplotypeCigars.length); - String reference = this.reference; - for (final String cigar : haplotypeCigars) { - if (cigar.matches("^Civar:.*$")) { - stringRepresentation = cigar.substring(6); - result.addAll(expandAllCombinations(cigar.substring(6),reference)); - } else if (cigar.matches("^.*\\d+.*$")) { - result.add(applyCigar(reference, cigar,0,true)); - } else { - result.add(cigar); - } - } - haplotypeStrings = result; - return result; - } - - private List expandAllCombinations(final String cigarString, final String reference) { - final Civar civar = Civar.fromCharSequence(cigarString); - final List unrolledCivars = civar.optionalizeAll().unroll(); - List result = new ArrayList<>(unrolledCivars.size()); - for (final Civar c : unrolledCivars) { - result.add(c.applyTo(reference)); - } - return result; - } - - private List expandAllHaplotypeCombinations(final String civarString, final String reference) { - final Civar civar = Civar.fromCharSequence(civarString); - final List unrolledCivars = civar.optionalizeAll().unroll(); - List result = new ArrayList<>(unrolledCivars.size()); - for (final Civar c : unrolledCivars) { - final String baseString = c.applyTo(reference); - final Haplotype haplotype = new Haplotype(baseString.getBytes(),baseString.equals(reference)); - 
haplotype.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); - try { - haplotype.setCigar(c.toCigar(reference.length())); - } catch (final RuntimeException ex) { - c.applyTo(reference); - c.toCigar(reference.length()); - throw new RuntimeException("" + c + " " + ex.getMessage(),ex); - } - result.add(haplotype); - } - return result; - } - - - public List haplotypeList() { - if (haplotypeList == null) { - - final List result = new ArrayList<>(haplotypeCigars.length); - final String reference = this.reference; - for (final String cigar : haplotypeCigars) { - if (cigar.matches("^Civar:.*$")) { - stringRepresentation = cigar.substring(6); - result.addAll(expandAllHaplotypeCombinations(cigar.substring(6), reference)); - } else if (cigar.matches("^.*\\d+.*$")) { - result.add(cigarToHaplotype(reference, cigar, 0, true)); - } else { - final Haplotype h = new Haplotype(cigar.getBytes()); - h.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); - result.add(h); - } - } - haplotypeList = result; - } - return haplotypeList; - } - - - protected SAMSequenceDictionary artificialSAMSequenceDictionary() { - return new SAMSequenceDictionary(Collections.singletonList(new SAMSequenceRecord("00",reference.length()))); - } - - protected SAMFileHeader artificialSAMFileHeader() { - return ArtificialSAMUtils.createArtificialSamHeader(artificialSAMSequenceDictionary()); - } - - public List readList() { - if (readList == null) { - final SAMFileHeader header = artificialSAMFileHeader(); - readList = new ArrayList<>(readCigars.length); - final List haplotypes = haplotypesStrings(); - int count = 0; - for (final String descr : readCigars) { - String sequence; - if (descr.matches("^\\d+:\\d+:.+$")) { - final String[] parts = descr.split(":"); - int allele = Integer.valueOf(parts[0]); - int offset = Integer.valueOf(parts[1]); - final String cigar = parts[2]; - final String base = allele == 0 ? 
reference : haplotypes.get(allele - 1); - sequence = applyCigar(base, cigar, offset, false); - final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header, "read_" + count, 0, 1, sequence.getBytes(), Arrays.copyOf(bq, sequence.length())); - readList.add(new MyGATKSAMRecord(samRecord)); - } else if (descr.matches("^\\*:\\d+:\\d+$")) { - int readCount = Integer.valueOf(descr.split(":")[1]); - int readLength = Integer.valueOf(descr.split(":")[2]); - readList.addAll(generateSamRecords(haplotypes, readCount, readLength, header, count)); - } else { - sequence = descr; - final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header, "read_" + count, 0, 1, sequence.getBytes(), Arrays.copyOf(bq, sequence.length())); - readList.add(new MyGATKSAMRecord(samRecord)); - } - count = readList.size(); - } - } - return readList; - } - - public List> readEventOffsetList() { - if (haplotypeCigars.length != 1 || !haplotypeCigars[0].startsWith("Civar:")) - throw new UnsupportedOperationException(); - if (readEventOffsetList == null) { - final Civar civar = Civar.fromCharSequence(haplotypeCigars[0].substring(6)); - final List unrolledCivars = civar.optionalizeAll().unroll(); - - readEventOffsetList = new ArrayList<>(readCigars.length); - int count = 0; - for (final String descr : readCigars) { - if (descr.matches("^\\d+:\\d+:.+$")) { - throw new UnsupportedOperationException(); - } else if (descr.matches("^\\*:\\d+:\\d+$")) { - int readCount = Integer.valueOf(descr.split(":")[1]); - int readLength = Integer.valueOf(descr.split(":")[2]); - readEventOffsetList.addAll(generateElementOffsetRecords(haplotypesStrings(), unrolledCivars, readCount, readLength, count)); - } else { - throw new UnsupportedOperationException(); - } - count = readEventOffsetList.size(); - } - readEventOffsetList = Collections.unmodifiableList(readEventOffsetList); - } - return readEventOffsetList; - } - - - - - @SuppressWarnings("unused") - public String cigarToSequence(final 
String cigar) { - String reference = this.reference; - return applyCigar(reference, cigar,0,true); - } - - @SuppressWarnings("unused") - public GATKSAMRecord readFromString(final String readSequence) { - if (readBySequence == null) { - final List readList = readList(); - readBySequence = new HashMap<>(readList.size()); - for (final GATKSAMRecord r : readList) - readBySequence.put(r.getReadString(),r); - } - return readBySequence.get(readSequence); - } - - public List unrolledCivars() { - if (haplotypeCigars.length != 1 || !haplotypeCigars[0].startsWith("Civar:")) - throw new UnsupportedOperationException(); - final Civar civar = Civar.fromCharSequence(haplotypeCigars[0].substring(6)); - return civar.optionalizeAll().unroll(); - } - - public void introduceErrors(final Random rnd) { - final List reads = readList(); - final ArrayList result = new ArrayList<>(reads.size()); - for (final GATKSAMRecord read : reads) { - result.add(new MyGATKSAMRecord(read,rnd)); - } - readList = result; - } - - private class MyGATKSAMRecord extends GATKSAMRecord { - protected MyGATKSAMRecord(final GATKSAMRecord r) { - super(r.getHeader(), r.getReferenceIndex(), r.getAlignmentStart(), (short) r.getReadNameLength(), - (short) 100, -1, r.getCigarLength(), r.getFlags(), r.getReadLength(), - r.getMateReferenceIndex(), r.getMateAlignmentStart(), r.getInferredInsertSize(), - new byte[0]); - this.setReadBases(r.getReadBases()); - this.setBaseQualities(r.getBaseQualities()); - this.setReadName(r.getReadName()); - } - - ExponentialDistribution indelLengthDist = MathUtils.exponentialDistribution(1.0 / 0.9); - - public MyGATKSAMRecord(final GATKSAMRecord r, final Random rnd) { - super(r.getHeader(), r.getReferenceIndex(), r.getAlignmentStart(), (short) r.getReadNameLength(), - (short) 100, -1, r.getCigarLength(), r.getFlags(), r.getReadLength(), - r.getMateReferenceIndex(), r.getMateAlignmentStart(), r.getInferredInsertSize(), - new byte[0]); - final byte[] bases = new byte[r.getReadBases().length]; 
- - final byte[] readBases = r.getReadBases(); - final byte[] bq = r.getBaseQualities(); - final byte[] iq = r.getBaseInsertionQualities(); - final byte[] dq = r.getBaseDeletionQualities(); - int refOffset = r.getAlignmentStart() - 1; - int readOffset = 0; - for (int i = 0; i < r.getReadBases().length;) { - double p = rnd.nextDouble(); - double iqp = QualityUtils.qualToErrorProb(iq[i]); - if (p < iqp) { // insertion - final int length = Math.min(generateIndelLength(rnd),r.getReadBases().length - i); - final int refStart = rnd.nextInt(reference.length() - length); - System.arraycopy(referenceBytes,refStart,bases,i,length); - i += length; - continue; - } - p -= iqp; - double dqp = QualityUtils.qualToErrorProb(dq[i]); - if (p < dqp) { - final int length = generateIndelLength(rnd); - refOffset += length; - refOffset = refOffset % referenceBytes.length; - readOffset += length; - continue; - } - p -= dqp; - double bqp = QualityUtils.qualToErrorProb(bq[i]); - byte b = readOffset < readBases.length ? 
readBases[readOffset] : referenceBytes[refOffset]; - byte nb; - if (p < bqp) { - switch (b) { - case 'A': nb = 'C'; break; - case 'T': nb = 'A'; break; - case 'C': nb = 'G'; break; - case 'G': nb = 'B'; break; - default: nb = 'A'; - } - } else - nb = b; - - bases[i++] = nb; - refOffset++; - refOffset = refOffset % referenceBytes.length; - readOffset++; - } - this.setReadBases(bases); - this.setBaseQualities(r.getBaseQualities()); - this.setReadName(r.getReadName()); - - - } - - private int generateIndelLength(final Random rnd) { - final int length; - try { - length = (int) Math.round(indelLengthDist.inverseCumulativeProbability(rnd.nextDouble()) + 1); - } catch (Exception e) { - throw new RuntimeException(e); - } - return length; - } - - @Override - public byte[] getBaseDeletionQualities() { - return Arrays.copyOf(dq,getReadLength()); - } - - @Override - public byte[] getBaseInsertionQualities() { - return Arrays.copyOf(iq,getReadLength()); - } - - @Override - public int getMappingQuality() { - return 100; - } - - @Override - public int hashCode() { - return getReadName().hashCode(); - } - - @Override - public boolean equals(Object o) { - if (o instanceof GATKSAMRecord) { - return getReadName().equals(((GATKSAMRecord)o).getReadName()); - } else { - return false; - } - } - - public String toString() { - return super.toString() + " " + this.getReadString(); - } - } - - - public List readStrings() { - final List result = new ArrayList<>(readCigars.length); - final List haplotypes = haplotypesStrings(); - for (final String descr : readCigars) { - String sequence; - if (descr.matches("^\\d+:\\d+:.+$")) { - final String[] parts = descr.split(":"); - int allele = Integer.valueOf(parts[0]); - int offset = Integer.valueOf(parts[1]); - final String cigar = parts[2]; - final String base = allele == 0 ? 
reference : haplotypes.get(allele - 1); - sequence = applyCigar(base, cigar, offset, false); - result.add(sequence); - } else if (descr.matches("\\*:^\\d+:\\d+")) { - int readCount = Integer.valueOf(descr.split(":")[1]); - int readLength = Integer.valueOf(descr.split(":")[2]); - result.addAll(generateReads(haplotypes, readCount, readLength)); - } else { - sequence = descr; - result.add(sequence); - } - } - return result; - } - - private List generateReads(final List haplotypes, final int readCount, final int readLength) { - final List result = new ArrayList<>(readCount); - for (int i = 0; i < readCount; i++) { - int hi = i % haplotypes.size(); - final String h = haplotypes.get(hi); - int offset = i % h.length() - readLength; - result.add(h.substring(offset,offset + readLength)); - } - return result; - } - - private List generateSamRecords(final List haplotypes, final int readCount, final int readLength, final SAMFileHeader header, final int idStart) { - int id = idStart; - final List result = new ArrayList<>(readCount); - for (int i = 0; i < readCount; i++) { - int hi = i % haplotypes.size(); - final String h = haplotypes.get(hi); - int offset = h.length() <= readLength ? 0 : i % (h.length() - readLength); - int to = Math.min(h.length(),offset + readLength); - byte[] bases = h.substring(offset,to).getBytes(); - byte[] quals = Arrays.copyOf(bq,to - offset); - final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header,"read_" + id++,0,offset + 1,bases, quals); - result.add(new MyGATKSAMRecord(samRecord)); - } - return result; - } - - - private List> generateElementOffsetRecords(final List haplotypes, final List unrolledCivars, final int readCount, final int readLength, final int count) { - - final List> result = new ArrayList<>(readCount); - for (int i = 0; i < readCount; i++) { - int hi = i % unrolledCivars.size(); - final Civar c = unrolledCivars.get(hi); - final String h = haplotypes.get(hi); - int offset = h.length() <= readLength ? 
0 : i % (h.length() - readLength); - int to = Math.min(h.length(),offset + readLength); - result.add(c.eventOffsets(reference,offset,to)); - } - return result; - } - - private static final Pattern cigarPattern = Pattern.compile("(\\d+)([=A-Z])"); - - - private Haplotype cigarToHaplotype(final String reference, final String cigar, final int offset, final boolean global) { - final String sequence = applyCigar(reference,cigar,offset,global); - final Haplotype haplotype = new Haplotype(sequence.getBytes(),reference.equals(sequence)); - haplotype.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); - haplotype.setCigar(Civar.fromCharSequence(cigar).toCigar(reference.length())); - return haplotype; - } - - private String applyCigar(final String reference, final String cigar, final int offset, final boolean global) { - final Matcher pm = cigarPattern.matcher(cigar); - StringBuffer sb = new StringBuffer(); - int index = offset; - while (pm.find()) { - int length = Integer.valueOf(pm.group(1)); - char operator = pm.group(2).charAt(0); - switch (operator) { - case '=' : - try { - sb.append(reference.substring(index, index + length)); - } catch (Exception e) { - throw new RuntimeException(" " + index + " " + (index + length) + " " + reference.length() + " " + cigar,e); - } - index += length; break; - case 'D' : - index += length; break; - case 'I' : - String insert = cigar.substring(pm.end(),pm.end() + length).toUpperCase(); - sb.append(insert); break; - case 'V' : - sb.append(transversionV(reference.charAt(index))); index++; break; - case 'W' : - sb.append(transversionW(reference.charAt(index))); index++; break; - case 'T' : - sb.append(transition(reference.charAt(index))); index++; break; - default: - throw new UnsupportedOperationException("cigar operator " + operator + " not supported."); - } - } - if (global && index != reference.length()) { - throw new RuntimeException(" haplotype cigar does not explain reference length (" + index + " != " + 
reference.length() + ") on cigar " + cigar); - } else if (index > reference.length()) { - throw new RuntimeException(" index beyond end "); - } - return sb.toString(); - } - - protected int kmerSize() { - return kmerSize; - } - - private char transversionV(final char c) { - switch (Character.toUpperCase(c)) { - case 'A': return 'C'; - case 'G': return 'T'; - case 'C': return 'A'; - case 'T': return 'G'; - default: - return c; - } - - } - - private char transversionW(final char c) { - switch (Character.toUpperCase(c)) { - case 'A': return 'T'; - case 'G': return 'C'; - case 'T': return 'A'; - case 'C': return 'G'; - default: - return c; - } - - } - - private char transition(final char c) { - switch (Character.toUpperCase(c)) { - case 'A': return 'G'; - case 'G': return 'A'; - case 'T': return 'C'; - case 'C': return 'T'; - default: - return c; - } - - } -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/ContextCovariateUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/recalibration/ContextCovariateUnitTest.java deleted file mode 100644 index 2d3d680df..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/ContextCovariateUnitTest.java +++ /dev/null @@ -1,112 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.recalibration; - -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.recalibration.covariates.ContextCovariate; -import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; -import org.broadinstitute.sting.utils.clipping.ClippingRepresentation; -import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -/** - * @author Mauricio Carneiro - * @since 3/1/12 - */ -public class ContextCovariateUnitTest { - ContextCovariate covariate; - RecalibrationArgumentCollection RAC; - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - covariate = new ContextCovariate(); - covariate.initialize(RAC); - - } - - @Test(enabled = true) - public void testSimpleContexts() { - GATKSAMRecord read = ReadUtils.createRandomRead(1000); - GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); 
- covariate.recordValues(read, readCovariates); - - verifyCovariateArray(readCovariates.getMismatchesKeySet(), RAC.MISMATCHES_CONTEXT_SIZE, clippedRead, covariate); - verifyCovariateArray(readCovariates.getInsertionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); - verifyCovariateArray(readCovariates.getDeletionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); - } - - public static void verifyCovariateArray(int[][] values, int contextSize, GATKSAMRecord read, Covariate contextCovariate) { - for (int i = 0; i < values.length; i++) - Assert.assertEquals(contextCovariate.formatKey(values[i][0]), expectedContext(read, i, contextSize)); - - } - - public static String expectedContext (GATKSAMRecord read, int offset, int contextSize) { - final String bases = stringFrom(read.getReadBases()); - String expectedContext = null; - if (offset - contextSize + 1 >= 0) { - String context = bases.substring(offset - contextSize + 1, offset + 1); - if (!context.contains("N")) - expectedContext = context; - } - return expectedContext; - } - - private static String stringFrom(byte[] array) { - String s = ""; - for (byte value : array) - s += (char) value; - return s; - } - -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java deleted file mode 100644 index ce827065b..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java +++ /dev/null @@ -1,130 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.recalibration; - -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.recalibration.covariates.CycleCovariate; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -/** - * @author Mauricio Carneiro - * @since 3/1/12 - */ -public class CycleCovariateUnitTest { - CycleCovariate covariate; - RecalibrationArgumentCollection RAC; - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - covariate = new CycleCovariate(); - covariate.initialize(RAC); - } - - @Test(enabled = true) - public void testSimpleCycles() { - short readLength = 10; - GATKSAMRecord read = ReadUtils.createRandomRead(readLength); - read.setReadPairedFlag(true); - read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); - read.getReadGroup().setPlatform("illumina"); - - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, 
readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), 1, (short) 1); - - read.setReadNegativeStrandFlag(true); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), readLength, -1); - - read.setSecondOfPairFlag(true); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), -readLength, 1); - - read.setReadNegativeStrandFlag(false); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), -1, -1); - } - - private void verifyCovariateArray(int[][] values, int init, int increment) { - for (short i = 0; i < values.length; i++) { - short actual = Short.decode(covariate.formatKey(values[i][0])); - int expected = init + (increment * i); - Assert.assertEquals(actual, expected); - } - } - - @Test(enabled = true, expectedExceptions={UserException.class}) - public void testMoreThanMaxCycleFails() { - int readLength = RAC.MAXIMUM_CYCLE_VALUE + 1; - GATKSAMRecord read = ReadUtils.createRandomRead(readLength); - read.setReadPairedFlag(true); - read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); - read.getReadGroup().setPlatform("illumina"); - - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, readCovariates); - } - - @Test(enabled = true) - public void testMaxCyclePasses() { - int readLength = RAC.MAXIMUM_CYCLE_VALUE; - GATKSAMRecord read = ReadUtils.createRandomRead(readLength); - read.setReadPairedFlag(true); - read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); - read.getReadGroup().setPlatform("illumina"); - - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, readCovariates); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/ReadCovariatesUnitTest.java 
b/protected/java/test/org/broadinstitute/sting/utils/recalibration/ReadCovariatesUnitTest.java deleted file mode 100644 index f20d6116b..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/ReadCovariatesUnitTest.java +++ /dev/null @@ -1,137 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.utils.recalibration; - -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.recalibration.covariates.*; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.Random; - -/** - * @author carneiro - * @since 4/21/12 - */ -public class ReadCovariatesUnitTest { - - @Test(enabled = false) - public void testCovariateGeneration() { - final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - final String RGID = "id"; - - ReadGroupCovariate rgCov = new ReadGroupCovariate(); - QualityScoreCovariate qsCov = new QualityScoreCovariate(); - ContextCovariate coCov = new ContextCovariate(); - CycleCovariate cyCov = new CycleCovariate(); - - rgCov.initialize(RAC); - qsCov.initialize(RAC); - coCov.initialize(RAC); - cyCov.initialize(RAC); - - Covariate[] requestedCovariates = new Covariate[4]; - requestedCovariates[0] = rgCov; - requestedCovariates[1] = qsCov; - requestedCovariates[2] = coCov; - requestedCovariates[3] = cyCov; - - final int NUM_READS = 100; - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - - final String[] readGroups = {"RG1", "RG2", "RGbla"}; - for (int idx = 0; idx < NUM_READS; idx++) { - for (final String rgs : readGroups) { - final int length = 10 + rnd.nextInt(100); // random read length, at least 10 bp long - final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(rgs); - rg.setPlatform("illumina"); - read.setReadGroup(rg); - read.setReadNegativeStrandFlag(rnd.nextBoolean()); - final byte[] mQuals = read.getBaseQualities(EventType.BASE_SUBSTITUTION); - final byte[] iQuals 
= read.getBaseQualities(EventType.BASE_INSERTION); - final byte[] dQuals = read.getBaseQualities(EventType.BASE_DELETION); - ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); - - // check that the length is correct - Assert.assertEquals(rc.getMismatchesKeySet().length, length); - Assert.assertEquals(rc.getInsertionsKeySet().length, length); - Assert.assertEquals(rc.getDeletionsKeySet().length, length); - - for (int i = 0; i < length; i++) { - // check that read group is always the same - Assert.assertEquals(rgCov.formatKey(rc.getMismatchesKeySet(i)[0]), rgs); - Assert.assertEquals(rgCov.formatKey(rc.getInsertionsKeySet(i)[0]), rgs); - Assert.assertEquals(rgCov.formatKey(rc.getDeletionsKeySet(i)[0]), rgs); - - // check quality score - Assert.assertEquals(qsCov.formatKey(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]); - Assert.assertEquals(qsCov.formatKey(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]); - Assert.assertEquals(qsCov.formatKey(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]); - - // check context - Assert.assertEquals(coCov.formatKey(rc.getMismatchesKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.MISMATCHES_CONTEXT_SIZE)); - Assert.assertEquals(coCov.formatKey(rc.getInsertionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); - Assert.assertEquals(coCov.formatKey(rc.getDeletionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); - - // check cycle - Assert.assertEquals(cyCov.formatKey(rc.getMismatchesKeySet(i)[3]), "" + (i+1)); - Assert.assertEquals(cyCov.formatKey(rc.getInsertionsKeySet(i)[3]), "" + (i+1)); - Assert.assertEquals(cyCov.formatKey(rc.getDeletionsKeySet(i)[3]), "" + (i+1)); - } - - } - - } - - } - -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/ReadGroupCovariateUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/recalibration/ReadGroupCovariateUnitTest.java deleted 
file mode 100644 index 0b2df6369..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/ReadGroupCovariateUnitTest.java +++ /dev/null @@ -1,115 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. 
ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.recalibration; - -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.recalibration.covariates.ReadGroupCovariate; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -/** - * @author Mauricio Carneiro - * @since 3/1/12 - */ -public class ReadGroupCovariateUnitTest { - ReadGroupCovariate covariate; - RecalibrationArgumentCollection RAC; - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - covariate = new ReadGroupCovariate(); - covariate.initialize(RAC); - } - - @Test(enabled = true) - public void testSingleRecord() { - final String expected = "SAMPLE.1"; - GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID"); - rg.setPlatformUnit(expected); - runTest(rg, expected, 
covariate); - } - - @Test(enabled = true) - public void testMissingPlatformUnit() { - final String expected = "MY.7"; - GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(expected); - runTest(rg, expected, covariate); - } - - @Test(enabled = true) - public void testForceReadgroup() { - final RecalibrationArgumentCollection forcedRAC = new RecalibrationArgumentCollection(); - forcedRAC.FORCE_READGROUP = "FOO"; - final ReadGroupCovariate forcedCovariate = new ReadGroupCovariate(); - forcedCovariate.initialize(forcedRAC); - - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("NOT_FOO"); - runTest(rg, "FOO", forcedCovariate); - } - - private static void runTest(final GATKSAMReadGroupRecord rg, final String expected, final ReadGroupCovariate covariate) { - GATKSAMRecord read = ReadUtils.createRandomRead(10); - read.setReadGroup(rg); - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), expected, covariate); - - } - - private static void verifyCovariateArray(final int[][] values, final String expected, final ReadGroupCovariate covariate) { - for (int[] value : values) { - String actual = covariate.formatKey(value[0]); - Assert.assertEquals(actual, expected); - } - } - -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java deleted file mode 100644 index 7d1e51385..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java +++ /dev/null @@ -1,165 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.recalibration; - -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.recalibration.covariates.*; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * @author carneiro - * @since 4/21/12 - */ -public class RecalibrationReportUnitTest { - private static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { - final Random random = new Random(); - final int nObservations = random.nextInt(maxObservations); - final int nErrors = Math.min(random.nextInt(maxErrors), nObservations); - final int qual = random.nextInt(QualityUtils.MAX_SAM_QUAL_SCORE); - return new RecalDatum((long)nObservations, (double)nErrors, (byte)qual); - } - - @Test(enabled = true) - public void testOutput() { - final int length = 100; - - List quals = new ArrayList(QualityUtils.MAX_SAM_QUAL_SCORE + 1); - List counts = new 
ArrayList(QualityUtils.MAX_SAM_QUAL_SCORE + 1); - - for (int i = 0; i<= QualityUtils.MAX_SAM_QUAL_SCORE; i++) { - quals.add((byte) i); - counts.add(1L); - } - - final QuantizationInfo quantizationInfo = new QuantizationInfo(quals, counts); - final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - - quantizationInfo.noQuantization(); - final List requiredCovariates = new LinkedList(); - final List optionalCovariates = new LinkedList(); - - final ReadGroupCovariate rgCovariate = new ReadGroupCovariate(); - rgCovariate.initialize(RAC); - requiredCovariates.add(rgCovariate); - - final QualityScoreCovariate qsCovariate = new QualityScoreCovariate(); - qsCovariate.initialize(RAC); - requiredCovariates.add(qsCovariate); - - final ContextCovariate cxCovariate = new ContextCovariate(); - cxCovariate.initialize(RAC); - optionalCovariates.add(cxCovariate); - final CycleCovariate cyCovariate = new CycleCovariate(); - cyCovariate.initialize(RAC); - optionalCovariates.add(cyCovariate); - - final Covariate[] requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; - int covariateIndex = 0; - for (final Covariate cov : requiredCovariates) - requestedCovariates[covariateIndex++] = cov; - for (final Covariate cov : optionalCovariates) - requestedCovariates[covariateIndex++] = cov; - - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("id"); - rg.setPlatform("illumina"); - final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); - read.setReadGroup(rg); - final byte [] readQuals = new byte[length]; - for (int i = 0; i < length; i++) - readQuals[i] = 20; - read.setBaseQualities(readQuals); - - final int expectedKeys = expectedNumberOfKeys(length, RAC.INDELS_CONTEXT_SIZE, RAC.MISMATCHES_CONTEXT_SIZE); - int nKeys = 0; // keep track of how many keys were produced - final ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); - - final RecalibrationTables recalibrationTables = 
new RecalibrationTables(requestedCovariates); - final NestedIntegerArray rgTable = recalibrationTables.getReadGroupTable(); - final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); - - for (int offset = 0; offset < length; offset++) { - - for (EventType errorMode : EventType.values()) { - - final int[] covariates = rc.getKeySet(offset, errorMode); - final int randomMax = errorMode == EventType.BASE_SUBSTITUTION ? 10000 : 100000; - - rgTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.ordinal()); - qualTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.ordinal()); - nKeys += 2; - for (int j = 0; j < optionalCovariates.size(); j++) { - final NestedIntegerArray covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j); - final int covValue = covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j]; - if ( covValue >= 0 ) { - covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], covValue, errorMode.ordinal()); - nKeys++; - } - } - } - } - Assert.assertEquals(nKeys, expectedKeys); - } - - private static int expectedNumberOfKeys (int readLength, int indelContextSize, int mismatchesContextSize) { - final int numCovariates = 4; - final int numTables = 3; - final int mismatchContextPadding = mismatchesContextSize - 1; - final int indelContextPadding = 2 * (indelContextSize - 1); - final int indelCyclePadding = 2 * (2 * CycleCovariate.CUSHION_FOR_INDELS); - - return (numCovariates * numTables * readLength) - mismatchContextPadding - indelContextPadding - indelCyclePadding; - } - -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java deleted file mode 100644 index ea70deeea..000000000 --- 
a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java +++ /dev/null @@ -1,239 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
-* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. 
ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.recalibration; - -import com.google.java.contract.Requires; -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.recalibration.covariates.*; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Random; - -public class RepeatCovariatesUnitTest { - - RepeatLengthCovariate rlCovariate; - RepeatUnitCovariate ruCovariate; - RepeatUnitAndLengthCovariate rurlCovariate; - RecalibrationArgumentCollection RAC; - - - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - rlCovariate = new 
RepeatLengthCovariate(); - ruCovariate = new RepeatUnitCovariate(); - rurlCovariate = new RepeatUnitAndLengthCovariate(); - rlCovariate.initialize(RAC); - ruCovariate.initialize(RAC); - rurlCovariate.initialize(RAC); - } - - - @Test(enabled = true) - public void testFindNumberOfRepetitions() { - // First, test logic to compute number of repetitions of a substring on a given string. - int result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACAC".getBytes(), true); - Assert.assertEquals(2,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACAC".getBytes(), true); - Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), true); - Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), true); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), true); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), true); - Assert.assertEquals(1,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), true); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), true); - Assert.assertEquals(0,result); - // Same tests but looking backward on string - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACAC".getBytes(), false); - Assert.assertEquals(2,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACAC".getBytes(), false); - Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), false); - 
Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), false); - Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), false); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), false); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), false); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), false); - Assert.assertEquals(3,result); - - // test logic to get repeat unit and number of repeats from covariate value - final String[] repUnits = new String[]{"AG","CCG","TCCA","T"}; - for (String ru : repUnits) { - for (int k=1; k < 10; k++) { - Pair pair = RepeatLengthCovariate.getRUandNRfromCovariate(String.format("%s%d",ru,k)); - Assert.assertEquals(pair.second.intValue(),k); - Assert.assertEquals(pair.first,ru); - } - } - - } - - /** - * Build synthetic reads with random content made up of tandem repeats, record computed Repeat Unit and # repeats and see if - * they match with read context - */ - @Test(enabled = true) - public void testManyObservations() { - final int NUM_UNITS = 10; - final int MAX_REPEAT_UNIT_LENGTH = RAC.MAX_STR_UNIT_LENGTH; - final int MAX_NUM_REPETITIONS = RAC.MAX_REPEAT_LENGTH; - final int NUM_TEST_CASES = 100; - - Random random = new Random(); - - for (int r = 0; r < NUM_TEST_CASES; r++) { - final StringBuilder sb = new StringBuilder(); - // for each unit, generate a repeat unit at random with given random length - final ArrayList repeatUnits = new ArrayList(); - final ArrayList numsRepetitions = new ArrayList(); - for (int n=0; n < NUM_UNITS; n++) { - final int repLength = 
1+random.nextInt(MAX_REPEAT_UNIT_LENGTH); - final String repeatUnit = getRandomBases(repLength); - final int numRepetitions = 1+random.nextInt(MAX_NUM_REPETITIONS); - - // log for comparison with covariate - numsRepetitions.add(numRepetitions); - repeatUnits.add(repeatUnit); - - for (int k=0; k < numRepetitions; k++) - sb.append(repeatUnit); - - } - - final String readBases = sb.toString(); - System.out.println(readBases); - final int readLength = readBases.length(); - - final byte[] readQuals = new byte[readLength]; - Arrays.fill(readQuals,(byte)30); - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(),readQuals,readLength+"M"); - - Covariate[] requestedCovariates = new Covariate[3]; - requestedCovariates[0] = rlCovariate; - requestedCovariates[1] = ruCovariate; - requestedCovariates[2] = rurlCovariate; - ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); - - // check that the length is correct - Assert.assertEquals(rc.getMismatchesKeySet().length, readLength); - Assert.assertEquals(rc.getInsertionsKeySet().length, readLength); - Assert.assertEquals(rc.getDeletionsKeySet().length, readLength); - - for (int offset = 0; offset < readBases.length(); offset++) { // recalibrate all bases in the read - // check RepeatLength - final String rlValM = rlCovariate.formatKey(rc.getMismatchesKeySet(offset)[0]); - final String rlValI = rlCovariate.formatKey(rc.getInsertionsKeySet(offset)[0]); - final String rlValD = rlCovariate.formatKey(rc.getDeletionsKeySet(offset)[0]); - // check RepeatUnit - final String ruValM = ruCovariate.formatKey(rc.getMismatchesKeySet(offset)[1]); - final String ruValI = ruCovariate.formatKey(rc.getInsertionsKeySet(offset)[1]); - final String ruValD = ruCovariate.formatKey(rc.getDeletionsKeySet(offset)[1]); - // check RepeatUnitAndLength - final String rurlValM = rurlCovariate.formatKey(rc.getMismatchesKeySet(offset)[2]); - final String rurlValI = 
rurlCovariate.formatKey(rc.getInsertionsKeySet(offset)[2]); - final String rurlValD = rurlCovariate.formatKey(rc.getDeletionsKeySet(offset)[2]); - // check all 3 values are identical - Assert.assertEquals(rlValD,rlValI); - Assert.assertEquals(rlValM,rlValI); - Assert.assertEquals(ruValD,ruValI); - Assert.assertEquals(ruValM,ruValI); - Assert.assertEquals(rurlValD,rurlValI); - Assert.assertEquals(rurlValM,rurlValI); - - - int fw = GATKVariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(offset+1,readLength).getBytes(),true); - int bw = GATKVariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(0,offset+1).getBytes(),false); - Assert.assertEquals(Math.min(fw+bw,RAC.MAX_REPEAT_LENGTH),(int)Integer.valueOf(rlValM)); - } - - } - - - - - - - } - - /** - * Returns random bases of given length - * @param length required length - * @return given random string - */ - @Requires("length > 0") - String getRandomBases(final int length) { - byte[] bases = new byte[length]; - Random ran = new Random(); - for (int i=0; i < length; i++ ) { - int idx = ran.nextInt(4); - bases[i] = BaseUtils.baseIndexToSimpleBase(idx); - } - return new String(bases); - } - - -} diff --git a/protected/pom.xml b/protected/pom.xml new file mode 100644 index 000000000..8a9646438 --- /dev/null +++ b/protected/pom.xml @@ -0,0 +1,24 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-root + 3.0 + ../public/sting-root + + + sting-protected + pom + Sting Protected + + + gatk-protected + + + + ${project.basedir}/.. 
+ + + diff --git a/protected/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala b/protected/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala deleted file mode 100644 index fdbd7ca1f..000000000 --- a/protected/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala +++ /dev/null @@ -1,117 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.queue.pipeline.examples - -import org.testng.annotations.{DataProvider, Test} -import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} -import org.broadinstitute.sting.BaseTest - -class ExampleUnifiedGenotyperPipelineTest { - @Test(timeOut=36000000) - def testUnifiedGenotyper() { - val spec = new PipelineTestSpec - spec.name = "unifiedgenotyper" - spec.args = Array( - " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", - " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta", - " -I " + BaseTest.publicTestDir + "exampleBAM.bam", - " -filter QD", - " -filterExpression 'QD < 2.0'").mkString - spec.jobRunners = PipelineTest.allJobRunners - PipelineTest.executeTest(spec) - } - - @DataProvider(name = "ugIntervals") - def getUnifiedGenotyperIntervals = - Array( - Array("gatk_intervals", BaseTest.validationDataLocation + "intervalTest.intervals"), - Array("bed_intervals", BaseTest.validationDataLocation + "intervalTest.bed"), - Array("vcf_intervals", BaseTest.validationDataLocation + "intervalTest.1.vcf") - ).asInstanceOf[Array[Array[Object]]] - - @Test(dataProvider = "ugIntervals", timeOut=36000000) - def testUnifiedGenotyperWithIntervals(intervalsName: String, intervalsPath: String) { - val spec = new PipelineTestSpec - spec.name = "unifiedgenotyper_with_" + intervalsName - spec.args = Array( - " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", - " -I " + BaseTest.validationDataLocation + "OV-0930.normal.chunk.bam", - " -R " + BaseTest.hg18Reference, - " -L " + intervalsPath).mkString - spec.jobRunners = Seq("Lsf706") - PipelineTest.executeTest(spec) - } - - @Test(timeOut=36000000) - def testUnifiedGenotyperNoGCOpt() { - val spec = new PipelineTestSpec - spec.name = "unifiedgenotyper_no_gc_opt" - spec.args = Array( - " -S 
public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", - " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta", - " -I " + BaseTest.publicTestDir + "exampleBAM.bam", - " -noGCOpt").mkString - spec.jobRunners = PipelineTest.allJobRunners - PipelineTest.executeTest(spec) - } - - @DataProvider(name="resMemReqParams") - def getResMemReqParam = Array(Array("mem_free"), Array("virtual_free")).asInstanceOf[Array[Array[Object]]] - - @Test(dataProvider = "resMemReqParams", timeOut=36000000) - def testUnifiedGenotyperResMemReqParam(reqParam: String) { - val spec = new PipelineTestSpec - spec.name = "unifiedgenotyper_" + reqParam - spec.args = Array( - " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", - " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta", - " -I " + BaseTest.publicTestDir + "exampleBAM.bam", - " -resMemReqParam " + reqParam).mkString - spec.jobRunners = Seq("GridEngine") - PipelineTest.executeTest(spec) - } -} diff --git a/public/external-example/pom.xml b/public/external-example/pom.xml new file mode 100644 index 000000000..9c05867a8 --- /dev/null +++ b/public/external-example/pom.xml @@ -0,0 +1,267 @@ + + 4.0.0 + + + org.mycompany.app + external-example + jar + 1.0-SNAPSHOT + GATK External Example + + + 3.0 + + ../.. 
+ UTF-8 + UTF-8 + yyyy/MM/dd HH:mm:ss + + + true + ${sting.committests.skipped} + ${sting.committests.skipped} + + + package + + + + + sting.public.repo.local + Sting Public Local Repository + file:${sting.basedir}/public/repo + + + + + + org.broadinstitute.sting + gatk-framework + ${sting.version} + + + + org.broadinstitute.sting + gatk-framework + ${sting.version} + test-jar + test + + + + org.testng + testng + 6.8 + test + + + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + 2.8 + + + unpack + process-resources + + unpack + + + + + org.broadinstitute.sting + gatk-framework + ${sting.version} + example-resources + tar.bz2 + ${project.build.outputDirectory} + + + + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.9.1 + + + extract-resource-bundle + + javadoc + + prepare-package + + org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet + + ${project.build.outputDirectory} + + org.broadinstitute.sting + + gatk-framework + ${sting.version} + + 2g + false + true + -build-timestamp "${maven.build.timestamp}" -absolute-version "${project.version}" -out ${project.build.outputDirectory}/StingText.properties + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 2.1 + + + ${sting.shade.phase} + + shade + + + true + + + + commons-logging:commons-logging + + ** + + + + org.broad:tribble + + ** + + + + org.broadinstitute:variant + + ** + + + + + + + org.broadinstitute.sting:gsalib:tar.gz:* + org.broadinstitute.sting:*:tar.bz2:example-resources + + + + + + org.broadinstitute.sting.gatk.CommandLineGATK + + + + StingText.properties + + + + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.16 + + + true + false + + + ${java.io.tmpdir} + + + + + + default-test + none + + + unit-tests + + test + + + ${sting.unittests.skipped} + + **/*UnitTest.class + + + + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + 2.16 + + + true + false + + + ${java.io.tmpdir} + + + + + integration-tests + + integration-test + 
verify + + + + ${sting.integrationtests.skipped} + + **/*IntegrationTest.class + + + + + + + + + + + packagetests-enabled + + + sting.packagetests.enabled + true + + + + none + + + + + diff --git a/public/external-example/src/main/java/org/mycompany/app/MyExampleWalker.java b/public/external-example/src/main/java/org/mycompany/app/MyExampleWalker.java new file mode 100644 index 000000000..d65c47c99 --- /dev/null +++ b/public/external-example/src/main/java/org/mycompany/app/MyExampleWalker.java @@ -0,0 +1,56 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.mycompany.app; + +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; + +import java.io.PrintStream; + +/** + * An example walker that looks surprisingly like CountLoci. + */ +public class MyExampleWalker extends LocusWalker { + @Output + PrintStream out; + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return 1; + } + + public Long reduceInit() { return 0l; } + + public Long reduce(Integer value, Long sum) { + return value + sum; + } + + public void onTraversalDone( Long c ) { + out.println(c); + } +} diff --git a/public/external-example/src/test/java/org/mycompany/app/MyExampleWalkerIntegrationTest.java b/public/external-example/src/test/java/org/mycompany/app/MyExampleWalkerIntegrationTest.java new file mode 100644 index 000000000..777079426 --- /dev/null +++ b/public/external-example/src/test/java/org/mycompany/app/MyExampleWalkerIntegrationTest.java @@ -0,0 +1,54 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.mycompany.app; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.Collections; +import java.util.MissingResourceException; + +/** + * NOTE: Currently the testing infrastructure for walkers does not support running outside the Broad. + */ +public class MyExampleWalkerIntegrationTest extends WalkerTest { + @Test + public void testMyExampleWalker() throws URISyntaxException { + String gatk_args = String.format("-T MyExampleWalker -I %s -R %s", getResource("/exampleBAM.bam"), getResource("/exampleFASTA.fasta")); + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, Collections.emptyList()); + executeTest("Testing count on the example bam", spec); + } + + private File getResource(String path) throws URISyntaxException { + URL resourceUrl = getClass().getResource(path); + if (resourceUrl == null) + throw new MissingResourceException("Resource not found: " + path, getClass().getSimpleName(), path); + return new File(resourceUrl.toURI()); + } +} diff --git a/public/external-example/src/test/java/org/mycompany/app/MyExampleWalkerUnitTest.java b/public/external-example/src/test/java/org/mycompany/app/MyExampleWalkerUnitTest.java new file mode 100644 index 000000000..e3e0c81ea --- /dev/null +++ b/public/external-example/src/test/java/org/mycompany/app/MyExampleWalkerUnitTest.java @@ -0,0 +1,41 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* 
Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.mycompany.app; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +/** + * NOTE: Currently the testing infrastructure for walkers does not support running outside the Broad. + */ +public class MyExampleWalkerUnitTest extends BaseTest { + @Test + public void testMyExampleWalker() { + MyExampleWalker walker = new MyExampleWalker(); + Assert.assertEquals((long)walker.reduce(1, 1L), 2L); + } +} diff --git a/public/gatk-framework/pom.xml b/public/gatk-framework/pom.xml new file mode 100644 index 000000000..942e59630 --- /dev/null +++ b/public/gatk-framework/pom.xml @@ -0,0 +1,126 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-aggregator + 3.0 + ../.. + + + gatk-framework + jar + GATK Framework + + + ${project.basedir}/../.. 
+ gatk-package + + + + + ${project.groupId} + sting-utils + ${project.version} + + + + org.testng + testng + test + + + com.google.caliper + caliper + test + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + example-resources + ${sting.generate-resources.phase} + + + + + org.apache.maven.plugins + maven-resources-plugin + + + copy-resource-bundle-log4j + prepare-package + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + extract-resource-bundle + prepare-package + + + + + org.apache.maven.plugins + maven-invoker-plugin + + + package-unittests + + + package-integrationtests + + + package-largescaletests + + + package-knowledgebasetests + + + package-pipelinetests + + + + + + + + + private + + + ${basedir}/../../private/gatk-private/pom.xml + + + + + + + com.pyx4j + maven-junction-plugin + + + link-private-testdata + process-test-resources + + + unlink-private-testdata + clean + + + + + + + + + diff --git a/public/gatk-framework/src/main/assembly/example-resources.xml b/public/gatk-framework/src/main/assembly/example-resources.xml new file mode 100644 index 000000000..b285cc05f --- /dev/null +++ b/public/gatk-framework/src/main/assembly/example-resources.xml @@ -0,0 +1,37 @@ + + example-resources + + tar.bz2 + + false + + + ${project.build.sourceDirectory}/org/broadinstitute/sting/gatk/walkers/qc + . + + Pileup.java + CountLoci.java + CountReads.java + CheckPileup.java + + + + ${project.build.sourceDirectory}/org/broadinstitute/sting/gatk/walkers/readutils + . + + PrintReads.java + + + + src/test/resources + . 
+ + exampleBAM.bam + exampleBAM.bam.bai + exampleFASTA.fasta + exampleFASTA.fasta.fai + exampleFASTA.dict + + + + diff --git a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java b/public/gatk-framework/src/main/java/net/sf/samtools/GATKBAMFileSpan.java similarity index 100% rename from public/java/src/net/sf/samtools/GATKBAMFileSpan.java rename to public/gatk-framework/src/main/java/net/sf/samtools/GATKBAMFileSpan.java diff --git a/public/java/src/net/sf/samtools/GATKBin.java b/public/gatk-framework/src/main/java/net/sf/samtools/GATKBin.java similarity index 100% rename from public/java/src/net/sf/samtools/GATKBin.java rename to public/gatk-framework/src/main/java/net/sf/samtools/GATKBin.java diff --git a/public/java/src/net/sf/samtools/GATKChunk.java b/public/gatk-framework/src/main/java/net/sf/samtools/GATKChunk.java similarity index 100% rename from public/java/src/net/sf/samtools/GATKChunk.java rename to public/gatk-framework/src/main/java/net/sf/samtools/GATKChunk.java diff --git a/public/java/src/net/sf/samtools/PicardNamespaceUtils.java b/public/gatk-framework/src/main/java/net/sf/samtools/PicardNamespaceUtils.java similarity index 100% rename from public/java/src/net/sf/samtools/PicardNamespaceUtils.java rename to public/gatk-framework/src/main/java/net/sf/samtools/PicardNamespaceUtils.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/Aligner.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/Aligner.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/Aligner.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/Aligner.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/Alignment.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/Alignment.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/Alignment.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/Alignment.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/CheckAlignment.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/CheckAlignment.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/CheckAlignment.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/CheckAlignment.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/BWAAligner.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/BWAAligner.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/BWTFiles.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/BWTFiles.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java similarity index 100% rename 
from public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/BWT.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/BWT.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Bases.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/Bases.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Bases.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/Bases.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Counts.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/Counts.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Counts.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/Counts.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SequenceBlock.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/SequenceBlock.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SequenceBlock.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/SequenceBlock.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayReader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayReader.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayReader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayReader.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedInputStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/BasePackedInputStream.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedInputStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/BasePackedInputStream.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedOutputStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/BasePackedOutputStream.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedOutputStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/BasePackedOutputStream.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedInputStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedInputStream.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedInputStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedInputStream.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedOutputStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedOutputStream.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedOutputStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedOutputStream.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/Advanced.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Advanced.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/Advanced.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Advanced.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Argument.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Argument.java new file mode 100644 index 000000000..96731584b --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Argument.java @@ -0,0 +1,125 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.commandline; + +import java.lang.annotation.*; + +/** + * Created by IntelliJ IDEA. + * User: hanna + * Date: Mar 24, 2009 + * Time: 11:11:36 AM + */ +/** + * Annotates fields in objects that should be used as command-line arguments. + * Any field annotated with @Argument can appear as a command-line parameter. + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.FIELD) +public @interface Argument { + /** + * The full name of the command-line argument. Full names should be + * prefixed on the command-line with a double dash (--). + * @return Selected full name, or "" to use the default. + */ + String fullName() default ""; + + /** + * Specified short name of the command. Short names should be prefixed + * with a single dash. Argument values can directly abut single-char + * short names or be separated from them by a space. + * @return Selected short name, or "" for none. + */ + String shortName() default ""; + + /** + * Documentation for the command-line argument. Should appear when the + * --help argument is specified. + * @return Doc string associated with this command-line argument. + */ + String doc() default "Undocumented option"; + + /** + * Is this argument required. If true, the command-line argument system will + * make a best guess for populating this argument based on the type descriptor, + * and will fail if the type can't be populated. + * @return True if the argument is required. False otherwise. + */ + boolean required() default true; + + /** + * Should this command-line argument be exclusive of others. Should be + * a comma-separated list of names of arguments of which this should be + * independent. 
+ * @return A comma-separated string listing other arguments of which this + * argument should be independent. + */ + String exclusiveOf() default ""; + + /** + * Provide a regexp-based validation string. + * @return Non-empty regexp for validation, blank otherwise. + */ + String validation() default ""; + + /** + * Hard lower bound on the allowed value for the annotated argument -- generates an exception if violated. + * Enforced only for numeric types whose values are explicitly specified on the command line. + * + * @return Hard lower bound on the allowed value for the annotated argument, or Double.NEGATIVE_INFINITY + * if there is none. + */ + double minValue() default Double.NEGATIVE_INFINITY; + + /** + * Hard upper bound on the allowed value for the annotated argument -- generates an exception if violated. + * Enforced only for numeric types whose values are explicitly specified on the command line. + * + * @return Hard upper bound on the allowed value for the annotated argument, or Double.POSITIVE_INFINITY + * if there is none. + */ + double maxValue() default Double.POSITIVE_INFINITY; + + /** + * Soft lower bound on the allowed value for the annotated argument -- generates a warning if violated. + * Enforced only for numeric types whose values are explicitly specified on the command line. + * + * @return Soft lower bound on the allowed value for the annotated argument, or Double.NEGATIVE_INFINITY + * if there is none. + */ + double minRecommendedValue() default Double.NEGATIVE_INFINITY; + + /** + * Soft upper bound on the allowed value for the annotated argument -- generates a warning if violated. + * Enforced only for numeric types whose values are explicitly specified on the command line. + * + * @return Soft upper bound on the allowed value for the annotated argument, or Double.POSITIVE_INFINITY + * if there is none. 
+ */ + double maxRecommendedValue() default Double.POSITIVE_INFINITY; +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentCollection.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentCollection.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentCollection.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentCollection.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentDefinition.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentDefinition.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitionGroup.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentDefinitionGroup.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitionGroup.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentDefinitionGroup.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitions.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentDefinitions.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitions.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentDefinitions.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentException.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/commandline/ArgumentException.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentException.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentIOType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentIOType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentIOType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentIOType.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatch.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatch.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchFileValue.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchFileValue.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchFileValue.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchFileValue.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSite.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchSite.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSite.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchSite.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchSource.java similarity index 100% 
rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSource.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchSource.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchStringValue.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchStringValue.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchStringValue.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchStringValue.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchValue.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchValue.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchValue.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchValue.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatches.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatches.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentMatches.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatches.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentSource.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentSource.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java new file mode 100644 index 000000000..9ab317251 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java @@ -0,0 +1,1028 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.commandline; + +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; +import org.broadinstitute.sting.gatk.walkers.Multiplex; +import org.broadinstitute.sting.gatk.walkers.Multiplexer; +import org.broadinstitute.sting.utils.classloader.JVMUtils; +import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.File; +import java.io.IOException; +import java.lang.annotation.Annotation; +import java.lang.reflect.*; +import java.util.*; + +/** + * An descriptor capable of providing parsers that can parse any type + * of supported command-line argument. + * + * @author mhanna + * @version 0.1 + */ +public abstract class ArgumentTypeDescriptor { + private static Class[] ARGUMENT_ANNOTATIONS = {Input.class, Output.class, Argument.class}; + + /** + * our log, which we want to capture anything from org.broadinstitute.sting + */ + protected static final Logger logger = Logger.getLogger(ArgumentTypeDescriptor.class); + + /** + * Fetch the given descriptor from the descriptor repository. + * @param descriptors the descriptors from which to select a good match. + * @param type Class for which to specify a descriptor. + * @return descriptor for the given type. + */ + public static ArgumentTypeDescriptor selectBest( Collection descriptors, Class type ) { + for( ArgumentTypeDescriptor descriptor: descriptors ) { + if( descriptor.supports(type) ) + return descriptor; + } + throw new ReviewedStingException("Can't process command-line arguments of type: " + type.getName()); + } + + /** + * Does this descriptor support classes of the given type? + * @param type The type to check. 
+ * @return true if this descriptor supports the given type, false otherwise. + */ + public abstract boolean supports( Class type ); + + /** + * Returns false if a type-specific default can be employed. + * @param source Source of the command-line argument. + * @return True to throw in a type specific default. False otherwise. + */ + public boolean createsTypeDefault(ArgumentSource source) { return false; } + + /** + * Returns a documentation-friendly value for the default of a type descriptor. + * Must be overridden if createsTypeDefault return true. cannot be called otherwise + * @param source Source of the command-line argument. + * @return Friendly string of the default value, for documentation. If doesn't create a default, throws + * and UnsupportedOperationException + */ + public String typeDefaultDocString(ArgumentSource source) { + throw new UnsupportedOperationException(); + } + + /** + * Generates a default for the given type. + * + * @param parsingEngine the parsing engine used to validate this argument type descriptor. + * @param source Source of the command-line argument. + * @param type Type of value to create, in case the command-line argument system wants influence. + * @return A default value for the given type. + */ + public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { throw new UnsupportedOperationException("Unable to create default for type " + getClass()); } + + /** + * Given the given argument source and attributes, synthesize argument definitions for command-line arguments. + * @param source Source class and field for the given argument. + * @return A list of command-line argument definitions supporting this field. + */ + public List createArgumentDefinitions( ArgumentSource source ) { + return Collections.singletonList(createDefaultArgumentDefinition(source)); + } + + /** + * Parses an argument source to an object. + * WARNING! Mandatory side effect of parsing! 
Each parse routine should register the tags it finds with the proper CommandLineProgram. + * TODO: Fix this, perhaps with an event model indicating that a new argument has been created. + * + * @param parsingEngine The engine responsible for parsing. + * @param source The source used to find the matches. + * @param matches The matches for the source. + * @return The parsed object. + */ + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, ArgumentMatches matches) { + return parse(parsingEngine, source, source.field.getGenericType(), matches); + } + + /** + * Returns true if the field is a collection or an array. + * @param source The argument source to check. + * @return true if the field is a collection or an array. + */ + public boolean isMultiValued( ArgumentSource source ) { + Class argumentType = source.field.getType(); + return Collection.class.isAssignableFrom(argumentType) || argumentType.isArray(); + } + + /** + * By default, argument sources create argument definitions with a set of default values. + * Use this method to create the one simple argument definition. + * @param source argument source for which to create a default definition. + * @return The default definition for this argument source. 
+ */
+ protected ArgumentDefinition createDefaultArgumentDefinition( ArgumentSource source ) {
+ Annotation argumentAnnotation = getArgumentAnnotation(source);
+ return new ArgumentDefinition( ArgumentIOType.getIOType(argumentAnnotation),
+ source.field.getType(),
+ ArgumentDefinition.getFullName(argumentAnnotation, source.field.getName()),
+ ArgumentDefinition.getShortName(argumentAnnotation),
+ ArgumentDefinition.getDoc(argumentAnnotation),
+ source.isRequired() && !createsTypeDefault(source) && !source.isFlag() && !source.isDeprecated(),
+ source.isFlag(),
+ source.isMultiValued(),
+ source.isHidden(),
+ makeRawTypeIfNecessary(getCollectionComponentType(source.field)),
+ ArgumentDefinition.getExclusiveOf(argumentAnnotation),
+ ArgumentDefinition.getValidationRegex(argumentAnnotation),
+ getValidOptions(source) );
+ }
+
+ /**
+ * Return the component type of a field. This base implementation always returns null;
+ * collection-aware subclasses override it to extract the parameterized component type.
+ * @param field The reflected field to inspect.
+ * @return null in this base implementation; overriding implementations return the parameterized component type.
+ */
+ protected Type getCollectionComponentType( Field field ) {
+ return null;
+ }
+
+ /**
+ * Parses the argument matches for a class type into an object.
+ * @param source The original argument source used to find the matches.
+ * @param type The current class type being inspected. May not match the argument source.field.getType() if this is a collection, for example.
+ * @param matches The argument matches for the argument source, or the individual argument match for a scalar if this is being called to help parse a collection.
+ * @return The individual parsed object matching the argument match with Class type. 
+ */ + public abstract Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ); + + /** + * If the argument source only accepts a small set of options, populate the returned list with + * those options. Otherwise, leave the list empty. + * @param source Original field specifying command-line arguments. + * @return A list of valid options. + */ + protected List getValidOptions( ArgumentSource source ) { + if(!source.field.getType().isEnum()) + return null; + List validOptions = new ArrayList(); + for(Object constant: source.field.getType().getEnumConstants()) + validOptions.add(constant.toString()); + return validOptions; + } + + /** + * Returns true if the argument with the given full name exists in the collection of ArgumentMatches. + * @param definition Definition of the argument for which to find matches. + * @param matches The matches for the given argument. + * @return true if the argument is present, or false if not present. + */ + protected boolean argumentIsPresent( ArgumentDefinition definition, ArgumentMatches matches ) { + for( ArgumentMatch match: matches ) { + if( match.definition.equals(definition) ) + return true; + } + return false; + } + + /** + * Gets the value of an argument with the given full name, from the collection of ArgumentMatches. + * If the argument matches multiple values, an exception will be thrown. + * @param definition Definition of the argument for which to find matches. + * @param matches The matches for the given argument. + * @return The value of the argument if available, or null if not present. 
+ */
+ protected ArgumentMatchValue getArgumentValue( ArgumentDefinition definition, ArgumentMatches matches ) {
+ Collection argumentValues = getArgumentValues( definition, matches );
+ if( argumentValues.size() > 1 )
+ throw new UserException.CommandLineException("Multiple values associated with given definition, but this argument expects only one: " + definition.fullName);
+ return argumentValues.size() > 0 ? argumentValues.iterator().next() : null;
+ }
+
+ /**
+ * Gets the tags associated with a given command-line argument.
+ * If multiple matches carry conflicting sets of tags, an exception will be thrown.
+ * @param matches The matches for the given argument.
+ * @return The tags for the argument; an empty Tags object if no match carries tags.
+ */
+ protected Tags getArgumentTags(ArgumentMatches matches) {
+ Tags tags = new Tags();
+ for(ArgumentMatch match: matches) {
+ if(!tags.isEmpty() && !match.tags.isEmpty())
+ throw new ReviewedStingException("BUG: multiple conflicting sets of tags are available, and the type descriptor specifies no way of resolving the conflict.");
+ tags = match.tags;
+ }
+ return tags;
+ }
+
+ /**
+ * Gets the values of an argument with the given full name, from the collection of ArgumentMatches.
+ * @param definition Definition of the argument for which to find matches.
+ * @param matches The matches for the given argument.
+ * @return The value of the argument if available, or an empty collection if not present.
+ */
+ protected Collection getArgumentValues( ArgumentDefinition definition, ArgumentMatches matches ) {
+ Collection values = new ArrayList();
+ for( ArgumentMatch match: matches ) {
+ if( match.definition.equals(definition) )
+ values.addAll(match.values());
+ }
+ return values;
+ }
+
+ /**
+ * Retrieves the argument description from the given argument source. Will throw an exception if
+ * the given ArgumentSource has no argument annotation present.
+ * @param source source of the argument.
+ * @return Argument description annotation associated with the given field. 
+ */ + @SuppressWarnings("unchecked") + protected static Annotation getArgumentAnnotation( ArgumentSource source ) { + for (Class annotation: ARGUMENT_ANNOTATIONS) + if (source.field.isAnnotationPresent(annotation)) + return source.field.getAnnotation(annotation); + throw new ReviewedStingException("ArgumentAnnotation is not present for the argument field: " + source.field.getName()); + } + + /** + * Returns true if an argument annotation is present + * @param field The field to check for an annotation. + * @return True if an argument annotation is present on the field. + */ + @SuppressWarnings("unchecked") + public static boolean isArgumentAnnotationPresent(Field field) { + for (Class annotation: ARGUMENT_ANNOTATIONS) + if (field.isAnnotationPresent(annotation)) + return true; + return false; + } + + /** + * Returns true if the given annotation is hidden from the help system. + * @param field Field to test. + * @return True if argument should be hidden. False otherwise. + */ + public static boolean isArgumentHidden(Field field) { + return field.isAnnotationPresent(Hidden.class); + } + + public static Class makeRawTypeIfNecessary(Type t) { + if ( t == null ) + return null; + else if ( t instanceof ParameterizedType ) + return (Class)((ParameterizedType) t).getRawType(); + else if ( t instanceof Class ) { + return (Class)t; + } else { + throw new IllegalArgumentException("Unable to determine Class-derived component type of field: " + t); + } + } + + /** + * The actual argument parsing method. + * @param source source + * @param type type to check + * @param matches matches + * @param tags argument tags + * @return the RodBinding/IntervalBinding object depending on the value of createIntervalBinding. 
+ */ + protected Object parseBinding(ArgumentSource source, Type type, ArgumentMatches matches, Tags tags) { + ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); + ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches); + @SuppressWarnings("unchecked") + Class parameterType = JVMUtils.getParameterizedTypeClass(type); + String name = defaultDefinition.fullName; + + return parseBinding(value, parameterType, type, name, tags, source.field.getName()); + } + + /** + * + * @param value The source of the binding + * @param parameterType The Tribble Feature parameter type + * @param bindingClass The class type for the binding (ex: RodBinding, IntervalBinding, etc.) Must have the correct constructor for creating the binding. + * @param bindingName The name of the binding passed to the constructor. + * @param tags Tags for the binding used for parsing and passed to the constructor. + * @param fieldName The name of the field that was parsed. Used for error reporting. + * @return The newly created binding object of type bindingClass. + */ + public static Object parseBinding(ArgumentMatchValue value, Class parameterType, Type bindingClass, + String bindingName, Tags tags, String fieldName) { + try { + String tribbleType = null; + // must have one or two tag values here + if ( tags.getPositionalTags().size() > 2 ) { + throw new UserException.CommandLineException( + String.format("Unexpected number of positional tags for argument %s : %s. 
" + + "Rod bindings only support -X:type and -X:name,type argument styles", + value.asString(), fieldName)); + } else if ( tags.getPositionalTags().size() == 2 ) { + // -X:name,type style + bindingName = tags.getPositionalTags().get(0); + tribbleType = tags.getPositionalTags().get(1); + + FeatureManager manager = new FeatureManager(); + if ( manager.getByName(tribbleType) == null ) + throw new UserException.UnknownTribbleType( + tribbleType, + String.format("Unable to find tribble type '%s' provided on the command line. " + + "Please select a correct type from among the supported types:%n%s", + tribbleType, manager.userFriendlyListOfAvailableFeatures(parameterType))); + + } else { + // case with 0 or 1 positional tags + FeatureManager manager = new FeatureManager(); + + // -X:type style is a type when we cannot determine the type dynamically + String tag1 = tags.getPositionalTags().size() == 1 ? tags.getPositionalTags().get(0) : null; + if ( tag1 != null ) { + if ( manager.getByName(tag1) != null ) // this a type + tribbleType = tag1; + else + bindingName = tag1; + } + + if ( tribbleType == null ) { + // try to determine the file type dynamically + File file = value.asFile(); + if ( file.canRead() && file.isFile() ) { + FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file); + if ( featureDescriptor != null ) { + tribbleType = featureDescriptor.getName(); + logger.debug("Dynamically determined type of " + file + " to be " + tribbleType); + } + } + + if ( tribbleType == null ) { + // IntervalBinding can be created from a normal String + Class rawType = (makeRawTypeIfNecessary(bindingClass)); + try { + return rawType.getConstructor(String.class).newInstance(value.asString()); + } catch (NoSuchMethodException e) { + /* ignore */ + } + + if ( ! file.exists() ) { + throw new UserException.CouldNotReadInputFile(file, "file does not exist"); + } else if ( ! file.canRead() || ! 
file.isFile() ) { + throw new UserException.CouldNotReadInputFile(file, "file could not be read"); + } else { + throw new UserException.CommandLineException( + String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " + + "Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s", + manager.userFriendlyListOfAvailableFeatures(parameterType))); + } + } + } + } + + Constructor ctor = (makeRawTypeIfNecessary(bindingClass)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class); + return ctor.newInstance(parameterType, bindingName, value.asString(), tribbleType, tags); + } catch (Exception e) { + if ( e instanceof UserException ) + throw ((UserException)e); + else + throw new UserException.CommandLineException( + String.format("Failed to parse value %s for argument %s. Message: %s", + value, fieldName, e.getMessage())); + } + } + + /** + * Parse the source of a RodBindingCollection, which can be either a file of RodBindings or an actual RodBinding. 
+ * + * @param parsingEngine the parsing engine used to validate this argument type descriptor + * @param source source + * @param type type + * @param matches matches + * @param tags argument tags + * @return the newly created binding object + */ + public Object parseRodBindingCollectionSource(final ParsingEngine parsingEngine, + final ArgumentSource source, + final Type type, + final ArgumentMatches matches, + final Tags tags) { + + final ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); + final ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches); + @SuppressWarnings("unchecked") + Class parameterType = JVMUtils.getParameterizedTypeClass(type); + String name = defaultDefinition.fullName; + + // if this a list of files, get those bindings + final File file = value.asFile(); + try { + if (file.getAbsolutePath().endsWith(".list")) { + return getRodBindingsCollection(file, parsingEngine, parameterType, name, tags, source.field.getName()); + } + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(file, e); + } + + // otherwise, treat this as an individual binding + final RodBinding binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, name, tags, source.field.getName()); + parsingEngine.addTags(binding, tags); + parsingEngine.addRodBinding(binding); + return RodBindingCollection.createRodBindingCollectionOfType(parameterType, Arrays.asList(binding)); + } + + /** + * Retrieve and parse a collection of RodBindings from the given file. + * + * If the file contains duplicate entries or is empty, an exception will be thrown. + * + * @param file the source file + * @param parsingEngine the engine responsible for parsing + * @param parameterType the Tribble Feature parameter type + * @param bindingName the name of the binding passed to the constructor. + * @param defaultTags general tags for the binding used for parsing and passed to the constructor. 
+ * @param fieldName the name of the field that was parsed. Used for error reporting. + * @return the newly created collection of binding objects. + */ + public static Object getRodBindingsCollection(final File file, + final ParsingEngine parsingEngine, + final Class parameterType, + final String bindingName, + final Tags defaultTags, + final String fieldName) throws IOException { + final List bindings = new ArrayList<>(); + + // Keep track of the files in this list so that we can check for duplicates and empty files + final Set fileValues = new HashSet<>(); + + // parse each line separately using the given Tags if none are provided on each line + for ( final String line: FileUtils.readLines(file) ) { + final String[] tokens = line.split("\\s+"); + final RodBinding binding; + + if ( tokens.length == 0 ) { + continue; // empty line, so do nothing + } + // use the default tags if none are provided for this binding + else if ( tokens.length == 1 ) { + final ArgumentMatchValue value = parseAndValidateArgumentMatchValue(tokens[0], fileValues, fieldName, file.getName()); + binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, bindingName, defaultTags, fieldName); + parsingEngine.addTags(binding, defaultTags); + + } + // use the new tags if provided + else if ( tokens.length == 2 ) { + final Tags tags = ParsingMethod.parseTags(fieldName, tokens[0]); + final ArgumentMatchValue value = parseAndValidateArgumentMatchValue(tokens[1], fileValues, fieldName, file.getName()); + binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, bindingName, tags, fieldName); + parsingEngine.addTags(binding, tags); + } else { + throw new UserException.BadArgumentValue(fieldName, "data lines should consist of an optional set of tags along with a path to a file; too many tokens are present for line: " + line); + } + + bindings.add(binding); + parsingEngine.addRodBinding(binding); + } + + if (fileValues.isEmpty()) { + throw new 
UserException.BadArgumentValue(fieldName, "The input list " + file.getName() + " is empty."); + } + + return RodBindingCollection.createRodBindingCollectionOfType(parameterType, bindings); + } + + /** + * Validates the resource file name and constructs an ArgumentMatchValue from it. + * + * If the list name has already been processed in the current list, throws a UserException, otherwise + * creates an ArgumentMatchValue to represent the list. + * + * @param token Name of the ROD resource file. + * @param fileValues Set of names of ROD files that have already been processed. + * @param fieldName Name of the argument field being populated. + * @param listFileName Name of the list file being processed. + * @return + */ + private static ArgumentMatchValue parseAndValidateArgumentMatchValue(final String token, final Set fileValues, final String fieldName, + final String listFileName) { + checkForDuplicateFileName(token, fileValues, fieldName, listFileName); + return new ArgumentMatchStringValue(token); + } + + /** + * Checks to make sure that the current file name to be processed has not already been processed. + * + * Checks the name of the current file against the names that have already been processed, throwing + * an informative BadArgumentValue exception if it has already been seen. As a side effect adds the + * current file name to the set of filenames that have already been processed. 
+ * + * @param currentFile Name of the current file to process + * @param processedFiles Set of file names that have already been processed + * @param fieldName Name of the argument that is being populated + * @param listName Filename of the list that is being processed + */ + protected static void checkForDuplicateFileName(final String currentFile, final Set processedFiles, + final String fieldName, final String listName) { + if (processedFiles.contains(currentFile)) { + throw new UserException.BadArgumentValue(fieldName, "The input list " + listName + " contains file " + currentFile + + " multiple times, which isn't allowed. If you are intentionally trying to " + + "include the same file more than once, you will need to specify it in separate file lists."); + } + processedFiles.add(currentFile); + } +} + +/** + * Parser for RodBinding objects + */ +class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { + /** + * We only want RodBinding class objects + * @param type The type to check. + * @return true if the provided class is a RodBinding.class + */ + @Override + public boolean supports( Class type ) { + return isRodBinding(type); + } + + public static boolean isRodBinding( Class type ) { + return RodBinding.class.isAssignableFrom(type); + } + + @Override + public boolean createsTypeDefault(ArgumentSource source) { return ! 
source.isRequired(); } + + @Override + @SuppressWarnings("unchecked") + public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) { + Class parameterType = JVMUtils.getParameterizedTypeClass(type); + return RodBinding.makeUnbound((Class)parameterType); + } + + @Override + public String typeDefaultDocString(ArgumentSource source) { + return "none"; + } + + @Override + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { + Tags tags = getArgumentTags(matches); + RodBinding rbind = (RodBinding)parseBinding(source, type, matches, tags); + parsingEngine.addTags(rbind, tags); + parsingEngine.addRodBinding(rbind); + return rbind; + } +} + +/** + * Parser for IntervalBinding objects + */ +class IntervalBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { + /** + * We only want IntervalBinding class objects + * @param type The type to check. + * @return true if the provided class is an IntervalBinding.class + */ + @Override + public boolean supports( Class type ) { + return isIntervalBinding(type); + } + + public static boolean isIntervalBinding( Class type ) { + return IntervalBinding.class.isAssignableFrom(type); + } + + /** + * See note from RodBindingArgumentTypeDescriptor.parse(). + * + * @param parsingEngine parsing engine + * @param source source + * @param type type to check + * @param matches matches + * @return the IntervalBinding object. + */ + @Override + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { + return parseBinding(source, type, matches, getArgumentTags(matches)); + } +} + +/** + * Parser for RodBindingCollection objects + */ +class RodBindingCollectionArgumentTypeDescriptor extends ArgumentTypeDescriptor { + /** + * We only want RodBindingCollection class objects + * @param type The type to check. 
+ * @return true if the provided class is a RodBindingCollection.class
+ */
+ @Override
+ public boolean supports( final Class type ) {
+ return isRodBindingCollection(type);
+ }
+
+ public static boolean isRodBindingCollection( final Class type ) {
+ return RodBindingCollection.class.isAssignableFrom(type);
+ }
+
+ /**
+ * See note from RodBindingArgumentTypeDescriptor.parse().
+ *
+ * @param parsingEngine parsing engine
+ * @param source source
+ * @param type type to check
+ * @param matches matches
+ * @return the RodBindingCollection object.
+ */
+ @Override
+ public Object parse(final ParsingEngine parsingEngine, final ArgumentSource source, final Type type, final ArgumentMatches matches) {
+ final Tags tags = getArgumentTags(matches);
+ return parseRodBindingCollectionSource(parsingEngine, source, type, matches, tags);
+ }
+}
+
+/**
+ * Parse simple argument types: java primitives, wrapper classes, and anything that has
+ * a simple String constructor.
+ */
+class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
+
+ /**
+ * @param type the class type
+ * @return true if this class is a binding type, false otherwise
+ */
+ private boolean isBinding(final Class type) {
+ return RodBindingArgumentTypeDescriptor.isRodBinding(type) ||
+ IntervalBindingArgumentTypeDescriptor.isIntervalBinding(type) ||
+ RodBindingCollectionArgumentTypeDescriptor.isRodBindingCollection(type);
+ }
+
+
+ @Override
+ public boolean supports( Class type ) {
+ if ( isBinding(type) ) return false;
+ if ( type.isPrimitive() ) return true;
+ if ( type.isEnum() ) return true;
+ if ( primitiveToWrapperMap.containsValue(type) ) return true;
+
+ try {
+ type.getConstructor(String.class);
+ return true;
+ }
+ catch( Exception ex ) {
+ // An exception thrown above means that the String constructor either doesn't
+ // exist or can't be accessed. In either case, this descriptor doesn't support this type. 
+ return false; + } + } + + @Override + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type fulltype, ArgumentMatches matches) { + Class type = makeRawTypeIfNecessary(fulltype); + if (source.isFlag()) + return true; + + ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); + ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches); + Object result; + Tags tags = getArgumentTags(matches); + + // lets go through the types we support + try { + if (type.isPrimitive()) { + Method valueOf = primitiveToWrapperMap.get(type).getMethod("valueOf",String.class); + if(value == null) + throw new MissingArgumentValueException(createDefaultArgumentDefinition(source)); + result = valueOf.invoke(null,value.asString().trim()); + } else if (type.isEnum()) { + Object[] vals = type.getEnumConstants(); + Object defaultEnumeration = null; // as we look at options, record the default option if it exists + for (Object val : vals) { + if (String.valueOf(val).equalsIgnoreCase(value == null ? null : value.asString())) return val; + try { if (type.getField(val.toString()).isAnnotationPresent(EnumerationArgumentDefault.class)) defaultEnumeration = val; } + catch (NoSuchFieldException e) { throw new ReviewedStingException("parsing " + type.toString() + "doesn't contain the field " + val.toString()); } + } + // if their argument has no value (null), and there's a default, return that default for the enum value + if (defaultEnumeration != null && value == null) + result = defaultEnumeration; + // if their argument has no value and there's no default, throw a missing argument value exception. + // TODO: Clean this up so that null values never make it to this point. To fix this, we'll have to clean up the implementation of -U. 
+ else if (value == null) + throw new MissingArgumentValueException(createDefaultArgumentDefinition(source)); + else + throw new UnknownEnumeratedValueException(createDefaultArgumentDefinition(source),value.asString()); + } else if (type.equals(File.class)) { + result = value == null ? null : value.asFile(); + } else { + Constructor ctor = type.getConstructor(String.class); + result = ctor.newInstance(value == null ? null : value.asString()); + } + } catch (UserException e) { + throw e; + } catch (InvocationTargetException e) { + throw new UserException.CommandLineException(String.format("Failed to parse value %s for argument %s. This is most commonly caused by providing an incorrect data type (e.g. a double when an int is required)", + value, source.field.getName())); + } catch (Exception e) { + throw new DynamicClassResolutionException(String.class, e); + } + + // TODO FIXME! + + // WARNING: Side effect! + parsingEngine.addTags(result,tags); + + return result; + } + + + /** + * A mapping of the primitive types to their associated wrapper classes. Is there really no way to infer + * this association available in the JRE? + */ + private static Map primitiveToWrapperMap = new HashMap() { + { + put( Boolean.TYPE, Boolean.class ); + put( Character.TYPE, Character.class ); + put( Byte.TYPE, Byte.class ); + put( Short.TYPE, Short.class ); + put( Integer.TYPE, Integer.class ); + put( Long.TYPE, Long.class ); + put( Float.TYPE, Float.class ); + put( Double.TYPE, Double.class ); + } + }; +} + +/** + * Process compound argument types: arrays, and typed and untyped collections. 
+ */ +class CompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { + @Override + public boolean supports( Class type ) { + return ( Collection.class.isAssignableFrom(type) || type.isArray() ); + } + + @Override + @SuppressWarnings("unchecked") + public Object parse(ParsingEngine parsingEngine,ArgumentSource source, Type fulltype, ArgumentMatches matches) { + Class type = makeRawTypeIfNecessary(fulltype); + Type componentType; + Object result; + + if( Collection.class.isAssignableFrom(type) ) { + + // If this is a generic interface, pick a concrete implementation to create and pass back. + // Because of type erasure, don't worry about creating one of exactly the correct type. + if( Modifier.isInterface(type.getModifiers()) || Modifier.isAbstract(type.getModifiers()) ) + { + if( java.util.List.class.isAssignableFrom(type) ) type = ArrayList.class; + else if( java.util.Queue.class.isAssignableFrom(type) ) type = java.util.ArrayDeque.class; + else if( java.util.Set.class.isAssignableFrom(type) ) type = java.util.TreeSet.class; + } + + componentType = getCollectionComponentType( source.field ); + ArgumentTypeDescriptor componentArgumentParser = parsingEngine.selectBestTypeDescriptor(makeRawTypeIfNecessary(componentType)); + + Collection collection; + try { + collection = (Collection)type.newInstance(); + } + catch (InstantiationException e) { + logger.fatal("ArgumentParser: InstantiationException: cannot convert field " + source.field.getName()); + throw new ReviewedStingException("constructFromString:InstantiationException: Failed conversion " + e.getMessage()); + } + catch (IllegalAccessException e) { + logger.fatal("ArgumentParser: IllegalAccessException: cannot convert field " + source.field.getName()); + throw new ReviewedStingException("constructFromString:IllegalAccessException: Failed conversion " + e.getMessage()); + } + + for( ArgumentMatch match: matches ) { + for( ArgumentMatch value: match ) { + Object object = 
componentArgumentParser.parse(parsingEngine,source,componentType,new ArgumentMatches(value)); + collection.add( object ); + // WARNING: Side effect! + parsingEngine.addTags(object,value.tags); + } + } + + result = collection; + + } + else if( type.isArray() ) { + componentType = type.getComponentType(); + ArgumentTypeDescriptor componentArgumentParser = parsingEngine.selectBestTypeDescriptor(makeRawTypeIfNecessary(componentType)); + + // Assemble a collection of individual values used in this computation. + Collection values = new ArrayList(); + for( ArgumentMatch match: matches ) + for( ArgumentMatch value: match ) + values.add(value); + + result = Array.newInstance(makeRawTypeIfNecessary(componentType),values.size()); + + int i = 0; + for( ArgumentMatch value: values ) { + Object object = componentArgumentParser.parse(parsingEngine,source,componentType,new ArgumentMatches(value)); + Array.set(result,i++,object); + // WARNING: Side effect! + parsingEngine.addTags(object,value.tags); + } + } + else + throw new ReviewedStingException("Unsupported compound argument type: " + type); + + return result; + } + + /** + * Return the component type of a field, or String.class if the type cannot be found. + * @param field The reflected field to inspect. + * @return The parameterized component type, or String.class if the parameterized type could not be found. + * @throws IllegalArgumentException If more than one parameterized type is found on the field. + */ + @Override + protected Type getCollectionComponentType( Field field ) { + // If this is a parameterized collection, find the contained type. If blow up if more than one type exists. 
+ if( field.getGenericType() instanceof ParameterizedType) { + ParameterizedType parameterizedType = (ParameterizedType)field.getGenericType(); + if( parameterizedType.getActualTypeArguments().length > 1 ) + throw new IllegalArgumentException("Unable to determine collection type of field: " + field.toString()); + return parameterizedType.getActualTypeArguments()[0]; + } + else + return String.class; + } +} + +class MultiplexArgumentTypeDescriptor extends ArgumentTypeDescriptor { + /** + * The multiplexer controlling how data is split. + */ + private final Multiplexer multiplexer; + + /** + * The set of identifiers for the multiplexed entries. + */ + private final Collection multiplexedIds; + + public MultiplexArgumentTypeDescriptor() { + this.multiplexer = null; + this.multiplexedIds = null; + } + + /** + * Private constructor to use in creating a closure of the MultiplexArgumentTypeDescriptor specific to the + * given set of multiplexed ids. + * @param multiplexedIds The collection of multiplexed entries + */ + private MultiplexArgumentTypeDescriptor(final Multiplexer multiplexer, final Collection multiplexedIds) { + this.multiplexer = multiplexer; + this.multiplexedIds = multiplexedIds; + } + + @Override + public boolean supports( Class type ) { + return ( Map.class.isAssignableFrom(type) ); + } + + @Override + public boolean createsTypeDefault(ArgumentSource source) { + // Multiplexing always creates a type default. 
+ return true; + } + + @Override + public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { + if(multiplexer == null || multiplexedIds == null) + throw new ReviewedStingException("No multiplexed ids available"); + + Map multiplexedMapping = new HashMap(); + Class componentType = makeRawTypeIfNecessary(getCollectionComponentType(source.field)); + ArgumentTypeDescriptor componentTypeDescriptor = parsingEngine.selectBestTypeDescriptor(componentType); + + for(Object id: multiplexedIds) { + Object value = null; + if(componentTypeDescriptor.createsTypeDefault(source)) + value = componentTypeDescriptor.createTypeDefault(parsingEngine,source,componentType); + multiplexedMapping.put(id,value); + } + return multiplexedMapping; + } + + @Override + public String typeDefaultDocString(ArgumentSource source) { + return "None"; + } + + @Override + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { + if(multiplexedIds == null) + throw new ReviewedStingException("Cannot directly parse a MultiplexArgumentTypeDescriptor; must create a derivative type descriptor first."); + + Map multiplexedMapping = new HashMap(); + + Class componentType = makeRawTypeIfNecessary(getCollectionComponentType(source.field)); + + + for(Object id: multiplexedIds) { + Object value = parsingEngine.selectBestTypeDescriptor(componentType).parse(parsingEngine,source,componentType,matches.transform(multiplexer,id)); + multiplexedMapping.put(id,value); + } + + parsingEngine.addTags(multiplexedMapping,getArgumentTags(matches)); + + return multiplexedMapping; + } + + public MultiplexArgumentTypeDescriptor createCustomTypeDescriptor(ParsingEngine parsingEngine,ArgumentSource dependentArgument,Object containingObject) { + String[] sourceFields = dependentArgument.field.getAnnotation(Multiplex.class).arguments(); + + List allSources = parsingEngine.extractArgumentSources(containingObject.getClass()); + Class[] sourceTypes 
= new Class[sourceFields.length]; + Object[] sourceValues = new Object[sourceFields.length]; + int currentField = 0; + + for(String sourceField: sourceFields) { + boolean fieldFound = false; + for(ArgumentSource source: allSources) { + if(!source.field.getName().equals(sourceField)) + continue; + if(source.field.isAnnotationPresent(Multiplex.class)) + throw new ReviewedStingException("Command-line arguments can only depend on independent fields"); + sourceTypes[currentField] = source.field.getType(); + sourceValues[currentField] = JVMUtils.getFieldValue(source.field,containingObject); + currentField++; + fieldFound = true; + } + if(!fieldFound) + throw new ReviewedStingException(String.format("Unable to find source field %s, referred to by dependent field %s",sourceField,dependentArgument.field.getName())); + } + + Class multiplexerType = dependentArgument.field.getAnnotation(Multiplex.class).value(); + Constructor multiplexerConstructor; + try { + multiplexerConstructor = multiplexerType.getConstructor(sourceTypes); + multiplexerConstructor.setAccessible(true); + } + catch(NoSuchMethodException ex) { + throw new ReviewedStingException(String.format("Unable to find constructor for class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); + } + + Multiplexer multiplexer; + try { + multiplexer = multiplexerConstructor.newInstance(sourceValues); + } + catch(IllegalAccessException ex) { + throw new ReviewedStingException(String.format("Constructor for class %s with parameters %s is inaccessible",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); + } + catch(InstantiationException ex) { + throw new ReviewedStingException(String.format("Can't create class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); + } + catch(InvocationTargetException ex) { + throw new ReviewedStingException(String.format("Can't invoke constructor of class %s with parameters 
%s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); + } + + return new MultiplexArgumentTypeDescriptor(multiplexer,multiplexer.multiplex()); + } + + /** + * Return the component type of a field, or String.class if the type cannot be found. + * @param field The reflected field to inspect. + * @return The parameterized component type, or String.class if the parameterized type could not be found. + * @throws IllegalArgumentException If more than one parameterized type is found on the field. + */ + @Override + protected Type getCollectionComponentType( Field field ) { + // Multiplex arguments must resolve to maps from which the clp should extract the second type. + if( field.getGenericType() instanceof ParameterizedType) { + ParameterizedType parameterizedType = (ParameterizedType)field.getGenericType(); + if( parameterizedType.getActualTypeArguments().length != 2 ) + throw new IllegalArgumentException("Unable to determine collection type of field: " + field.toString()); + return (Class)parameterizedType.getActualTypeArguments()[1]; + } + else + return String.class; + } +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/ClassType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ClassType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ClassType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ClassType.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineProgram.java new file mode 100644 index 000000000..8b1a390f4 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineProgram.java @@ -0,0 +1,447 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy 
of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.commandline; + +import org.apache.log4j.FileAppender; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.PatternLayout; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.help.ApplicationDetails; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.help.HelpFormatter; + +import java.io.IOException; +import java.util.*; + +public abstract class CommandLineProgram { + + /** The command-line program and the arguments it returned. */ + public ParsingEngine parser = null; + + /** + * Setting INFO gets you INFO up to FATAL, setting ERROR gets you ERROR and FATAL level logging, and so on. 
+ */ + @Argument(fullName = "logging_level", shortName = "l", doc = "Set the minimum level of logging", required = false) + protected String logging_level = "INFO"; + + /** + * File to save the logging output. + */ + @Output(fullName = "log_to_file", shortName = "log", doc = "Set the logging location", required = false) + protected String toFile = null; + + /** + * This will produce a help message in the terminal with general usage information, listing available arguments + * as well as tool-specific information if applicable. + */ + @Argument(fullName = "help", shortName = "h", doc = "Generate the help message", required = false) + public Boolean help = false; + + /** + * Use this to check the version number of the GATK executable you are invoking. Note that the version number is + * always included in the output at the start of every run as well as any error message. + */ + @Argument(fullName = "version", shortName = "version", doc ="Output version information", required = false) + public Boolean version = false; + + + /** our logging output patterns */ + private static final String patternString = "%-5p %d{HH:mm:ss,SSS} %C{1} - %m %n"; + + static { + /** + * The very first thing that any Sting application does is forces the JVM locale into US English, so that we don't have + * to think about number formatting issues. + */ + forceJVMLocaleToUSEnglish(); + // setup a basic log configuration + CommandLineUtils.configureConsoleLogging(); + } + + + /** + * Allows a given application to return a brief description of itself. + * + * @return An ApplicationDetails object describing the current application. Should not be null. 
+ */ + protected ApplicationDetails getApplicationDetails() { + return new ApplicationDetails(ApplicationDetails.createDefaultHeader(getClass()), + Collections.emptyList(), + ApplicationDetails.createDefaultRunningInstructions(getClass()), + null); + } + + /** + * Subclasses of CommandLinePrograms can provide their own types of command-line arguments. + * @return A collection of type descriptors generating implementation-dependent placeholders. + */ + protected Collection getArgumentTypeDescriptors() { + return Collections.emptyList(); + } + + /** + * Will this application want to vary its argument list dynamically? + * If so, parse the command-line options and then prompt the subclass to return + * a list of argument providers. + * + * @return Whether the application should vary command-line arguments dynamically. + */ + protected boolean canAddArgumentsDynamically() { return false; } + + /** + * Provide a list of object to inspect, looking for additional command-line arguments. + * + * @return A list of objects to inspect. + */ + protected Class[] getArgumentSources() { + return new Class[]{}; + } + + /** + * Name this argument source. Provides the (full) class name as a default. + * + * @param source The argument source. + * + * @return a name for the argument source. + */ + protected String getArgumentSourceName( Class source ) { return source.toString(); } + + /** + * Sets the command-line parsing engine. Necessary for unit testing purposes. 
+ * @param parser the new command-line parsing engine + */ + public void setParser( ParsingEngine parser ) { + this.parser = parser; + } + + /** + * this is the function that the inheriting class can expect to have called + * when all the argument processing is done + * + * @return the return code to exit the program with + * @throws Exception when an exception occurs + */ + protected abstract int execute() throws Exception; + + public static int result = -1; + + @SuppressWarnings("unchecked") + public static void start(CommandLineProgram clp, String[] args) throws Exception { + start(clp, args, false); + } + + /** + * This function is called to start processing the command line, and kick + * off the execute message of the program. + * + * @param clp the command line program to execute + * @param args the command line arguments passed in + * @param dryRun dry run + * @throws Exception when an exception occurs + */ + @SuppressWarnings("unchecked") + public static void start(CommandLineProgram clp, String[] args, boolean dryRun) throws Exception { + + try { + // setup our log layout + PatternLayout layout = new PatternLayout(); + + Logger logger = CommandLineUtils.getStingLogger(); + + // now set the layout of all the loggers to our layout + CommandLineUtils.setLayout(logger, layout); + + // Initialize the logger using the defaults. + clp.setupLoggerLevel(layout); + + // setup the parser + ParsingEngine parser = clp.parser = new ParsingEngine(clp); + parser.addArgumentSource(clp.getClass()); + + Map parsedArgs; + + // process the args + if (clp.canAddArgumentsDynamically()) { + // if the command-line program can toss in extra args, fetch them and reparse the arguments. + parser.parse(args); + + // Allow invalid and missing required arguments to pass this validation step. + // - InvalidArgument in case these arguments are specified by plugins. + // - MissingRequiredArgument in case the user requested help. 
Handle that later, once we've + // determined the full complement of arguments. + if ( ! dryRun ) + parser.validate(EnumSet.of(ParsingEngine.ValidationType.MissingRequiredArgument, + ParsingEngine.ValidationType.InvalidArgument)); + parser.loadArgumentsIntoObject(clp); + + // Initialize the logger using the loaded command line. + clp.setupLoggerLevel(layout); + + Class[] argumentSources = clp.getArgumentSources(); + for (Class argumentSource : argumentSources) + parser.addArgumentSource(clp.getArgumentSourceName(argumentSource), argumentSource); + parsedArgs = parser.parse(args); + + if (isVersionPresent(parser)) + printVersionAndExit(); + + if (isHelpPresent(parser)) + printHelpAndExit(clp, parser); + + if ( ! dryRun ) parser.validate(); + } else { + parsedArgs = parser.parse(args); + + if ( ! dryRun ) { + if (isHelpPresent(parser)) + printHelpAndExit(clp, parser); + + parser.validate(); + } + parser.loadArgumentsIntoObject(clp); + + // Initialize the logger using the loaded command line. + clp.setupLoggerLevel(layout); + } + + if ( ! dryRun ) { + // if they specify a log location, output our data there + if (clp.toFile != null) { + FileAppender appender; + try { + appender = new FileAppender(layout, clp.toFile, false); + logger.addAppender(appender); + } catch (IOException e) { + throw new RuntimeException("Unable to re-route log output to " + clp.toFile + " make sure the destination exists"); + } + } + + // regardless of what happens next, generate the header information + HelpFormatter.generateHeaderInformation(clp.getApplicationDetails(), parsedArgs); + + // call the execute + CommandLineProgram.result = clp.execute(); + } + } + catch (ArgumentException e) { + //clp.parser.printHelp(clp.getApplicationDetails()); + // Rethrow the exception to exit with an error. + throw e; + } + } + + /** + * Find fields in the object obj that look like command-line arguments, and put command-line + * arguments into them. 
+ * + * @param obj Object to inspect for command line arguments. + */ + public void loadArgumentsIntoObject(Object obj) { + parser.loadArgumentsIntoObject(obj); + } + + /** + * this function checks the logger level passed in on the command line, taking the lowest + * level that was provided. + * @param layout Pattern layout to format based on the logger level. + */ + private void setupLoggerLevel(PatternLayout layout) { + layout.setConversionPattern(patternString); + + // set the default logger level + Level par; + if (logging_level.toUpperCase().equals("DEBUG")) { + par = Level.DEBUG; + } else if (logging_level.toUpperCase().equals("INFO")) { + par = Level.INFO; + } else if (logging_level.toUpperCase().equals("WARN")) { + par = Level.WARN; + } else if (logging_level.toUpperCase().equals("ERROR")) { + par = Level.ERROR; + } else if (logging_level.toUpperCase().equals("FATAL")) { + par = Level.FATAL; + } else if (logging_level.toUpperCase().equals("OFF")) { + par = Level.OFF; + } else { + // we don't understand the logging level, let's get out of here + throw new ArgumentException("Unable to match: " + logging_level + " to a logging level, make sure it's a valid level (DEBUG, INFO, WARN, ERROR, FATAL, OFF)"); + } + + Logger.getRootLogger().setLevel(par); + } + + /** + * a function used to indicate an error occurred in the command line tool + */ + private static void printDocumentationReference() { + errorPrintf("Visit our website and forum for extensive documentation and answers to %n"); + errorPrintf("commonly asked questions " + HelpConstants.BASE_GATK_URL + "%n"); + } + + + /** + * Do a cursory search for the given argument. + * + * @param parser Parser + * + * @return True if help is present; false otherwise. + */ + private static boolean isHelpPresent(ParsingEngine parser) { + return parser.isArgumentPresent("help"); + } + + /** + * Print help and exit. + * + * @param clp Instance of the command-line program. 
+ * @param parser True if help is present; false otherwise. + */ + private static void printHelpAndExit(CommandLineProgram clp, ParsingEngine parser) { + parser.printHelp(clp.getApplicationDetails()); + System.exit(0); + } + + /** + * Do a cursory search for the argument "version". + * + * @param parser Parser + * + * @return True if version is present; false otherwise. + */ + private static boolean isVersionPresent(ParsingEngine parser) { + return parser.isArgumentPresent("version"); + } + + /** + * Print help and exit. + */ + private static void printVersionAndExit() { + System.out.println(CommandLineGATK.getVersionNumber().toString()); + System.exit(0); + } + + + private static void errorPrintf(String format, Object... s) { + String formatted = String.format(format, s); + + if ( formatted.trim().equals("") ) + System.err.println("##### ERROR"); + else { + for ( String part : formatted.split("\n") ) { + System.err.println("##### ERROR " + part); + } + } + } + + + /** + * used to indicate an error occured + * + * @param msg the message + * @param t the error + */ + public static void exitSystemWithError(String msg, final Throwable t) { + errorPrintf("------------------------------------------------------------------------------------------%n"); + errorPrintf("stack trace %n"); + t.printStackTrace(); + + errorPrintf("------------------------------------------------------------------------------------------%n"); + errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber()); + errorPrintf("%n"); + errorPrintf("This might be a bug. 
Please check the documentation guide to see if this is a known problem.%n"); + errorPrintf("If not, please post the error message, with stack trace, to the GATK forum.%n"); + printDocumentationReference(); + if ( msg == null ) // some exceptions don't have detailed messages + msg = "Code exception (see stack trace for error itself)"; + errorPrintf("%n"); + errorPrintf("MESSAGE: %s%n", msg.trim()); + errorPrintf("------------------------------------------------------------------------------------------%n"); + System.exit(1); + } + + public static void exitSystemWithUserError(final Exception e) { + if ( e.getMessage() == null ) + throw new ReviewedStingException("UserException found with no message!", e); + + errorPrintf("------------------------------------------------------------------------------------------%n"); + errorPrintf("A USER ERROR has occurred (version %s): %n", CommandLineGATK.getVersionNumber()); + errorPrintf("%n"); + errorPrintf("This means that one or more arguments or inputs in your command are incorrect.%n"); + errorPrintf("The error message below tells you what is the problem.%n"); + errorPrintf("%n"); + errorPrintf("If the problem is an invalid argument, please check the online documentation guide%n"); + errorPrintf("(or rerun your command with --help) to view allowable command-line arguments for this tool.%n"); + errorPrintf("%n"); + printDocumentationReference(); + errorPrintf("%n"); + errorPrintf("Please do NOT post this error to the GATK forum unless you have really tried to fix it yourself.%n"); + errorPrintf("%n"); + errorPrintf("MESSAGE: %s%n", e.getMessage().trim()); + errorPrintf("------------------------------------------------------------------------------------------%n"); + System.exit(1); + } + + public static void exitSystemWithSamError(final Throwable t) { + if ( t.getMessage() == null ) + throw new ReviewedStingException("SamException found with no message!", t); + + 
errorPrintf("------------------------------------------------------------------------------------------%n"); + errorPrintf("A BAM ERROR has occurred (version %s): %n", CommandLineGATK.getVersionNumber()); + errorPrintf("%n"); + errorPrintf("This means that there is something wrong with the BAM file(s) you provided.%n"); + errorPrintf("The error message below tells you what is the problem.%n"); + errorPrintf("%n"); + printDocumentationReference(); + errorPrintf("%n"); + errorPrintf("Please do NOT post this error to the GATK forum until you have followed these instructions:%n"); + errorPrintf("- Make sure that your BAM file is well-formed by running Picard's validator on it%n"); + errorPrintf("(see http://picard.sourceforge.net/command-line-overview.shtml#ValidateSamFile for details)%n"); + errorPrintf("- Ensure that your BAM index is not corrupted: delete the current one and regenerate it with 'samtools index'%n"); + errorPrintf("%n"); + errorPrintf("MESSAGE: %s%n", t.getMessage().trim()); + errorPrintf("------------------------------------------------------------------------------------------%n"); + System.exit(1); + } + + + /** + * used to indicate an error occured + * + * @param t the exception that occurred + */ + public static void exitSystemWithError(Throwable t) { + exitSystemWithError(t.getMessage(), t); + } + + /** + * A hack to ensure that numbers are always formatted in the US style. 
+ */ + protected static void forceJVMLocaleToUSEnglish() { + Locale.setDefault(Locale.US); + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineUtils.java new file mode 100644 index 000000000..cb9a781c3 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineUtils.java @@ -0,0 +1,192 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.commandline; + +import org.apache.log4j.Appender; +import org.apache.log4j.ConsoleAppender; +import org.apache.log4j.Logger; +import org.apache.log4j.PatternLayout; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.lang.annotation.Annotation; +import java.util.Collections; +import java.util.Enumeration; +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * Static utility methods for working with command-line arguments. + * + * @author mhanna + * @version 0.1 + */ +public class CommandLineUtils { + + /** + * Returns a key-value mapping of the command-line arguments passed into the GATK. + * Will be approximate; this class doesn't have all the required data to completely + * reconstruct the list of command-line arguments from the given objects. + * + * @param parsingEngine The parsing engine + * @param argumentProviders The providers of command-line arguments. + * @return A key-value mapping of argument full names to argument values. Produces best string representation + * possible given the information available. + */ + public static Map getApproximateCommandLineArguments(ParsingEngine parsingEngine, Object... argumentProviders) { + return getApproximateCommandLineArguments(parsingEngine, false, argumentProviders); + } + + /** + * Returns a key-value mapping of the command-line arguments passed into the GATK. + * Will be approximate; this class doesn't have all the required data to completely + * reconstruct the list of command-line arguments from the given objects. + * + * @param parsingEngine The parsing engine + * @param skipObjectPointers Should we skip arguments whose values are pointers (and don't print nicely)? + * @param argumentProviders The providers of command-line arguments. + * @return A key-value mapping of argument full names to argument values. Produces best string representation + * possible given the information available. 
+ */ + public static Map getApproximateCommandLineArguments(ParsingEngine parsingEngine, boolean skipObjectPointers, Object... argumentProviders) { + Map commandLineArguments = new LinkedHashMap(); + + for(Object argumentProvider: argumentProviders) { + Map argBindings = parsingEngine.extractArgumentBindings(argumentProvider); + for(Map.Entry elt: argBindings.entrySet()) { + Object argumentValue = elt.getValue(); + + String argumentValueString = argumentValue != null ? argumentValue.toString() : null; + if ( skipObjectPointers && isObjectPointer(argumentValueString) ) + continue; + + for(ArgumentDefinition definition: elt.getKey().createArgumentDefinitions()) { + String argumentName = definition.fullName; + commandLineArguments.put(argumentName,argumentValueString); + } + } + } + + return commandLineArguments; + } + + /** + * Create an approximate list of command-line arguments based on the given argument providers. + * @param parsingEngine The parsing engine + * @param argumentProviders Argument providers to inspect. + * @return A string representing the given command-line arguments. + */ + public static String createApproximateCommandLineArgumentString(ParsingEngine parsingEngine, Object... argumentProviders) { + return createApproximateCommandLineArgumentString(parsingEngine, true, argumentProviders); + } + + /** + * Create an approximate list of command-line arguments based on the given argument providers. + * @param parsingEngine The parsing engine + * @param skipObjectPointers Should we skip arguments whose values are pointers (and don't print nicely)? + * @param argumentProviders Argument providers to inspect. + * @return A string representing the given command-line arguments. + */ + public static String createApproximateCommandLineArgumentString(ParsingEngine parsingEngine, boolean skipObjectPointers, Object... 
argumentProviders) { + Map commandLineArgs = getApproximateCommandLineArguments(parsingEngine, skipObjectPointers, argumentProviders); + StringBuffer sb = new StringBuffer(); + + boolean first = true; + for ( Map.Entry commandLineArg : commandLineArgs.entrySet() ) { + if ( !first ) + sb.append(" "); + sb.append(commandLineArg.getKey()); + sb.append("="); + sb.append(commandLineArg.getValue()); + first = false; + } + + return sb.toString(); + } + + /** + * A hack to get around the fact that Java doesn't like inheritance in Annotations. + * @param annotation to run the method on + * @param method the method to invoke + * @return the return value of the method + */ + public static Object getValue(Annotation annotation, String method) { + try { + return annotation.getClass().getMethod(method).invoke(annotation); + } catch (Exception e) { + throw new ReviewedStingException("Unable to access method " + method + " on annotation " + annotation.getClass(), e); + } + } + + // The problem here is that some of the fields being output are Objects - and those + // Objects don't overload toString() so that the output is just the memory pointer + // to the Object. Because those values are non-deterministic, they don't merge well + // into BAM/VCF headers (plus, it's just damn ugly). Perhaps there's a better way to + // do this, but at least this one works for the moment. + private static final String pointerRegexp = ".+@[0-9a-fA-F]+$"; + private static boolean isObjectPointer(String s) { + return s != null && s.matches(pointerRegexp); + } + + /** + * Returns the root logger for all Sting code. + * @return the root logger for all Sting code. + */ + public static Logger getStingLogger() { + return Logger.getLogger("org.broadinstitute.sting"); + } + + /** + * Enables console logging. + */ + @SuppressWarnings("unchecked") + public static void configureConsoleLogging() { + // Check to see if a console logger has already been enabled. 
+ for (Logger logger = getStingLogger(); logger != null; logger = (Logger)logger.getParent()) { + Enumeration e = (Enumeration) logger.getAllAppenders(); + for (Appender appender: Collections.list(e)) { + if (appender instanceof ConsoleAppender) + return; + } + } + // Extracted from BasicConfigurator.configure(), but only applied to the Sting logger. + Logger.getRootLogger().addAppender(new ConsoleAppender( + new PatternLayout(PatternLayout.TTCC_CONVERSION_PATTERN), ConsoleAppender.SYSTEM_ERR)); + } + + /** + * Sets the layout of the logger. + * @param logger The logger. + * @param layout The layout. + */ + @SuppressWarnings("unchecked") + public static void setLayout(Logger logger, PatternLayout layout) { + for (; logger != null; logger = (Logger)logger.getParent()) { + Enumeration e = (Enumeration) logger.getAllAppenders(); + for (Appender appender: Collections.list(e)) + appender.setLayout(layout); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/EnumerationArgumentDefault.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/EnumerationArgumentDefault.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/EnumerationArgumentDefault.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/EnumerationArgumentDefault.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/Gather.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Gather.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/Gather.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Gather.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/Gatherer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Gatherer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/Gatherer.java 
rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Gatherer.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/Hidden.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Hidden.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/Hidden.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Hidden.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/Input.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Input.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/Input.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Input.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java new file mode 100644 index 000000000..d2a1735fb --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java @@ -0,0 +1,85 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.commandline; + +import org.broad.tribble.Feature; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalSetRule; + +import java.util.List; + +public class IntervalArgumentCollection { + /** + * Use this option to perform the analysis over only part of the genome. This argument can be specified multiple times. + * You can use samtools-style intervals either explicitly on the command line (e.g. -L chr1 or -L chr1:100-200) or + * by loading in a file containing a list of intervals (e.g. -L myFile.intervals). + * + * Additionally, you can also specify a ROD file (such as a VCF file) in order to perform the analysis at specific + * positions based on the records present in the file (e.g. -L file.vcf). + * + * Finally, you can also use this to perform the analysis on the reads that are completely unmapped in the BAM file + * (i.e. those without a reference contig) by specifying -L unmapped. + */ + @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate", required = false) + public List> intervals = null; + + /** + * Use this option to exclude certain parts of the genome from the analysis (like -L, but the opposite). + * This argument can be specified multiple times. You can use samtools-style intervals either explicitly on the + * command line (e.g. 
-XL chr1 or -XL chr1:100-200) or by loading in a file containing a list of intervals + * (e.g. -XL myFile.intervals). + * + * Additionally, you can also specify a ROD file (such as a VCF file) in order to exclude specific + * positions from the analysis based on the records present in the file (e.g. -L file.vcf). + * */ + @Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing", required = false) + public List> excludeIntervals = null; + + /** + * By default, the program will take the UNION of all intervals specified using -L and/or -XL. However, you can + * change this setting, for example if you want to take the INTERSECTION of the sets instead. E.g. to perform the + * analysis on positions for which there is a record in a VCF, but restrict this to just those on chromosome 20, + * you would do -L chr20 -L file.vcf -isr INTERSECTION. + */ + @Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Set merging approach to use for combining interval inputs", required = false) + public IntervalSetRule intervalSetRule = IntervalSetRule.UNION; + + /** + * By default, the program merges abutting intervals (i.e. intervals that are directly side-by-side but do not + * actually overlap) into a single continuous interval. However you can change this behavior if you want them to be + * treated as separate intervals instead. + */ + @Argument(fullName = "interval_merging", shortName = "im", doc = "Interval merging rule for abutting intervals", required = false) + public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL; + + /** + * Use this to add padding to the intervals specified using -L and/or -XL. For example, '-L chr1:100' with a + * padding value of 20 would turn into '-L chr1:80-120'. This is typically used to add padding around exons when + * analyzing exomes. The general Broad exome calling pipeline uses 100 bp padding by default. 
+ */ + @Argument(fullName = "interval_padding", shortName = "ip", doc = "Amount of padding (in bp) to add to each interval", required = false, minValue = 0) + public int intervalPadding = 0; +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/IntervalBinding.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/IntervalBinding.java new file mode 100644 index 000000000..de57de871 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/IntervalBinding.java @@ -0,0 +1,106 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.commandline; + +import com.google.java.contract.Requires; +import org.broad.tribble.AbstractFeatureReader; +import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.FeatureReader; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; +import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalUtils; + +import java.util.*; + +/** + * An IntervalBinding representing a walker argument that gets bound to either a ROD track or interval string. + * + * The IntervalBinding is a formal GATK argument that bridges between a walker and + * the engine to construct intervals for traversal at runtime. The IntervalBinding can + * either be a RodBinding, a string of one interval, or a file with interval strings. + * The GATK Engine takes care of initializing the binding when appropriate and determining intervals from it. + * + * Note that this class is immutable. + */ +public final class IntervalBinding { + + private RodBinding featureIntervals; + private String stringIntervals; + + @Requires({"type != null", "rawName != null", "source != null", "tribbleType != null", "tags != null"}) + public IntervalBinding(Class type, final String rawName, final String source, final String tribbleType, final Tags tags) { + featureIntervals = new RodBinding<>(type, rawName, source, tribbleType, tags); + } + + @Requires({"intervalArgument != null"}) + public IntervalBinding(String intervalArgument) { + stringIntervals = intervalArgument; + } + + public String getSource() { + return ( featureIntervals != null ? 
featureIntervals.getSource() : stringIntervals ); + } + + public List getIntervals(final GenomeAnalysisEngine toolkit) { + return getIntervals(toolkit.getGenomeLocParser()); + } + + public List getIntervals(final GenomeLocParser genomeLocParser) { + List intervals; + + if ( featureIntervals != null ) { + intervals = new ArrayList<>(); + + // TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files + + final FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec(); + if ( codec instanceof ReferenceDependentFeatureCodec ) + ((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(genomeLocParser); + try { + FeatureReader reader = AbstractFeatureReader.getFeatureReader(featureIntervals.getSource(), codec, false); + for ( Feature feature : reader.iterator() ) + intervals.add(genomeLocParser.createGenomeLoc(feature)); + } catch (Exception e) { + throw new UserException.MalformedFile(featureIntervals.getSource(), "Problem reading the interval file", e); + } + + } else { + intervals = IntervalUtils.parseIntervalArguments(genomeLocParser, stringIntervals); + } + + Collections.sort(intervals); + return intervals; + } + + public String toString() { + return getSource(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/MissingArgumentValueException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/MissingArgumentValueException.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/MissingArgumentValueException.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/MissingArgumentValueException.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/Output.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Output.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/Output.java 
rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Output.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsedArgs.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsedArgs.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ParsedArgs.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsedArgs.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsedListArgs.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsedListArgs.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ParsedListArgs.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsedListArgs.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingEngine.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingEngine.java new file mode 100644 index 000000000..ad64aaa1d --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingEngine.java @@ -0,0 +1,829 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.commandline; + +import com.google.java.contract.Requires; +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.classloader.JVMUtils; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.ApplicationDetails; +import org.broadinstitute.sting.utils.help.HelpFormatter; + +import java.io.File; +import java.io.IOException; +import java.lang.annotation.Annotation; +import java.lang.reflect.Field; +import java.util.*; + +/** + * A parser for Sting command-line arguments. + */ +public class ParsingEngine { + + /** + * The loaded argument sources along with their back definitions. + */ + private Map argumentSourcesByDefinition = new HashMap(); + + /** + * A list of defined arguments against which command lines are matched. + * Package protected for testing access. + */ + public ArgumentDefinitions argumentDefinitions = new ArgumentDefinitions(); + + /** + * A list of matches from defined arguments to command-line text. + * Indicates as best as possible where command-line text remains unmatched + * to existing arguments. 
+ */ + private ArgumentMatches argumentMatches = null; + + /** + * Techniques for parsing and for argument lookup. + */ + private List parsingMethods = new ArrayList(); + + /** + * All of the RodBinding objects we've seen while parsing + */ + private List rodBindings = new ArrayList(); + + /** + * Class reference to the different types of descriptors that the create method can create. + * The type of set used must be ordered (but not necessarily sorted). + */ + private static final Set STANDARD_ARGUMENT_TYPE_DESCRIPTORS = new LinkedHashSet( Arrays.asList(new SimpleArgumentTypeDescriptor(), + new IntervalBindingArgumentTypeDescriptor(), + new RodBindingArgumentTypeDescriptor(), + new RodBindingCollectionArgumentTypeDescriptor(), + new CompoundArgumentTypeDescriptor(), + new MultiplexArgumentTypeDescriptor()) ); + + private Set argumentTypeDescriptors = new LinkedHashSet(); + + /** + * List of tags associated with the given instantiation of the command-line argument. + */ + private final Map tags = new IdentityHashMap(); + + private PluginManager argumentProviderPluginManager = + new PluginManager(ParsingEngineArgumentProvider.class); + + /** + * our log, which we want to capture anything from org.broadinstitute.sting + */ + protected static Logger logger = Logger.getLogger(ParsingEngine.class); + + public ParsingEngine( CommandLineProgram clp ) { + RodBinding.resetNameCounter(); + parsingMethods.add( ParsingMethod.FullNameParsingMethod ); + parsingMethods.add( ParsingMethod.ShortNameParsingMethod ); + + // Order matters here! Make sure the clp's new type descriptors go in before the original type descriptors. + if(clp != null) + argumentTypeDescriptors.addAll(clp.getArgumentTypeDescriptors()); + argumentTypeDescriptors.addAll(STANDARD_ARGUMENT_TYPE_DESCRIPTORS); + + List> providers = argumentProviderPluginManager.getPlugins(); + for (Class provider: providers) { + addArgumentSource(provider); + } + } + + /** + * Add a main argument source. 
Argument sources are expected to have + * any number of fields with an @Argument annotation attached. + * @param source An argument source from which to extract command-line arguments. + */ + public void addArgumentSource( Class source ) { + addArgumentSource(null, source); + } + + public ArgumentMatches getArgumentMatches() { + return argumentMatches; + } + + /** + * Add an argument source. Argument sources are expected to have + * any number of fields with an @Argument annotation attached. + * @param sourceName name for this argument source. 'Null' indicates that this source should be treated + * as the main module. + * @param sourceClass A class containing argument sources from which to extract command-line arguments. + */ + public void addArgumentSource( String sourceName, Class sourceClass ) { + List argumentsFromSource = new ArrayList(); + for( ArgumentSource argumentSource: extractArgumentSources(sourceClass) ) { + List argumentDefinitions = argumentSource.createArgumentDefinitions(); + for(ArgumentDefinition argumentDefinition: argumentDefinitions) { + argumentSourcesByDefinition.put(argumentDefinition,argumentSource); + argumentsFromSource.add( argumentDefinition ); + } + } + argumentDefinitions.add( new ArgumentDefinitionGroup(sourceName, argumentsFromSource) ); + } + + /** + * Do a cursory search to see if an argument with the given name is present. + * @param argumentFullName full name of the argument. + * @return True if the argument is present. False otherwise. + */ + public boolean isArgumentPresent( String argumentFullName ) { + ArgumentDefinition definition = + argumentDefinitions.findArgumentDefinition(argumentFullName,ArgumentDefinitions.FullNameDefinitionMatcher); + return argumentMatches.hasMatch(definition); + + } + + /** + * Parse the given set of command-line arguments, returning + * an ArgumentMatches object describing the best fit of these + * command-line arguments to the arguments that are actually + * required. 
+ * @param tokens Tokens passed on the command line. + * @return The parsed arguments by file. + */ + public SortedMap parse( String[] tokens ) { + argumentMatches = new ArgumentMatches(); + SortedMap parsedArgs = new TreeMap(); + + List cmdLineTokens = Arrays.asList(tokens); + parse(ArgumentMatchSource.COMMAND_LINE, cmdLineTokens, argumentMatches, parsedArgs); + + List providers = argumentProviderPluginManager.createAllTypes(); + + for (ParsingEngineArgumentProvider provider: providers) { + // Load the arguments ONLY into the provider. + // Validation may optionally run on the rest of the arguments. + loadArgumentsIntoObject(provider); + } + + for (ParsingEngineArgumentProvider provider: providers) { + provider.parse(this, parsedArgs); + } + + return parsedArgs; + } + + public void parse(ArgumentMatchSource matchSource, List tokens, + ArgumentMatches argumentMatches, SortedMap parsedArgs) { + ArgumentMatchSite lastArgumentMatchSite = new ArgumentMatchSite(matchSource, -1); + + int i = 0; + for (String token: tokens) { + // If the token is of argument form, parse it into its own argument match. + // Otherwise, pair it with the most recently used argument discovered. 
+ ArgumentMatchSite site = new ArgumentMatchSite(matchSource, i); + if( isArgumentForm(token) ) { + ArgumentMatch argumentMatch = parseArgument( token, site ); + if( argumentMatch != null ) { + argumentMatches.mergeInto( argumentMatch ); + lastArgumentMatchSite = site; + } + } + else { + if( argumentMatches.hasMatch(lastArgumentMatchSite) && + !argumentMatches.getMatch(lastArgumentMatchSite).hasValueAtSite(lastArgumentMatchSite)) + argumentMatches.getMatch(lastArgumentMatchSite).addValue( lastArgumentMatchSite, new ArgumentMatchStringValue(token) ); + else + argumentMatches.MissingArgument.addValue( site, new ArgumentMatchStringValue(token) ); + + } + i++; + } + + parsedArgs.put(matchSource, new ParsedListArgs(tokens)); + } + + public void parsePairs(ArgumentMatchSource matchSource, List> tokens, + ArgumentMatches argumentMatches, ParsedArgs matchSourceArgs, + SortedMap parsedArgs) { + int i = 0; + for (Pair pair: tokens) { + + ArgumentMatchSite site = new ArgumentMatchSite(matchSource, i); + List matchers = Arrays.asList(ArgumentDefinitions.FullNameDefinitionMatcher, ArgumentDefinitions.ShortNameDefinitionMatcher); + ArgumentDefinition definition = null; + for (DefinitionMatcher matcher: matchers) { + definition = argumentDefinitions.findArgumentDefinition( pair.getFirst(), matcher ); + if (definition != null) + break; + } + if (definition == null) + continue; + ArgumentMatch argumentMatch = new ArgumentMatch(pair.getFirst(), definition, site, new Tags()); + argumentMatches.mergeInto(argumentMatch); + argumentMatch.addValue(site, pair.getSecond()); + i++; + } + + parsedArgs.put(matchSource, matchSourceArgs); + } + + protected List getArguments(File file) { + try { + if (file.getAbsolutePath().endsWith(".list")) { + return getListArguments(file); + } + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(file, e); + } + throw new UserException.CouldNotReadInputFile(file, "file extension is not .list"); + } + + private List getListArguments(File 
file) throws IOException { + ArrayList argsList = new ArrayList(); + for (String line: FileUtils.readLines(file)) + argsList.addAll(Arrays.asList(Utils.escapeExpressions(line))); + return argsList; + } + + public enum ValidationType { MissingRequiredArgument, + InvalidArgument, + InvalidArgumentValue, + ValueMissingArgument, + TooManyValuesForArgument, + MutuallyExclusive } + + /** + * Validates the list of command-line argument matches. + */ + public void validate() { + validate( EnumSet.noneOf(ValidationType.class) ); + } + + /** + * Validates the list of command-line argument matches. On failure throws an exception with detailed info about the + * particular failures. Takes an EnumSet indicating which validation checks to skip. + * @param skipValidationOf List of validation checks to skip. + */ + public void validate( EnumSet skipValidationOf ) { + // Find missing required arguments. + if( !skipValidationOf.contains(ValidationType.MissingRequiredArgument) ) { + Collection requiredArguments = + argumentDefinitions.findArgumentDefinitions( true, ArgumentDefinitions.RequiredDefinitionMatcher ); + Collection missingArguments = new ArrayList(); + for( ArgumentDefinition requiredArgument: requiredArguments ) { + if( !argumentMatches.hasMatch(requiredArgument) ) + missingArguments.add( requiredArgument ); + } + + if( missingArguments.size() > 0 ) + throw new MissingArgumentException( missingArguments ); + } + + // Find invalid arguments. Invalid arguments will have a null argument definition. + if( !skipValidationOf.contains(ValidationType.InvalidArgument) ) { + ArgumentMatches invalidArguments = argumentMatches.findUnmatched(); + if( invalidArguments.size() > 0 ) + throw new InvalidArgumentException( invalidArguments ); + } + + // Find invalid argument values -- invalid arguments are either completely missing or fail the specified 'validation' regular expression. 
+ if( !skipValidationOf.contains(ValidationType.InvalidArgumentValue) ) { + Collection verifiableArguments = + argumentDefinitions.findArgumentDefinitions( null, ArgumentDefinitions.VerifiableDefinitionMatcher ); + Collection> invalidValues = new ArrayList>(); + for( ArgumentDefinition verifiableArgument: verifiableArguments ) { + ArgumentMatches verifiableMatches = argumentMatches.findMatches( verifiableArgument ); + // Check to see whether an argument value was specified. Argument values must be provided + // when the argument name is specified and the argument is not a flag type. + for(ArgumentMatch verifiableMatch: verifiableMatches) { + ArgumentSource argumentSource = argumentSourcesByDefinition.get(verifiableArgument); + if(verifiableMatch.values().size() == 0 && !verifiableArgument.isFlag && argumentSource.createsTypeDefault()) + invalidValues.add(new Pair(verifiableArgument,null)); + } + + // Ensure that the field contents meet the validation criteria specified by the regular expression. + for( ArgumentMatch verifiableMatch: verifiableMatches ) { + for( ArgumentMatchValue value: verifiableMatch.values() ) { + if( verifiableArgument.validation != null && !value.asString().matches(verifiableArgument.validation) ) + invalidValues.add( new Pair(verifiableArgument, value.asString()) ); + } + } + } + + if( invalidValues.size() > 0 ) + throw new InvalidArgumentValueException( invalidValues ); + } + + // Find values without an associated mate. + if( !skipValidationOf.contains(ValidationType.ValueMissingArgument) ) { + if( argumentMatches.MissingArgument.values().size() > 0 ) + throw new UnmatchedArgumentException( argumentMatches.MissingArgument ); + } + + // Find arguments with too many values. 
+ if( !skipValidationOf.contains(ValidationType.TooManyValuesForArgument)) { + Collection overvaluedArguments = new ArrayList(); + for( ArgumentMatch argumentMatch: argumentMatches.findSuccessfulMatches() ) { + // Warning: assumes that definition is not null (asserted by checks above). + if( !argumentMatch.definition.isMultiValued && argumentMatch.values().size() > 1 ) + overvaluedArguments.add(argumentMatch); + } + + if( !overvaluedArguments.isEmpty() ) + throw new TooManyValuesForArgumentException(overvaluedArguments); + } + + // Find sets of options that are supposed to be mutually exclusive. + if( !skipValidationOf.contains(ValidationType.MutuallyExclusive)) { + Collection> invalidPairs = new ArrayList>(); + for( ArgumentMatch argumentMatch: argumentMatches.findSuccessfulMatches() ) { + if( argumentMatch.definition.exclusiveOf != null ) { + for( ArgumentMatch conflictingMatch: argumentMatches.findSuccessfulMatches() ) { + // Skip over the current element. + if( argumentMatch == conflictingMatch ) + continue; + if( argumentMatch.definition.exclusiveOf.equals(conflictingMatch.definition.fullName) || + argumentMatch.definition.exclusiveOf.equals(conflictingMatch.definition.shortName)) + invalidPairs.add( new Pair(argumentMatch, conflictingMatch) ); + } + } + } + + if( !invalidPairs.isEmpty() ) + throw new ArgumentsAreMutuallyExclusiveException( invalidPairs ); + } + } + + /** + * Loads a set of matched command-line arguments into the given object. + * @param object Object into which to add arguments. + */ + public void loadArgumentsIntoObject( Object object ) { + loadArgumentsIntoObject(object, true); + } + + /** + * Loads a set of matched command-line arguments into the given object. + * @param object Object into which to add arguments. + * @param enforceArgumentRanges If true, check that the argument value is within the range specified + * in the corresponding Argument annotation by min/max value attributes. 
This + * check is only performed for numeric types, and only when a min and/or + * max value is actually defined in the annotation. It is also only performed + * for values actually specified on the command line, and not for default values. + */ + public void loadArgumentsIntoObject( Object object, boolean enforceArgumentRanges ) { + List argumentSources = extractArgumentSources(object.getClass()); + + List dependentArguments = new ArrayList(); + + for( ArgumentSource argumentSource: argumentSources ) { + if(argumentSource.isDeprecated() && argumentMatches.findMatches(this,argumentSource).size() > 0) + notifyDeprecatedCommandLineArgument(argumentSource); + + // If this argument source depends on other command-line arguments, skip it and make a note to process it later. + if(argumentSource.isDependent()) { + dependentArguments.add(argumentSource); + continue; + } + loadValueIntoObject(argumentSource, object, argumentMatches.findMatches(this,argumentSource), enforceArgumentRanges); + } + + for(ArgumentSource dependentArgument: dependentArguments) { + MultiplexArgumentTypeDescriptor dependentDescriptor = dependentArgument.createDependentTypeDescriptor(this,object); + ArgumentSource dependentSource = dependentArgument.copyWithCustomTypeDescriptor(dependentDescriptor); + loadValueIntoObject(dependentSource,object,argumentMatches.findMatches(this,dependentSource), enforceArgumentRanges); + } + } + + /** + * Notify the user that tags have been created. + * @param key The key created. + * @param tags List of tags, or empty list if no tags are present. + */ + public void addTags(Object key, final Tags tags) { + this.tags.put(key,tags); + } + + /** + * Gets the tags associated with a given object. + * @param key Key for which to find a tag. + * @return List of tags associated with this key. + */ + public Tags getTags(Object key) { + if(!tags.containsKey(key)) + return new Tags(); + return tags.get(key); + } + + /** + * Add a RodBinding type argument to this parser. 
Called during parsing to allow + * us to track all of the RodBindings discovered in the command line. + * @param rodBinding the rodbinding to add. Must not be added twice + */ + @Requires("rodBinding != null") + public void addRodBinding(final RodBinding rodBinding) { + rodBindings.add(rodBinding); + } + + /** + * Notify the user that a deprecated command-line argument has been used. + * @param argumentSource Deprecated argument source specified by user. + */ + private void notifyDeprecatedCommandLineArgument(ArgumentSource argumentSource) { + // Grab the first argument definition and report that one as the failure. Theoretically, we should notify of all failures. + List definitions = argumentSource.createArgumentDefinitions(); + if(definitions.size() < 1) + throw new ReviewedStingException("Internal error. Argument source creates no definitions."); + ArgumentDefinition definition = definitions.get(0); + throw new UserException.DeprecatedArgument(definition.fullName,definition.doc); + } + + /** + * Loads a single argument into the object and that objects children. + * @param argumentMatches Argument matches to load into the object. + * @param source Argument source to load into the object. + * @param instance Object into which to inject the value. The target might be in a container within the instance. + * @param enforceArgumentRanges If true, check that the argument value is within the range specified + * in the corresponding Argument annotation by min/max value attributes. This + * check is only performed for numeric types, and only when a min and/or + * max value is actually defined in the annotation. It is also only performed + * for values actually specified on the command line, and not for default values. + */ + private void loadValueIntoObject( ArgumentSource source, Object instance, ArgumentMatches argumentMatches, boolean enforceArgumentRanges ) { + // Nothing to load + if( argumentMatches.size() == 0 && ! 
source.createsTypeDefault() ) + return; + + // Target instance into which to inject the value. + Collection targets = findTargets( source, instance ); + + // Abort if no home is found for the object. + if( targets.size() == 0 ) + throw new ReviewedStingException("Internal command-line parser error: unable to find a home for argument matches " + argumentMatches); + + for( Object target: targets ) { + Object value; + boolean usedTypeDefault = false; + if ( argumentMatches.size() != 0 ) { + value = source.parse(this,argumentMatches); + } + else { + value = source.createTypeDefault(this); + usedTypeDefault = true; + } + + // Only check argument ranges if a check was requested AND we used a value from the command line rather + // than the type default + if ( enforceArgumentRanges && ! usedTypeDefault ) { + checkArgumentRange(source, value); + } + + JVMUtils.setFieldValue(source.field,target,value); + } + } + + /** + * Check the provided value against any range constraints specified in the Argument annotation + * for the corresponding field. Throw an exception if hard limits are violated, or emit a warning + * if soft limits are violated. + * + * Only checks numeric types (int, double, etc.) + * Only checks fields with an actual @Argument annotation + * Only checks manually-specified constraints (there are no default constraints). + * + * @param argumentSource The source field for the command-line argument + * @param argumentValue The value we're considering putting in that source field + */ + private void checkArgumentRange( final ArgumentSource argumentSource, final Object argumentValue ) { + // Only validate numeric types + if ( ! 
(argumentValue instanceof Number) ) { + return; + } + final double argumentDoubleValue = ((Number)argumentValue).doubleValue(); + + // Only validate fields with an @Argument annotation + final Annotation argumentAnnotation = argumentSource.field.getAnnotation(Argument.class); + if ( argumentAnnotation == null ) { + return; + } + + final double minValue = (Double)CommandLineUtils.getValue(argumentAnnotation, "minValue"); + final double maxValue = (Double)CommandLineUtils.getValue(argumentAnnotation, "maxValue"); + final double minRecommendedValue = (Double)CommandLineUtils.getValue(argumentAnnotation, "minRecommendedValue"); + final double maxRecommendedValue = (Double)CommandLineUtils.getValue(argumentAnnotation, "maxRecommendedValue"); + final String argumentName = (String)CommandLineUtils.getValue(argumentAnnotation, "fullName"); + + // Check hard limits first, if specified + if ( minValue != Double.NEGATIVE_INFINITY && argumentDoubleValue < minValue ) { + throw new ArgumentValueOutOfRangeException(argumentName, argumentDoubleValue, minValue, "minimum"); + } + + if ( maxValue != Double.POSITIVE_INFINITY && argumentDoubleValue > maxValue ) { + throw new ArgumentValueOutOfRangeException(argumentName, argumentDoubleValue, maxValue, "maximum"); + } + + // Then check soft limits, if specified + if ( minRecommendedValue != Double.NEGATIVE_INFINITY && argumentDoubleValue < minRecommendedValue ) { + logger.warn(String.format("WARNING: argument --%s has value %.2f, but minimum recommended value is %.2f", + argumentName, argumentDoubleValue, minRecommendedValue)); + } + + if ( maxRecommendedValue != Double.POSITIVE_INFINITY && argumentDoubleValue > maxRecommendedValue ) { + logger.warn(String.format("WARNING: argument --%s has value %.2f, but maximum recommended value is %.2f", + argumentName, argumentDoubleValue, maxRecommendedValue)); + } + } + + public Collection getRodBindings() { + return Collections.unmodifiableCollection(rodBindings); + } + + /** + * Gets a 
collection of the container instances of the given type stored within the given target. + * @param source Argument source. + * @param instance Container. + * @return A collection of containers matching the given argument source. + */ + private Collection findTargets(ArgumentSource source, Object instance) { + LinkedHashSet targets = new LinkedHashSet(); + for( Class clazz = instance.getClass(); clazz != null; clazz = clazz.getSuperclass() ) { + for( Field field: clazz.getDeclaredFields() ) { + if( field.equals(source.field) ) { + targets.add(instance); + } else if( field.isAnnotationPresent(ArgumentCollection.class) ) { + targets.addAll(findTargets(source, JVMUtils.getFieldValue(field, instance))); + } + } + } + return targets; + } + + /** + * Prints out the help associated with these command-line argument definitions. + * @param applicationDetails Details about the specific GATK-based application being run. + */ + public void printHelp( ApplicationDetails applicationDetails ) { + new HelpFormatter().printHelp(applicationDetails,argumentDefinitions); + } + + /** + * Extract all the argument sources from a given object. + * @param sourceClass class to act as sources for other arguments. + * @return A list of sources associated with this object and its aggregated objects. + */ + public List extractArgumentSources(Class sourceClass) { + return extractArgumentSources(sourceClass, new Field[0]); + } + + /** + * Fetch the best command-line argument descriptor for the given class. + * @param type Class for which to specify a descriptor. + * @return descriptor for the given type. 
+ */ + public ArgumentTypeDescriptor selectBestTypeDescriptor(Class type) { + return ArgumentTypeDescriptor.selectBest(argumentTypeDescriptors,type); + } + + private List extractArgumentSources(Class sourceClass, Field[] parentFields) { + // now simply call into the truly general routine extract argument bindings but with a null + // object so bindings aren't computed + Map bindings = extractArgumentBindings(null, sourceClass, parentFields); + return new ArrayList(bindings.keySet()); + } + + public Map extractArgumentBindings(Object obj) { + if ( obj == null ) throw new IllegalArgumentException("Incoming object cannot be null"); + return extractArgumentBindings(obj, obj.getClass(), new Field[0]); + } + + /** + * Extract all the argument sources from a given object, along with their bindings if obj != null . + * @param obj the object corresponding to the sourceClass + * @param sourceClass class to act as sources for other arguments. + * @param parentFields Parent Fields + * @return A map of sources associated with this object and its aggregated objects and bindings to their bindings values + */ + private Map extractArgumentBindings(Object obj, Class sourceClass, Field[] parentFields) { + Map bindings = new LinkedHashMap(); + + while( sourceClass != null ) { + Field[] fields = sourceClass.getDeclaredFields(); + for( Field field: fields ) { + if( ArgumentTypeDescriptor.isArgumentAnnotationPresent(field) ) { + Object val = obj != null ? JVMUtils.getFieldValue(field, obj) : null; + bindings.put( new ArgumentSource(parentFields, field, selectBestTypeDescriptor(field.getType())), val ); + } + if( field.isAnnotationPresent(ArgumentCollection.class) ) { + Object val = obj != null ? 
JVMUtils.getFieldValue(field, obj) : null; + Field[] newParentFields = Arrays.copyOf(parentFields, parentFields.length + 1); + newParentFields[parentFields.length] = field; + bindings.putAll( extractArgumentBindings(val, field.getType(), newParentFields) ); + } + } + + sourceClass = sourceClass.getSuperclass(); + } + + return bindings; + } + + /** + * Determines whether a token looks like the name of an argument. + * @param token Token to inspect. Can be surrounded by whitespace. + * @return True if token is of short name form. + */ + private boolean isArgumentForm( String token ) { + for( ParsingMethod parsingMethod: parsingMethods ) { + if( parsingMethod.matches(token) ) + return true; + } + + return false; + } + + /** + * Parse a short name into an ArgumentMatch. + * @param token The token to parse. The token should pass the isLongArgumentForm test. + * @param position The position of the token in question. + * @return ArgumentMatch associated with this token, or null if no match exists. + */ + private ArgumentMatch parseArgument( String token, ArgumentMatchSite position ) { + if( !isArgumentForm(token) ) + throw new IllegalArgumentException( "Token is not recognizable as an argument: " + token ); + + for( ParsingMethod parsingMethod: parsingMethods ) { + if( parsingMethod.matches( token ) ) + return parsingMethod.match( argumentDefinitions, token, position ); + } + + // No parse results found. + return null; + } +} + +/** + * An exception indicating that some required arguments are missing. 
+ */ +class MissingArgumentException extends ArgumentException { + public MissingArgumentException( Collection missingArguments ) { + super( formatArguments(missingArguments) ); + } + + private static String formatArguments( Collection missingArguments ) { + StringBuilder sb = new StringBuilder(); + for( ArgumentDefinition missingArgument: missingArguments ) { + if( missingArgument.shortName != null ) + sb.append( String.format("%nArgument with name '--%s' (-%s) is missing.", missingArgument.fullName, missingArgument.shortName) ); + else + sb.append( String.format("%nArgument with name '--%s' is missing.", missingArgument.fullName) ); + } + return sb.toString(); + } +} + +/** + * An exception for undefined arguments. + */ +class InvalidArgumentException extends ArgumentException { + public InvalidArgumentException( ArgumentMatches invalidArguments ) { + super( formatArguments(invalidArguments) ); + } + + private static String formatArguments( ArgumentMatches invalidArguments ) { + StringBuilder sb = new StringBuilder(); + for( ArgumentMatch invalidArgument: invalidArguments ) + sb.append( String.format("%nArgument with name '%s' isn't defined.", invalidArgument.label) ); + return sb.toString(); + } +} + +/** + * An exception for values whose format is invalid. 
+ */ +class InvalidArgumentValueException extends ArgumentException { + public InvalidArgumentValueException( Collection> invalidArgumentValues ) { + super( formatArguments(invalidArgumentValues) ); + } + + private static String formatArguments( Collection> invalidArgumentValues ) { + StringBuilder sb = new StringBuilder(); + for( Pair invalidValue: invalidArgumentValues ) { + if(invalidValue.getSecond() == null) + sb.append( String.format("%nArgument '--%s' requires a value but none was provided", + invalidValue.first.fullName) ); + else + sb.append( String.format("%nArgument '--%s' has value of incorrect format: %s (should match %s)", + invalidValue.first.fullName, + invalidValue.second, + invalidValue.first.validation) ); + } + return sb.toString(); + } +} + +class ArgumentValueOutOfRangeException extends ArgumentException { + public ArgumentValueOutOfRangeException( final String argumentName, final double argumentActualValue, + final double argumentBoundaryValue, final String argumentBoundaryType ) { + super(String.format("Argument --%s has value %.2f, but %s allowed value is %.2f", + argumentName, argumentActualValue, argumentBoundaryType, argumentBoundaryValue)); + } +} + +/** + * An exception for values that can't be mated with any argument. 
+ */ +class UnmatchedArgumentException extends ArgumentException { + public UnmatchedArgumentException( ArgumentMatch invalidValues ) { + super( formatArguments(invalidValues) ); + } + + private static String formatArguments( ArgumentMatch invalidValues ) { + StringBuilder sb = new StringBuilder(); + for( ArgumentMatchSite site: invalidValues.sites.keySet() ) + for( ArgumentMatchValue value: invalidValues.sites.get(site) ) { + switch (site.getSource().getType()) { + case CommandLine: + sb.append( String.format("%nInvalid argument value '%s' at position %d.", + value.asString(), site.getIndex()) ); + break; + case Provider: + sb.append( String.format("%nInvalid argument value '%s' in %s at position %d.", + value.asString(), site.getSource().getDescription(), site.getIndex()) ); + break; + default: + throw new RuntimeException( String.format("Unexpected argument match source type: %s", + site.getSource().getType())); + } + if(value.asString() != null && Utils.dupString(' ',value.asString().length()).equals(value.asString())) + sb.append(" Please make sure any line continuation backslashes on your command line are not followed by whitespace."); + } + return sb.toString(); + } +} + +/** + * An exception indicating that too many values have been provided for the given argument. + */ +class TooManyValuesForArgumentException extends ArgumentException { + public TooManyValuesForArgumentException( Collection arguments ) { + super( formatArguments(arguments) ); + } + + private static String formatArguments( Collection arguments ) { + StringBuilder sb = new StringBuilder(); + for( ArgumentMatch argument: arguments ) + sb.append( String.format("%nArgument '%s' has too many values: %s.", argument.label, Arrays.deepToString(argument.values().toArray())) ); + return sb.toString(); + } +} + +/** + * An exception indicating that mutually exclusive options have been passed in the same command line. 
+ */ +class ArgumentsAreMutuallyExclusiveException extends ArgumentException { + public ArgumentsAreMutuallyExclusiveException( Collection> arguments ) { + super( formatArguments(arguments) ); + } + + private static String formatArguments( Collection> arguments ) { + StringBuilder sb = new StringBuilder(); + for( Pair argument: arguments ) + sb.append( String.format("%nArguments '%s' and '%s' are mutually exclusive.", argument.first.definition.fullName, argument.second.definition.fullName ) ); + return sb.toString(); + } + +} + + +/** + * An exception for when an argument doesn't match an of the enumerated options for that var type + */ +class UnknownEnumeratedValueException extends ArgumentException { + public UnknownEnumeratedValueException(ArgumentDefinition definition, String argumentPassed) { + super( formatArguments(definition,argumentPassed) ); + } + + private static String formatArguments(ArgumentDefinition definition, String argumentPassed) { + return String.format("Invalid value %s specified for argument %s; valid options are (%s).", argumentPassed, definition.fullName, Utils.join(",",definition.validOptions)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngineArgumentFiles.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingEngineArgumentFiles.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ParsingEngineArgumentFiles.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingEngineArgumentFiles.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngineArgumentProvider.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingEngineArgumentProvider.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ParsingEngineArgumentProvider.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingEngineArgumentProvider.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingMethod.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingMethod.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/RodBinding.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/RodBinding.java new file mode 100644 index 000000000..87fa85858 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/RodBinding.java @@ -0,0 +1,197 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.commandline; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broad.tribble.Feature; + +import java.util.*; + +/** + * A RodBinding represents a walker argument that gets bound to a ROD track. + * + * The RodBinding is a formal GATK argument that bridges between a walker and + * the RefMetaDataTracker to obtain data about this rod track at runtime. The RodBinding + * is explicitly typed with type of the Tribble.Feature expected to be produced by this + * argument. The GATK Engine takes care of initializing the binding and connecting it + * to the RMD system. + * + * It is recommended that optional RodBindings be initialized to the value returned + * by the static method makeUnbound(). + * + * Note that this class is immutable. + */ +public final class RodBinding { + protected final static String UNBOUND_VARIABLE_NAME = ""; + protected final static String UNBOUND_SOURCE = "UNBOUND"; + protected final static String UNBOUND_TRIBBLE_TYPE = ""; + + /** + * Create an unbound Rodbinding of type. This is the correct programming + * style for an optional RodBinding + * + * At Input() + * RodBinding x = RodBinding.makeUnbound(T.class) + * + * The unbound binding is guaranteed to never match any binding. It uniquely + * returns false to isBound(). + * + * @param type the Class type produced by this unbound object + * @param any class extending Tribble Feature + * @return the UNBOUND RodBinding producing objects of type T + */ + @Requires("type != null") + protected final static RodBinding makeUnbound(Class type) { + return new RodBinding(type); + } + + /** The name of this binding. Often the name of the field itself, but can be overridden on cmdline */ + final private String name; + /** where the data for this ROD is coming from. A file or special value if coming from stdin */ + final private String source; + /** the string name of the tribble type, such as vcf, bed, etc. 
*/ + final private String tribbleType; + /** The command line tags associated with this RodBinding */ + final private Tags tags; + /** The Java class expected for this RodBinding. Must correspond to the type emitted by Tribble */ + final private Class type; + /** True for all RodBindings except the special UNBOUND binding, which is the default for optional arguments */ + final private boolean bound; + + /** + * The name counter. This is how we create unique names for collections of RodBindings + * on the command line. If you have provide the GATK with -X file1 and -X file2 to a + * RodBinding argument as List> then each binding will receive automatically + * the name of X and X2. + */ + final private static Map nameCounter = new HashMap(); + + /** for UnitTests */ + final public static void resetNameCounter() { + nameCounter.clear(); + } + + @Requires("rawName != null") + @Ensures("result != null") + final private static synchronized String countedVariableName(final String rawName) { + Integer count = nameCounter.get(rawName); + if ( count == null ) { + nameCounter.put(rawName, 1); + return rawName; + } else { + nameCounter.put(rawName, count + 1); + return rawName + (count + 1); + } + } + + @Requires({"type != null", "rawName != null", "source != null", "tribbleType != null", "tags != null"}) + public RodBinding(Class type, final String rawName, final String source, final String tribbleType, final Tags tags) { + this.type = type; + this.name = countedVariableName(rawName); + this.source = source; + this.tribbleType = tribbleType; + this.tags = tags; + this.bound = true; + } + + /** + * For testing purposes only. Creates a RodBinding sufficient for looking up associations to rawName + * @param type + * @param rawName + */ + public RodBinding(Class type, final String rawName) { + this(type, rawName, "missing", type.getSimpleName(), new Tags()); + } + + /** + * Make an unbound RodBinding. 
Only available for creating the globally unique UNBOUND object + * @param type class this unbound RodBinding creates + */ + @Requires({"type != null"}) + private RodBinding(Class type) { + this.type = type; + this.name = UNBOUND_VARIABLE_NAME; // special value can never be found in RefMetaDataTracker + this.source = UNBOUND_SOURCE; + this.tribbleType = UNBOUND_TRIBBLE_TYPE; + this.tags = new Tags(); + this.bound = false; + } + + + /** + * @return True for all RodBindings except the special UNBOUND binding, which is the default for optional arguments + */ + final public boolean isBound() { + return bound; + } + + /** + * @return The name of this binding. Often the name of the field itself, but can be overridden on cmdline + */ + @Ensures({"result != null"}) + final public String getName() { + return name; + } + + /** + * @return the string name of the tribble type, such as vcf, bed, etc. + */ + @Ensures({"result != null"}) + final public Class getType() { + return type; + } + + /** + * @return where the data for this ROD is coming from. A file or special value if coming from stdin + */ + @Ensures({"result != null"}) + final public String getSource() { + return source; + } + + /** + * @return The command line tags associated with this RodBinding. Will include the tags used to + * determine the name and type of this RodBinding + */ + @Ensures({"result != null"}) + final public Tags getTags() { + return tags; + } + + /** + * @return The Java class expected for this RodBinding. 
Must correspond to the type emited by Tribble + */ + @Ensures({"result != null"}) + final public String getTribbleType() { + return tribbleType; + } + + @Override + public String toString() { + return String.format("(RodBinding name=%s source=%s)", getName(), getSource()); + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/RodBindingCollection.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/RodBindingCollection.java new file mode 100644 index 000000000..d8306ea5a --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/RodBindingCollection.java @@ -0,0 +1,89 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.commandline; + +import com.google.java.contract.Ensures; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; +import java.util.*; + +/** + * A RodBindingCollection represents a collection of RodBindings. + * + * The RodBindingCollection is a formal GATK argument that is used to specify a file of RodBindings. + * + */ +public final class RodBindingCollection { + + /** The Java class expected for this RodBinding. Must correspond to the type emitted by Tribble */ + final private Class type; + + private Collection> rodBindings; + + public RodBindingCollection(final Class type, final Collection> rodBindings) { + this.type = type; + this.rodBindings = Collections.unmodifiableCollection(rodBindings); + } + + /** + * @return the collection of RodBindings + */ + final public Collection> getRodBindings() { + return rodBindings; + } + + /** + * @return the string name of the tribble type, such as vcf, bed, etc. 
+ */ + @Ensures({"result != null"}) + final public Class getType() { + return type; + } + + @Override + public String toString() { + return String.format("(RodBindingCollection %s)", getRodBindings()); + } + + /** + * Utility method to help construct a RodBindingCollection of the given Feature type + * + * @param type the Feature type + * @param rodBindings the rod bindings to put into the collection + * @return a new RodBindingCollection object + */ + public static Object createRodBindingCollectionOfType(final Class type, final Collection rodBindings) { + try { + final Constructor ctor = RodBindingCollection.class.getConstructor(Class.class, Collection.class); + return ctor.newInstance(type, rodBindings); + } catch (final Exception e) { + throw new IllegalStateException("Failed to create a RodBindingCollection for type " + type); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/Tags.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Tags.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/Tags.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Tags.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/package-info.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/CommandLineExecutable.java new file mode 100644 index 000000000..86ecaffe0 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/CommandLineExecutable.java @@ -0,0 +1,229 @@ +/* +* Copyright (c) 2012 The 
Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.ArgumentTypeDescriptor; +import org.broadinstitute.sting.commandline.CommandLineProgram; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.io.stubs.OutputStreamArgumentTypeDescriptor; +import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor; +import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor; +import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; +import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.crypt.CryptUtils; +import org.broadinstitute.sting.utils.crypt.GATKKey; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.ListFileUtils; + +import java.security.PublicKey; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; + +/** + * @author aaron + */ +public abstract class CommandLineExecutable extends CommandLineProgram { + /** + * The actual engine which performs the analysis. + */ + protected GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + + // get the analysis name + public abstract String getAnalysisName(); + + /** + * Gets the GATK argument bundle. + * @return A structure consisting of whatever arguments should be used to initialize the GATK engine. + */ + protected abstract GATKArgumentCollection getArgumentCollection(); + + /** + * A list of all the arguments initially used as sources. 
+ */ + private final Collection argumentSources = new ArrayList(); + + protected static Logger logger = Logger.getLogger(CommandLineExecutable.class); + + /** + * this is the function that the inheriting class can expect to have called + * when the command line system has initialized. + * + * @return the return code to exit the program with + */ + protected int execute() throws Exception { + engine.setParser(parser); + argumentSources.add(this); + + Walker walker = engine.getWalkerByName(getAnalysisName()); + + try { + // Make sure a valid GATK user key is present, if required. + authorizeGATKRun(); + + engine.setArguments(getArgumentCollection()); + + // File lists can require a bit of additional expansion. Set these explicitly by the engine. + final Collection bamFileList=ListFileUtils.unpackBAMFileList(getArgumentCollection().samFiles,parser); + engine.setSAMFileIDs(bamFileList); + if(getArgumentCollection().showFullBamList){ + logger.info(String.format("Adding the following input SAM Files: %s",bamFileList.toString())); + } + + engine.setWalker(walker); + walker.setToolkit(engine); + + Collection filters = engine.createFilters(); + engine.setFilters(filters); + + // load the arguments into the walker / filters. + // TODO: The fact that this extra load call exists here when all the parsing happens at the engine + // TODO: level indicates that we're doing something wrong. Turn this around so that the GATK can drive + // TODO: argument processing. 
+ loadArgumentsIntoObject(walker); + argumentSources.add(walker); + + Collection rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser); + engine.setReferenceMetaDataFiles(rodBindings); + + for (ReadFilter filter: filters) { + loadArgumentsIntoObject(filter); + argumentSources.add(filter); + } + + engine.execute(); + generateGATKRunReport(walker); + } catch ( Exception e ) { + generateGATKRunReport(walker, e); + throw e; + } + + // always return 0 + return 0; + } + + /** + * Authorizes this run of the GATK by checking for a valid GATK user key, if required. + * Currently, a key is required only if running with the -et NO_ET or -et STDOUT options. + */ + private void authorizeGATKRun() { + if ( getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.NO_ET || + getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.STDOUT ) { + if ( getArgumentCollection().gatkKeyFile == null ) { + throw new UserException("Running with the -et NO_ET or -et STDOUT option requires a GATK Key file. " + + "Please see " + UserException.PHONE_HOME_DOCS_URL + + " for more information and instructions on how to obtain a key."); + } + else { + PublicKey gatkPublicKey = CryptUtils.loadGATKDistributedPublicKey(); + GATKKey gatkUserKey = new GATKKey(gatkPublicKey, getArgumentCollection().gatkKeyFile); + + if ( ! gatkUserKey.isValid() ) { + throw new UserException.KeySignatureVerificationException(getArgumentCollection().gatkKeyFile); + } + } + } + } + + /** + * Generate the GATK run report for this walker using the current GATKEngine, if -et is enabled. + * This report will be written to either STDOUT or to the run repository, depending on the options + * for -et. 
+ * + * @param e the exception, can be null if no exception occurred + */ + private void generateGATKRunReport(Walker walker, Exception e) { + if ( getArgumentCollection().phoneHomeType != GATKRunReport.PhoneHomeOption.NO_ET ) { + GATKRunReport report = new GATKRunReport(walker, e, engine, getArgumentCollection().phoneHomeType ); + report.postReport(getArgumentCollection().phoneHomeType); + } + } + + /** + * Convenience method for fully parameterized generateGATKRunReport when an exception has + * not occurred + * + * @param walker + */ + private void generateGATKRunReport(Walker walker) { + generateGATKRunReport(walker, null); + } + + /** + * Subclasses of CommandLinePrograms can provide their own types of command-line arguments. + * @return A collection of type descriptors generating implementation-dependent placeholders. + */ + protected Collection getArgumentTypeDescriptors() { + return Arrays.asList( new VCFWriterArgumentTypeDescriptor(engine,System.out,argumentSources), + new SAMFileWriterArgumentTypeDescriptor(engine,System.out), + new OutputStreamArgumentTypeDescriptor(engine,System.out) ); + } + + /** + * GATK can add arguments dynamically based on analysis type. + * + * @return true + */ + @Override + protected boolean canAddArgumentsDynamically() { + return true; + } + + /** + * GATK provides the walker as an argument source. + * @return List of walkers to load dynamically. + */ + @Override + protected Class[] getArgumentSources() { + // No walker info? No plugins. 
+ if (getAnalysisName() == null) return new Class[] {}; + + Collection argumentSources = new ArrayList(); + + Walker walker = engine.getWalkerByName(getAnalysisName()); + engine.setArguments(getArgumentCollection()); + engine.setWalker(walker); + walker.setToolkit(engine); + argumentSources.add(walker.getClass()); + + Collection filters = engine.createFilters(); + for(ReadFilter filter: filters) + argumentSources.add(filter.getClass()); + + Class[] argumentSourcesAsArray = new Class[argumentSources.size()]; + return argumentSources.toArray(argumentSourcesAsArray); + } + + @Override + protected String getArgumentSourceName( Class argumentSource ) { + return engine.getWalkerName((Class)argumentSource); + } + +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/CommandLineGATK.java new file mode 100644 index 000000000..728fee5c8 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -0,0 +1,385 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk; + +import net.sf.picard.PicardException; +import net.sf.samtools.SAMException; +import org.broad.tribble.TribbleException; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.CommandLineProgram; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; +import org.broadinstitute.sting.gatk.walkers.Attribution; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.*; +import org.broadinstitute.sting.utils.text.TextFormattingUtils; + +import java.util.*; + +/** + * All command line parameters accepted by all tools in the GATK. + * + *

Info for general users

+ * + *

This is a list of options and parameters that are generally available to all tools in the GATK.

+ * + *

There may be a few restrictions, which are indicated in individual argument descriptions. For example the -BQSR + * argument is only meant to be used with a subset of tools, and the -pedigree argument will only be effectively used + * by a subset of tools as well. Some arguments conflict with others, and some conversely are dependent on others. This + * is all indicated in the detailed argument descriptions, so be sure to read those in their entirety rather than just + * skimming the one-line summary in the table.

+ * + *

Info for developers

+ * + *

This class is the command-line front end to the GATK engine, which manages map/reduce data access and runs walkers.

+ * + *

We run command line GATK programs using this class. It gets the command line args, parses them, and hands the + * GATK all the parsed-out information. Pretty much anything dealing with the underlying system should go here; + * the GATK engine should deal with any data-related information.

+ */ +@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_ENGINE) +public class CommandLineGATK extends CommandLineExecutable { + /** + * A complete list of tools (sometimes also called walkers because they "walk" through the data to perform analyses) + * is available in the online documentation. + */ + @Argument(fullName = "analysis_type", shortName = "T", doc = "Name of the tool to run") + private String analysisName = null; + + // our argument collection, the collection of command line args we accept + @ArgumentCollection + private GATKArgumentCollection argCollection = new GATKArgumentCollection(); + + /** + * Get pleasing info about the GATK. + * + * @return A list of Strings that contain pleasant info about the GATK. + */ + @Override + protected ApplicationDetails getApplicationDetails() { + return new ApplicationDetails(createApplicationHeader(), + getAttribution(), + ApplicationDetails.createDefaultRunningInstructions(getClass()), + getAdditionalHelp()); + } + + @Override + public String getAnalysisName() { + return analysisName; + } + + @Override + protected GATKArgumentCollection getArgumentCollection() { + return argCollection; + } + + /** + * Required main method implementation. + */ + public static void main(String[] argv) { + try { + CommandLineGATK instance = new CommandLineGATK(); + start(instance, argv); + System.exit(CommandLineProgram.result); // todo -- this is a painful hack + } catch (UserException e) { + exitSystemWithUserError(e); + } catch (TribbleException e) { + // We can generate Tribble Exceptions in weird places when e.g. VCF genotype fields are + // lazy loaded, so they aren't caught elsewhere and made into User Exceptions + exitSystemWithUserError(e); + } catch(PicardException e) { + // TODO: Should Picard exceptions be, in general, UserExceptions or ReviewedStingExceptions? 
+ exitSystemWithError(e); + } catch (SAMException e) { + checkForMaskedUserErrors(e); + exitSystemWithSamError(e); + } catch (OutOfMemoryError e) { + exitSystemWithUserError(new UserException.NotEnoughMemory()); + } catch (Throwable t) { + checkForMaskedUserErrors(t); + exitSystemWithError(t); + } + } + + public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file"; + public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files"; + public static final String NO_SPACE_LEFT_ON_DEVICE_ERROR = "No space left on device"; + public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded"; + + private static void checkForMaskedUserErrors(final Throwable t) { + // masked out of memory error + if ( t instanceof OutOfMemoryError ) + exitSystemWithUserError(new UserException.NotEnoughMemory()); + // masked user error + if ( t instanceof UserException || t instanceof TribbleException ) + exitSystemWithUserError(new UserException(t.getMessage())); + + // no message means no masked error + final String message = t.getMessage(); + if ( message == null ) + return; + + // too many open files error + if ( message.contains("Too many open files") ) + exitSystemWithUserError(new UserException.TooManyOpenFiles()); + + // malformed BAM looks like a SAM file + if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) + exitSystemWithSamError(t); + + // can't close tribble index when writing + if ( message.contains("Unable to close index for") ) + exitSystemWithUserError(new UserException(t.getCause() == null ? 
message : t.getCause().getMessage())); + + // disk is full + if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) ) + exitSystemWithUserError(new UserException.NoSpaceOnDevice()); + + // masked error wrapped in another one + if ( t.getCause() != null ) + checkForMaskedUserErrors(t.getCause()); + } + + /** + * Creates the a short blurb about the GATK, copyright info, and where to get documentation. + * + * @return The application header. + */ + public static List createApplicationHeader() { + List header = new ArrayList(); + header.add(String.format("The Genome Analysis Toolkit (GATK) v%s, Compiled %s",getVersionNumber(), getBuildTime())); + header.add("Copyright (c) 2010 The Broad Institute"); + header.add("For support and documentation go to " + HelpConstants.BASE_GATK_URL); + return header; + } + + public static String getVersionNumber() { + ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); + return headerInfo.containsKey("org.broadinstitute.sting.gatk.version") ? headerInfo.getString("org.broadinstitute.sting.gatk.version") : ""; + } + + public static String getBuildTime() { + ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); + return headerInfo.containsKey("build.timestamp") ? headerInfo.getString("build.timestamp") : ""; + } + + /** + * If the user supplied any additional attribution, return it here. + * @return Additional attribution if supplied by the user. Empty (non-null) list otherwise. + */ + private List getAttribution() { + List attributionLines = new ArrayList(); + + // If no analysis name is present, fill in extra help on the walkers. 
+ WalkerManager walkerManager = engine.getWalkerManager(); + String analysisName = getAnalysisName(); + if(analysisName != null && walkerManager.exists(analysisName)) { + Class walkerType = walkerManager.getWalkerClassByName(analysisName); + if(walkerType.isAnnotationPresent(Attribution.class)) + attributionLines.addAll(Arrays.asList(walkerType.getAnnotation(Attribution.class).value())); + } + return attributionLines; + } + + /** + * Retrieves additional information about GATK walkers. + * the code in HelpFormatter and supply it as a helper to this method. + * + * @return A string summarizing the walkers available in this distribution. + */ + private String getAdditionalHelp() { + String additionalHelp; + + // If no analysis name is present, fill in extra help on the walkers. + WalkerManager walkerManager = engine.getWalkerManager(); + String analysisName = getAnalysisName(); + if(analysisName != null && walkerManager.exists(getAnalysisName())) + additionalHelp = getWalkerHelp(walkerManager.getWalkerClassByName(getAnalysisName())); + else + additionalHelp = getAllWalkerHelp(); + + return additionalHelp; + } + + private static final int PACKAGE_INDENT = 1; + private static final int WALKER_INDENT = 3; + private static final String FIELD_SEPARATOR = " "; + + private String getWalkerHelp(Class walkerType) { + // Construct a help string to output details on this walker. + StringBuilder additionalHelp = new StringBuilder(); + Formatter formatter = new Formatter(additionalHelp); + + formatter.format("Available Reference Ordered Data types:%n"); + formatter.format(new FeatureManager().userFriendlyListOfAvailableFeatures()); + formatter.format("%n"); + + formatter.format("For a full description of this walker, see its GATKdocs at:%n"); + formatter.format("%s%n", GATKDocUtils.helpLinksToGATKDocs(walkerType)); + + return additionalHelp.toString(); + } + + /** + * Load in additional help information about all available walkers. 
+ * @return A string representation of the additional help. + */ + private String getAllWalkerHelp() { + // Construct a help string to output available walkers. + StringBuilder additionalHelp = new StringBuilder(); + Formatter formatter = new Formatter(additionalHelp); + + // Get the list of walker names from the walker manager. + WalkerManager walkerManager = engine.getWalkerManager(); + + // Build a list sorted by walker display name. As this information is collected, keep track of the longest + // package / walker name for later formatting. + SortedSet helpText = new TreeSet(new HelpEntryComparator()); + + int longestPackageName = 0; + int longestWalkerName = 0; + for(Map.Entry>> walkersByPackage: walkerManager.getWalkerNamesByPackage(true).entrySet()) { + // Get the display name. + String packageName = walkersByPackage.getKey(); + String packageDisplayName = walkerManager.getPackageDisplayName(walkersByPackage.getKey()); + String packageHelpText = walkerManager.getPackageSummaryText(packageName); + + // Compute statistics about which names is longest. + longestPackageName = Math.max(longestPackageName,packageDisplayName.length()); + + SortedSet walkersInPackage = new TreeSet(new HelpEntryComparator()); + for(Class walkerType: walkersByPackage.getValue()) { + String walkerName = walkerType.getName(); + String walkerDisplayName = walkerManager.getName(walkerType); + String walkerHelpText = walkerManager.getWalkerSummaryText(walkerType); + + longestWalkerName = Math.max(longestWalkerName,walkerManager.getName(walkerType).length()); + + walkersInPackage.add(new HelpEntry(walkerName,walkerDisplayName,walkerHelpText)); + } + + // Dump the walkers into the sorted set. 
+ helpText.add(new HelpEntry(packageName,packageDisplayName,packageHelpText,Collections.unmodifiableSortedSet(walkersInPackage))); + } + + final int headerWidth = Math.max(longestPackageName+PACKAGE_INDENT,longestWalkerName+WALKER_INDENT); + + + for(HelpEntry packageHelp: helpText) { + printDescriptorLine(formatter,PACKAGE_INDENT,packageHelp.displayName,headerWidth,FIELD_SEPARATOR,packageHelp.summary,TextFormattingUtils.DEFAULT_LINE_WIDTH); + + for(HelpEntry walkerHelp: packageHelp.children) + printDescriptorLine(formatter,WALKER_INDENT,walkerHelp.displayName,headerWidth,FIELD_SEPARATOR,walkerHelp.summary,TextFormattingUtils.DEFAULT_LINE_WIDTH); + + // Print a blank line between sets of walkers. + printDescriptorLine(formatter,0,"",headerWidth,FIELD_SEPARATOR,"", TextFormattingUtils.DEFAULT_LINE_WIDTH); + } + + return additionalHelp.toString(); + } + + private void printDescriptorLine(Formatter formatter, + int headerIndentWidth, + String header, + int headerWidth, + String fieldSeparator, + String description, + int lineWidth) { + final int headerPaddingWidth = headerWidth - header.length() - headerIndentWidth; + final int descriptionWidth = lineWidth - fieldSeparator.length() - headerWidth; + List wordWrappedText = TextFormattingUtils.wordWrap(description,descriptionWidth); + + String headerIndentFormatString = headerIndentWidth > 0 ? "%" + headerIndentWidth + "s" : "%s"; + String headerPaddingFormatString = headerPaddingWidth > 0 ? "%" + headerPaddingWidth + "s" : "%s"; + String headerWidthFormatString = headerWidth > 0 ? "%" + headerWidth + "s" : "%s"; + + // Output description line. 
+ formatter.format(headerIndentFormatString + "%s" + headerPaddingFormatString + "%s%s%n", + "", header, "", fieldSeparator, wordWrappedText.size()>0?wordWrappedText.get(0):""); + for(int i = 1; i < wordWrappedText.size(); i++) + formatter.format(headerWidthFormatString + "%s%s%n", "", fieldSeparator, wordWrappedText.get(i)); + } + +} + +/** + * Represents a given help entry; contains a display name, a summary and optionally some children. + */ +class HelpEntry { + public final String uid; + public final String displayName; + public final String summary; + public final SortedSet children; + + /** + * Create a new help entry with the given display name, summary and children. + * @param uid a unique identifier. Usually, the java package. + * @param displayName display name for this help entry. + * @param summary summary for this help entry. + * @param children children for this help entry. + */ + public HelpEntry(String uid, String displayName, String summary, SortedSet children) { + this.uid = uid; + this.displayName = displayName; + this.summary = summary; + this.children = children; + } + + /** + * Create a new help entry with the given display name, summary and children. + * @param uid a unique identifier. Usually, the java package. + * @param displayName display name for this help entry. + * @param summary summary for this help entry. + */ + public HelpEntry(String uid, String displayName, String summary) { + this(uid,displayName,summary,null); + } + +} + +/** + * Compare two help entries by display name. + */ +class HelpEntryComparator implements Comparator { + private static TextFormattingUtils.CaseInsensitiveComparator textComparator = new TextFormattingUtils.CaseInsensitiveComparator(); + + /** + * Compares the order of lhs to rhs, not taking case into account. + * @param lhs First object to compare. + * @param rhs Second object to compare. + * @return 0 if objects are identical; -1 if lhs is before rhs, 1 if rhs is before lhs. 
Nulls are treated as after everything else. + */ + public int compare(HelpEntry lhs, HelpEntry rhs) { + if(lhs == null && rhs == null) return 0; + if(lhs == null || lhs.displayName.equals("")) return 1; + if(rhs == null || rhs.displayName.equals("")) return -1; + return lhs.displayName.equals(rhs.displayName) ? textComparator.compare(lhs.uid,rhs.uid) : textComparator.compare(lhs.displayName,rhs.displayName); + } + + +} \ No newline at end of file diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java new file mode 100644 index 000000000..8df294b21 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -0,0 +1,1240 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk; + +import com.google.java.contract.Ensures; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceDictionary; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.datasources.reads.*; +import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; +import org.broadinstitute.sting.gatk.executive.MicroScheduler; +import org.broadinstitute.sting.gatk.filters.FilterManager; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; +import org.broadinstitute.sting.gatk.io.OutputTracker; +import org.broadinstitute.sting.gatk.io.stubs.Stub; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode; +import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; +import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.gatk.samples.SampleDB; +import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import 
org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; +import org.broadinstitute.sting.utils.recalibration.BQSRArgumentSet; +import org.broadinstitute.sting.utils.text.XReadLines; +import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; +import java.util.concurrent.TimeUnit; + +import static org.broadinstitute.sting.utils.DeprecatedToolChecks.getWalkerDeprecationInfo; +import static org.broadinstitute.sting.utils.DeprecatedToolChecks.isDeprecatedWalker; + +/** + * A GenomeAnalysisEngine that runs a specified walker. + */ +public class GenomeAnalysisEngine { + /** + * our log, which we want to capture anything from this class + */ + private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class); + public static final long NO_RUNTIME_LIMIT = -1; + + /** + * The GATK command-line argument parsing code. + */ + private ParsingEngine parsingEngine; + + /** + * The genomeLocParser can create and parse GenomeLocs. + */ + private GenomeLocParser genomeLocParser; + + /** + * Accessor for sharded read data. + */ + private SAMDataSource readsDataSource = null; + + /** + * Accessor for sharded reference data. + */ + private ReferenceDataSource referenceDataSource = null; + + /** + * Accessor for sample metadata + */ + private SampleDB sampleDB = null; + + /** + * Accessor for sharded reference-ordered data. + */ + private List rodDataSources; + + // our argument collection + private GATKArgumentCollection argCollection; + + /** + * Collection of intervals used by the engine. 
+ */ + private GenomeLocSortedSet intervals = null; + + /** + * Explicitly assign the interval set to use for this traversal (for unit testing purposes) + * @param intervals set of intervals to use for this traversal + */ + public void setIntervals( GenomeLocSortedSet intervals ) { + this.intervals = intervals; + } + + /** + * Collection of inputs used by the engine. + */ + private Map inputs = new HashMap(); + + /** + * Collection of outputs used by the engine. + */ + private Collection> outputs = new ArrayList>(); + + /** + * Collection of the filters applied to the input data. + */ + private Collection filters; + + /** + * Collection of the read transformers applied to the reads + */ + private List readTransformers; + + /** + * Controls the allocation of threads between CPU vs IO. + */ + private ThreadAllocation threadAllocation; + + private ReadMetrics cumulativeMetrics = null; + + /** + * A currently hacky unique name for this GATK instance + */ + private String myName = "GATK_" + Math.abs(getRandomGenerator().nextInt()); + + /** + * our walker manager + */ + private final WalkerManager walkerManager = new WalkerManager(); + + private Walker walker; + + public void setWalker(Walker walker) { + this.walker = walker; + } + + /** + * The short name of the current GATK walker as a string + * @return a non-null String + */ + public String getWalkerName() { + return getWalkerName(walker.getClass()); + } + + /** + * A processed collection of SAM reader identifiers. + */ + private Collection samReaderIDs = Collections.emptyList(); + + /** + * Set the SAM/BAM files over which to traverse. + * @param samReaderIDs Collection of ids to use during this traversal. + */ + public void setSAMFileIDs(Collection samReaderIDs) { + this.samReaderIDs = samReaderIDs; + } + + /** + * Collection of reference metadata files over which to traverse. + */ + private Collection referenceMetaDataFiles; + + /** + * The threading efficiency monitor we use in the GATK to monitor our efficiency. 
+ * + * May be null if one isn't active, or hasn't be initialized yet + */ + private ThreadEfficiencyMonitor threadEfficiencyMonitor = null; + + /** + * The global progress meter we are using to track our progress through the genome + */ + private ProgressMeter progressMeter = null; + + /** + * Set the reference metadata files to use for this traversal. + * @param referenceMetaDataFiles Collection of files and descriptors over which to traverse. + */ + public void setReferenceMetaDataFiles(Collection referenceMetaDataFiles) { + this.referenceMetaDataFiles = referenceMetaDataFiles; + } + + /** + * The maximum runtime of this engine, in nanoseconds, set during engine initialization + * from the GATKArgumentCollection command line value + */ + private long runtimeLimitInNanoseconds = -1; + + /** + * Static random number generator and seed. + */ + private static final long GATK_RANDOM_SEED = 47382911L; + private static Random randomGenerator = new Random(GATK_RANDOM_SEED); + public static Random getRandomGenerator() { return randomGenerator; } + public static void resetRandomGenerator() { randomGenerator.setSeed(GATK_RANDOM_SEED); } + public static void resetRandomGenerator(long seed) { randomGenerator.setSeed(seed); } + + /** + * Base Quality Score Recalibration helper object + */ + private BQSRArgumentSet bqsrArgumentSet = null; + public BQSRArgumentSet getBQSRArgumentSet() { return bqsrArgumentSet; } + public boolean hasBQSRArgumentSet() { return bqsrArgumentSet != null; } + public void setBaseRecalibration(final GATKArgumentCollection args) { + bqsrArgumentSet = new BQSRArgumentSet(args); + } + + /** + * Actually run the GATK with the specified walker. + * + * @return the value of this traversal. 
+ */ + public Object execute() { + // first thing is to make sure the AWS keys can be decrypted + GATKRunReport.checkAWSAreValid(); + + //HeapSizeMonitor monitor = new HeapSizeMonitor(); + //monitor.start(); + setStartTime(new java.util.Date()); + + final GATKArgumentCollection args = this.getArguments(); + + // validate our parameters + if (args == null) { + throw new ReviewedStingException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null."); + } + + // validate our parameters + if (this.walker == null) + throw new ReviewedStingException("The walker passed to GenomeAnalysisEngine can not be null."); + + if (args.nonDeterministicRandomSeed) + resetRandomGenerator(System.currentTimeMillis()); + + // if the use specified an input BQSR recalibration table then enable on the fly recalibration + if (args.BQSR_RECAL_FILE != null) + setBaseRecalibration(args); + + // setup the runtime limits + setupRuntimeLimits(args); + + // Determine how the threads should be divided between CPU vs. IO. + determineThreadAllocation(); + + // Prepare the data for traversal. 
+ initializeDataSources(); + + // initialize and validate the interval list + initializeIntervals(); + validateSuppliedIntervals(); + + // check to make sure that all sequence dictionaries are compatible with the reference's sequence dictionary + validateDataSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), rodDataSources); + + // initialize sampleDB + initializeSampleDB(); + + // our microscheduler, which is in charge of running everything + MicroScheduler microScheduler = createMicroscheduler(); + threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor(); + + // create temp directories as necessary + initializeTempDirectory(); + + // create the output streams + initializeOutputStreams(microScheduler.getOutputTracker()); + + // Initializing the shard iterator / BAM schedule might take some time, so let the user know vaguely what's going on + logger.info("Preparing for traversal" + + (readsDataSource.getReaderIDs().size() > 0 ? String.format(" over %d BAM files", readsDataSource.getReaderIDs().size()) : "")); + Iterable shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); + logger.info("Done preparing for traversal"); + + // execute the microscheduler, storing the results + return microScheduler.execute(this.walker, shardStrategy); + + //monitor.stop(); + //logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed())); + + //return result; + } + + /** + * Retrieves an instance of the walker based on the walker name. + * + * @param walkerName Name of the walker. Must not be null. If the walker cannot be instantiated, an exception will be thrown. + * @return An instance of the walker. 
+ */ + public Walker getWalkerByName(String walkerName) { + try { + return walkerManager.createByName(walkerName); + } catch ( UserException e ) { + if ( isDeprecatedWalker(walkerName) ) { + e = new UserException.DeprecatedWalker(walkerName, getWalkerDeprecationInfo(walkerName)); + } + throw e; + } + } + + /** + * Gets the name of a given walker type. + * @param walkerType Type of walker. + * @return Name of the walker. + */ + public String getWalkerName(Class walkerType) { + return walkerManager.getName(walkerType); + } + + public String getName() { + return myName; + } + + /** + * Gets a list of the filters to associate with the given walker. Will NOT initialize the engine with this filters; + * the caller must handle that directly. + * @return A collection of available filters. + */ + public Collection createFilters() { + final List filters = new LinkedList<>(); + + // First add the user requested filters + if (this.getArguments().readGroupBlackList != null && this.getArguments().readGroupBlackList.size() > 0) + filters.add(new ReadGroupBlackListFilter(this.getArguments().readGroupBlackList)); + for(final String filterName: this.getArguments().readFilters) + filters.add(this.getFilterManager().createByName(filterName)); + + // now add the walker default filters. 
This ordering is critical important if + // users need to apply filters that fix up reads that would be removed by default walker filters + filters.addAll(WalkerManager.getReadFilters(walker,this.getFilterManager())); + + return Collections.unmodifiableList(filters); + } + + /** + * Returns a list of active, initialized read transformers + * + * @param walker the walker we need to apply read transformers too + */ + public void initializeReadTransformers(final Walker walker) { + // keep a list of the active read transformers sorted based on priority ordering + List activeTransformers = new ArrayList(); + + final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class); + final ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null; + + final PluginManager pluginManager = new PluginManager(ReadTransformer.class); + + for ( final ReadTransformer transformer : pluginManager.createAllTypes() ) { + transformer.initialize(overrideTime, this, walker); + if ( transformer.enabled() ) + activeTransformers.add(transformer); + } + + setReadTransformers(activeTransformers); + } + + public List getReadTransformers() { + return readTransformers; + } + + /* + * Sanity checks that incompatible read transformers are not active together (and throws an exception if they are). 
+ * + * @param readTransformers the active read transformers + */ + protected void checkActiveReadTransformers(final List readTransformers) { + if ( readTransformers == null ) + throw new IllegalArgumentException("read transformers cannot be null"); + + ReadTransformer sawMustBeFirst = null; + ReadTransformer sawMustBeLast = null; + + for ( final ReadTransformer r : readTransformers ) { + if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_FIRST ) { + if ( sawMustBeFirst != null ) + throw new UserException.IncompatibleReadFiltersException(sawMustBeFirst.toString(), r.toString()); + sawMustBeFirst = r; + } else if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_LAST ) { + if ( sawMustBeLast != null ) + throw new UserException.IncompatibleReadFiltersException(sawMustBeLast.toString(), r.toString()); + sawMustBeLast = r; + } + } + } + + protected void setReadTransformers(final List readTransformers) { + if ( readTransformers == null ) + throw new ReviewedStingException("read transformers cannot be null"); + + // sort them in priority order + Collections.sort(readTransformers, new ReadTransformer.ReadTransformerComparator()); + + // make sure we don't have an invalid set of active read transformers + checkActiveReadTransformers(readTransformers); + + this.readTransformers = readTransformers; + } + + /** + * Parse out the thread allocation from the given command-line argument. 
+ */ + private void determineThreadAllocation() { + if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads); + if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread); + if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads); + + this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads, + argCollection.numberOfCPUThreadsPerDataThread, + argCollection.numberOfIOThreads, + argCollection.monitorThreadEfficiency); + } + + public int getTotalNumberOfThreads() { + return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads(); + } + + + + /** + * Allow subclasses and others within this package direct access to the walker manager. + * @return The walker manager used by this package. + */ + protected WalkerManager getWalkerManager() { + return walkerManager; + } + + /** + * setup a microscheduler + * + * @return a new microscheduler + */ + private MicroScheduler createMicroscheduler() { + // Temporarily require all walkers to have a reference, even if that reference is not conceptually necessary. 
+ if ((walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker) && + this.getArguments().referenceFile == null) { + throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given"); + } + + return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation); + } + + protected DownsamplingMethod getDownsamplingMethod() { + GATKArgumentCollection argCollection = this.getArguments(); + + DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod(); + DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker); + + DownsamplingMethod method = commandLineMethod != null ? commandLineMethod : walkerMethod; + method.checkCompatibilityWithWalker(walker); + return method; + } + + protected void setDownsamplingMethod(DownsamplingMethod method) { + argCollection.setDownsamplingMethod(method); + } + + protected boolean includeReadsWithDeletionAtLoci() { + return walker.includeReadsWithDeletionAtLoci(); + } + + /** + * Verifies that the supplied set of reads files mesh with what the walker says it requires; + * also makes sure that list of SAM files specified on the command line is not empty and contains + * no duplicates. + */ + protected void validateSuppliedReads() { + GATKArgumentCollection arguments = this.getArguments(); + final Boolean samFilesArePresent = (arguments.samFiles != null && !arguments.samFiles.isEmpty()); + + // Check what the walker says is required against what was provided on the command line. + if (WalkerManager.isRequired(walker, DataSource.READS) && !samFilesArePresent) + throw new ArgumentException("Walker requires reads but none were provided."); + + // Check what the walker says is allowed against what was provided on the command line. 
+ if (samFilesArePresent && !WalkerManager.isAllowed(walker, DataSource.READS)) + throw new ArgumentException("Walker does not allow reads but reads were provided."); + + //Make sure SAM list specified by the user (if necessary) is not empty + if(WalkerManager.isRequired(walker, DataSource.READS) && samFilesArePresent && samReaderIDs.isEmpty() ) { + throw new UserException("The list of input files does not contain any BAM files."); + } + + // Make sure no SAM files were specified multiple times by the user. + checkForDuplicateSamFiles(); + } + + /** + * Checks whether there are SAM files that appear multiple times in the fully unpacked list of + * SAM files (samReaderIDs). If there are, throws an ArgumentException listing the files in question. + */ + protected void checkForDuplicateSamFiles() { + Set encounteredSamFiles = new HashSet(); + Set duplicateSamFiles = new LinkedHashSet(); + + for ( SAMReaderID samFile : samReaderIDs ) { + if ( encounteredSamFiles.contains(samFile) ) { + duplicateSamFiles.add(samFile.getSamFilePath()); + } + else { + encounteredSamFiles.add(samFile); + } + } + + if ( duplicateSamFiles.size() > 0 ) { + throw new UserException("The following BAM files appear multiple times in the list of input files: " + + duplicateSamFiles + " BAM files may be specified at most once."); + } + + } + + /** + * Verifies that the supplied reference file mesh with what the walker says it requires. + */ + protected void validateSuppliedReference() { + GATKArgumentCollection arguments = this.getArguments(); + // Check what the walker says is required against what was provided on the command line. + // TODO: Temporarily disabling WalkerManager.isRequired check on the reference because the reference is always required. 
+ if (/*WalkerManager.isRequired(walker, DataSource.REFERENCE) &&*/ arguments.referenceFile == null) + throw new ArgumentException("Walker requires a reference but none was provided."); + + // Check what the walker says is allowed against what was provided on the command line. + if (arguments.referenceFile != null && !WalkerManager.isAllowed(walker, DataSource.REFERENCE)) + throw new ArgumentException("Walker does not allow a reference but one was provided."); + } + + protected void validateSuppliedIntervals() { + // Only read walkers support '-L unmapped' intervals. Trap and validate any other instances of -L unmapped. + if(!(walker instanceof ReadWalker)) { + GenomeLocSortedSet intervals = getIntervals(); + if(intervals != null && getIntervals().contains(GenomeLoc.UNMAPPED)) + throw new ArgumentException("Interval list specifies unmapped region. Only read walkers may include the unmapped region."); + } + + // If intervals is non-null and empty at this point, it means that the list of intervals to process + // was filtered down to an empty set (eg., the user specified something like -L chr1 -XL chr1). Since + // this was very likely unintentional, the user should be informed of this. Note that this is different + // from the case where intervals == null, which indicates that there were no interval arguments. + if ( intervals != null && intervals.isEmpty() ) { + logger.warn("The given combination of -L and -XL options results in an empty set. No intervals to process."); + } + + // TODO: add a check for ActiveRegion walkers to prevent users from passing an entire contig/chromosome + } + + /** + * Get the sharding strategy given a driving data source. + * + * @param readsDataSource readsDataSource + * @param drivingDataSource Data on which to shard. 
+ * @param intervals intervals + * @return the sharding strategy + */ + protected Iterable getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { + ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); + DownsamplingMethod downsamplingMethod = readsDataSource != null ? readsDataSource.getReadsInfo().getDownsamplingMethod() : null; + ReferenceDataSource referenceDataSource = this.getReferenceDataSource(); + + // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition. + if(!readsDataSource.isEmpty()) { + if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM)) + throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported."); + if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) + throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available."); + + if(walker instanceof LocusWalker) { + if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. 
Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); + } + else if(walker instanceof ActiveRegionWalker) { + if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(new ActiveRegionShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new ActiveRegionShardBalancer()); + } + else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { + // Apply special validation to read pair walkers. + if(walker instanceof ReadPairWalker) { + if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. 
You will need to resort your input BAM file in query name order to use this walker."); + if(intervals != null && !intervals.isEmpty()) + throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); + } + + if(intervals == null) + return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(intervals, new ReadShardBalancer()); + } + else + throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName()); + } + else { + // TODO -- Determine what the ideal shard size should be here. Matt suggested that a multiple of 16K might work well + // TODO -- (because of how VCF indexes work), but my empirical experience has been simply that the larger the shard + // TODO -- size the more efficient the traversal (at least for RODWalkers). Keeping the previous values for now. [EB] + final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000; + if(intervals == null) + return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE); + else + return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE); + } + } + + protected boolean flashbackData() { + return walker instanceof ReadWalker; + } + + /** + * Create the temp directory if it doesn't exist. + */ + private void initializeTempDirectory() { + File tempDir = new File(System.getProperty("java.io.tmpdir")); + if (!tempDir.exists() && !tempDir.mkdirs()) + throw new UserException.BadTmpDir("Unable to create directory"); + } + + /** + * Initialize the output streams as specified by the user. + * + * @param outputTracker the tracker supplying the initialization data. 
+ */ + private void initializeOutputStreams(OutputTracker outputTracker) { + for (Map.Entry input : getInputs().entrySet()) + outputTracker.addInput(input.getKey(), input.getValue()); + for (Stub stub : getOutputs()) + outputTracker.addOutput(stub); + + outputTracker.prepareWalker(walker, getArguments().strictnessLevel); + } + + public ReferenceDataSource getReferenceDataSource() { + return referenceDataSource; + } + + public GenomeLocParser getGenomeLocParser() { + return genomeLocParser; + } + + /** + * Manage lists of filters. + */ + private final FilterManager filterManager = new FilterManager(); + + private Date startTime = null; // the start time for execution + + public void setParser(ParsingEngine parsingEngine) { + this.parsingEngine = parsingEngine; + } + + /** + * Explicitly set the GenomeLocParser, for unit testing. + * @param genomeLocParser GenomeLocParser to use. + */ + public void setGenomeLocParser(GenomeLocParser genomeLocParser) { + this.genomeLocParser = genomeLocParser; + } + + /** + * Sets the start time when the execute() function was last called + * @param startTime the start time when the execute() function was last called + */ + protected void setStartTime(Date startTime) { + this.startTime = startTime; + } + + /** + * @return the start time when the execute() function was last called + */ + public Date getStartTime() { + return startTime; + } + + /** + * Setup the intervals to be processed + */ + protected void initializeIntervals() { + intervals = IntervalUtils.parseIntervalArguments(this.referenceDataSource, argCollection.intervalArguments); + } + + /** + * Add additional, externally managed IO streams for inputs. + * + * @param argumentSource Field into which to inject the value. + * @param value Instance to inject. + */ + public void addInput(ArgumentSource argumentSource, Object value) { + inputs.put(argumentSource, value); + } + + /** + * Add additional, externally managed IO streams for output. 
+ * + * @param stub Instance to inject. + */ + public void addOutput(Stub stub) { + outputs.add(stub); + } + + /** + * Returns the tag associated with a given command-line argument. + * @param key Object for which to inspect the tag. + * @return Tags object associated with the given key, or an empty Tag structure if none are present. + */ + public Tags getTags(Object key) { + return parsingEngine.getTags(key); + } + + protected void initializeDataSources() { + logger.info("Strictness is " + argCollection.strictnessLevel); + + validateSuppliedReference(); + setReferenceDataSource(argCollection.referenceFile); + + validateSuppliedReads(); + initializeReadTransformers(walker); + + readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference()); + + for (ReadFilter filter : filters) + filter.initialize(this); + + // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference + rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe); + } + + /** + * Purely for testing purposes. 
Do not use unless you absolutely positively know what you are doing (or + * need to absolutely positively kill everyone in the room) + * @param dataSource + */ + public void setReadsDataSource(final SAMDataSource dataSource) { + this.readsDataSource = dataSource; + } + + /** + * Entry-point function to initialize the samples database from input data and pedigree arguments + */ + private void initializeSampleDB() { + SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType); + sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader()); + sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this)); + sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles); + sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings); + sampleDB = sampleDBBuilder.getFinalSampleDB(); + } + + /** + * Gets a unique identifier for the reader sourcing this read. + * @param read Read to examine. + * @return A unique identifier for the source file of this read. Exception if not found. + */ + public SAMReaderID getReaderIDForRead(final SAMRecord read) { + return getReadsDataSource().getReaderID(read); + } + + /** + * Gets the source file for this read. + * @param id Unique identifier determining which input file to use. + * @return The source filename for this read. + */ + public File getSourceFileForReaderID(final SAMReaderID id) { + return getReadsDataSource().getSAMFile(id); + } + + /** + * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available). + * + * @param reads Reads data source. + * @param reference Reference data source. 
+ * @param rods a collection of the reference ordered data tracks + */ + private void validateDataSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection rods) { + if ((reads.isEmpty() && (rods == null || rods.isEmpty())) || reference == null ) + return; + + // Compile a set of sequence names that exist in the reference file. + SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary(); + + if (!reads.isEmpty()) { + // Compile a set of sequence names that exist in the BAM files. + SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); + + if (readsDictionary.size() == 0) { + logger.info("Reads file is unmapped. Skipping validation against reference."); + return; + } + + // compare the reads to the reference + SequenceDictionaryUtils.validateDictionaries(logger, getArguments().unsafe, "reads", readsDictionary, + "reference", referenceDictionary, true, intervals); + } + + for (ReferenceOrderedDataSource rod : rods) + IndexDictionaryUtils.validateTrackSequenceDictionary(rod.getName(), rod.getSequenceDictionary(), referenceDictionary, getArguments().unsafe); + } + + /** + * Gets a data source for the given set of reads. + * + * @param argCollection arguments + * @param genomeLocParser parser + * @param refReader reader + * @return A data source for the given set of reads. + */ + private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) { + DownsamplingMethod downsamplingMethod = getDownsamplingMethod(); + + // Synchronize the method back into the collection so that it shows up when + // interrogating for the downsampling method during command line recreation. 
+ setDownsamplingMethod(downsamplingMethod); + + logger.info(downsamplingMethod); + + if (argCollection.removeProgramRecords && argCollection.keepProgramRecords) + throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options"); + + boolean removeProgramRecords = argCollection.removeProgramRecords || walker.getClass().isAnnotationPresent(RemoveProgramRecords.class); + + if (argCollection.keepProgramRecords) + removeProgramRecords = false; + + final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker; + + final Map sampleRenameMap = argCollection.sampleRenameMappingFile != null ? + loadSampleRenameMap(argCollection.sampleRenameMappingFile) : + null; + + return new SAMDataSource( + samReaderIDs, + threadAllocation, + argCollection.numberOfBAMFileHandles, + genomeLocParser, + argCollection.useOriginalBaseQualities, + argCollection.strictnessLevel, + argCollection.readBufferSize, + downsamplingMethod, + new ValidationExclusion(Arrays.asList(argCollection.unsafe)), + filters, + readTransformers, + includeReadsWithDeletionAtLoci(), + argCollection.defaultBaseQualities, + removeProgramRecords, + keepReadsInLIBS, + sampleRenameMap); + } + + /** + * Loads a user-provided sample rename map file for use in on-the-fly sample renaming into an in-memory + * HashMap. This file must consist of lines with two whitespace-separated fields: + * + * absolute_path_to_bam_file new_sample_name + * + * The engine will verify that each bam file contains reads from only one sample when the on-the-fly sample + * renaming feature is being used. + * + * @param sampleRenameMapFile sample rename map file from which to load data + * @return a HashMap containing the contents of the map file, with the keys being the bam file paths and + * the values being the new sample names. 
+ */ + protected Map loadSampleRenameMap( final File sampleRenameMapFile ) { + logger.info("Renaming samples from BAM files on-the-fly using mapping file " + sampleRenameMapFile.getAbsolutePath()); + + final Map sampleRenameMap = new HashMap<>((int)sampleRenameMapFile.length() / 50); + + try { + for ( final String line : new XReadLines(sampleRenameMapFile) ) { + final String[] tokens = line.split("\\s+"); + + if ( tokens.length != 2 ) { + throw new UserException.MalformedFile(sampleRenameMapFile, + String.format("Encountered a line with %s fields instead of the required 2 fields. Line was: %s", + tokens.length, line)); + } + + final File bamFile = new File(tokens[0]); + final String newSampleName = tokens[1]; + + if ( ! bamFile.isAbsolute() ) { + throw new UserException.MalformedFile(sampleRenameMapFile, "Bam file path not absolute at line: " + line); + } + + final SAMReaderID bamID = new SAMReaderID(bamFile, new Tags()); + + if ( sampleRenameMap.containsKey(bamID) ) { + throw new UserException.MalformedFile(sampleRenameMapFile, + String.format("Bam file %s appears more than once", bamFile.getAbsolutePath())); + } + + sampleRenameMap.put(bamID, newSampleName); + } + } + catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(sampleRenameMapFile, e); + } + + return sampleRenameMap; + } + + + /** + * Opens a reference sequence file paired with an index. Only public for testing purposes + * + * @param refFile Handle to a reference sequence file. Non-null. + */ + public void setReferenceDataSource(File refFile) { + this.referenceDataSource = new ReferenceDataSource(refFile); + genomeLocParser = new GenomeLocParser(referenceDataSource.getReference()); + } + + /** + * Open the reference-ordered data sources. + * + * @param referenceMetaDataFiles collection of RMD descriptors to load and validate. + * @param sequenceDictionary GATK-wide sequnce dictionary to use for validation. 
+ * @param genomeLocParser to use when creating and validating GenomeLocs. + * @param validationExclusionType potentially indicate which validations to include / exclude. + * + * @return A list of reference-ordered data sources. + */ + private List getReferenceOrderedDataSources(Collection referenceMetaDataFiles, + SAMSequenceDictionary sequenceDictionary, + GenomeLocParser genomeLocParser, + ValidationExclusion.TYPE validationExclusionType) { + final RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, validationExclusionType, + getArguments().disableAutoIndexCreationAndLockingWhenReadingRods); + + final List dataSources = new ArrayList(); + for (RMDTriplet fileDescriptor : referenceMetaDataFiles) + dataSources.add(new ReferenceOrderedDataSource(fileDescriptor, + builder, + sequenceDictionary, + genomeLocParser, + flashbackData())); + + return dataSources; + } + + /** + * Returns the SAM File Header from the input reads' data source file + * @return the SAM File Header from the input reads' data source file + */ + public SAMFileHeader getSAMFileHeader() { + return readsDataSource.getHeader(); + } + + public boolean lenientVCFProcessing() { + return lenientVCFProcessing(argCollection.unsafe); + } + + public static boolean lenientVCFProcessing(final ValidationExclusion.TYPE val) { + return val == ValidationExclusion.TYPE.ALL + || val == ValidationExclusion.TYPE.LENIENT_VCF_PROCESSING; + } + + /** + * Returns the unmerged SAM file header for an individual reader. + * @param reader The reader. + * @return Header for that reader or null if not available. + */ + public SAMFileHeader getSAMFileHeader(SAMReaderID reader) { + return readsDataSource == null ? null : readsDataSource.getHeader(reader); + } + + /** + * Returns an ordered list of the unmerged SAM file headers known to this engine. 
+ * @return list of header for each input SAM file, in command line order + */ + public List getSAMFileHeaders() { + final List headers = new ArrayList(); + for ( final SAMReaderID id : getReadsDataSource().getReaderIDs() ) { + headers.add(getReadsDataSource().getHeader(id)); + } + return headers; + } + + /** + * Gets the master sequence dictionary for this GATK engine instance + * @return a never-null dictionary listing all of the contigs known to this engine instance + */ + public SAMSequenceDictionary getMasterSequenceDictionary() { + return getReferenceDataSource().getReference().getSequenceDictionary(); + } + + /** + * Returns data source object encapsulating all essential info and handlers used to traverse + * reads; header merger, individual file readers etc can be accessed through the returned data source object. + * + * @return the reads data source + */ + public SAMDataSource getReadsDataSource() { + return this.readsDataSource; + } + + /** + * Sets the collection of GATK main application arguments. + * + * @param argCollection the GATK argument collection + */ + public void setArguments(GATKArgumentCollection argCollection) { + this.argCollection = argCollection; + } + + /** + * Gets the collection of GATK main application arguments. + * + * @return the GATK argument collection + */ + public GATKArgumentCollection getArguments() { + return this.argCollection; + } + + /** + * Get the list of intervals passed to the engine. + * @return List of intervals, or null if no intervals are in use + */ + public GenomeLocSortedSet getIntervals() { + return this.intervals; + } + + /** + * Get the list of regions of the genome being processed. If the user + * requested specific intervals, return those, otherwise return regions + * corresponding to the entire genome. Never returns null. 
+ * + * @return a non-null set of intervals being processed + */ + @Ensures("result != null") + public GenomeLocSortedSet getRegionsOfGenomeBeingProcessed() { + if ( getIntervals() == null ) + // if we don't have any intervals defined, create intervals from the reference itself + return GenomeLocSortedSet.createSetFromSequenceDictionary(getReferenceDataSource().getReference().getSequenceDictionary()); + else + return getIntervals(); + } + + /** + * Gets the list of filters employed by this engine. + * @return Collection of filters (actual instances) used by this engine. + */ + public Collection getFilters() { + return this.filters; + } + + /** + * Sets the list of filters employed by this engine. + * @param filters Collection of filters (actual instances) used by this engine. + */ + public void setFilters(Collection filters) { + this.filters = filters; + } + + /** + * Gets the filter manager for this engine. + * @return filter manager for this engine. + */ + protected FilterManager getFilterManager() { + return filterManager; + } + + /** + * Gets the input sources for this engine. + * @return input sources for this engine. + */ + protected Map getInputs() { + return inputs; + } + + /** + * Gets the output stubs for this engine. + * @return output stubs for this engine. + */ + protected Collection> getOutputs() { + return outputs; + } + + /** + * Returns data source objects encapsulating all rod data; + * individual rods can be accessed through the returned data source objects. + * + * @return the rods data sources + */ + public List getRodDataSources() { + return this.rodDataSources; + } + + /** + * Gets cumulative metrics about the entire run to this point. + * Returns a clone of this snapshot in time. + * @return cumulative metrics about the entire run at this point. ReadMetrics object is a unique instance and is + * owned by the caller; the caller can do with the object what they wish. 
+ */ + public ReadMetrics getCumulativeMetrics() { + // todo -- probably shouldn't be lazy + if ( cumulativeMetrics == null ) + cumulativeMetrics = readsDataSource == null ? new ReadMetrics() : readsDataSource.getCumulativeReadMetrics(); + return cumulativeMetrics; + } + + /** + * Return the global ThreadEfficiencyMonitor, if there is one + * + * @return the monitor, or null if none is active + */ + public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { + return threadEfficiencyMonitor; + } + + // ------------------------------------------------------------------------------------- + // + // code for working with Samples database + // + // ------------------------------------------------------------------------------------- + + public SampleDB getSampleDB() { + return this.sampleDB; + } + + public Map getApproximateCommandLineArguments(Object... argumentProviders) { + return CommandLineUtils.getApproximateCommandLineArguments(parsingEngine,argumentProviders); + } + + public String createApproximateCommandLineArgumentString(Object... argumentProviders) { + return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine,argumentProviders); + } + + // ------------------------------------------------------------------------------------- + // + // code for working with progress meter + // + // ------------------------------------------------------------------------------------- + + /** + * Register the global progress meter with this engine + * + * Calling this function more than once will result in an IllegalStateException + * + * @param meter a non-null progress meter + */ + public void registerProgressMeter(final ProgressMeter meter) { + if ( meter == null ) throw new IllegalArgumentException("Meter cannot be null"); + if ( progressMeter != null ) throw new IllegalStateException("Progress meter already set"); + + progressMeter = meter; + } + + /** + * Get the progress meter being used by this engine. 
May be null if no meter has been registered yet + * @return a potentially null pointer to the progress meter + */ + public ProgressMeter getProgressMeter() { + return progressMeter; + } + + /** + * Does the current runtime in unit exceed the runtime limit, if one has been provided? + * + * @return false if not limit was requested or if runtime <= the limit, true otherwise + */ + public boolean exceedsRuntimeLimit() { + if ( progressMeter == null ) + // not yet initialized or not set because of testing + return false; + + if ( getArguments().maxRuntime == NO_RUNTIME_LIMIT ) + return false; + else { + final long runtime = progressMeter.getRuntimeInNanosecondsUpdatedPeriodically(); + if ( runtime < 0 ) throw new IllegalArgumentException("runtime must be >= 0 but got " + runtime); + final long maxRuntimeNano = getRuntimeLimitInNanoseconds(); + return runtime > maxRuntimeNano; + } + } + + /** + * @return the runtime limit in nanoseconds, or -1 if no limit was specified + */ + public long getRuntimeLimitInNanoseconds() { + return runtimeLimitInNanoseconds; + } + + /** + * Setup the runtime limits for this engine, updating the runtimeLimitInNanoseconds + * as appropriate + * + * @param args the GATKArgumentCollection to retrieve our runtime limits from + */ + private void setupRuntimeLimits(final GATKArgumentCollection args) { + if ( args.maxRuntime == NO_RUNTIME_LIMIT ) + runtimeLimitInNanoseconds = -1; + else if (args.maxRuntime < 0 ) + throw new UserException.BadArgumentValue("maxRuntime", "must be >= 0 or == -1 (meaning no limit) but received negative value " + args.maxRuntime); + else { + runtimeLimitInNanoseconds = TimeUnit.NANOSECONDS.convert(args.maxRuntime, args.maxRuntimeUnits); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/ReadMetrics.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/ReadMetrics.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/ReadProperties.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/ReadProperties.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/WalkerManager.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/WalkerManager.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java new file mode 100644 index 000000000..e86780eb4 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -0,0 +1,571 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, 
distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.arguments; + +import net.sf.samtools.SAMFileReader; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; +import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; +import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; + +import java.io.File; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * @author aaron + * @version 1.0 + */ +public class GATKArgumentCollection { + + /** the constructor */ + public GATKArgumentCollection() { + } + + // parameters and their defaults + /** + * An input file containing sequence data mapped to a reference, in SAM or BAM format, or a text file containing a + * list of input files (with extension 
.list). Note that the GATK requires an accompanying index for each SAM or + * BAM file. Please see our online documentation for more details on input formatting requirements. + */ + @Input(fullName = "input_file", shortName = "I", doc = "Input file containing sequence data (SAM or BAM)", required = false) + public List samFiles = new ArrayList(); + + @Hidden + @Argument(fullName = "showFullBamList",doc="Emit a log entry (level INFO) containing the full list of sequence data files to be included in the analysis (including files inside .bam.list files).") + public Boolean showFullBamList = false; + + @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false, minValue = 0) + public Integer readBufferSize = null; + + // -------------------------------------------------------------------------------------------------------------- + // + // GATKRunReport options + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * By default, GATK generates a run report that is uploaded to a cloud-based service. This report contains basic + * non-identifying statistics (which tool was used, whether the run was successful etc.) that help us for debugging + * and development. You can use this option to turn off reporting if your run environment is not connected to the + * internet or if your data is subject to stringent confidentiality clauses (e.g. clinical patient data). + * To do so you will need to request a key using the online request form on our website. + */ + @Argument(fullName = "phone_home", shortName = "et", doc="Run reporting mode", required = false) + public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.AWS; + /** + * Please see the online documentation FAQs for more details on the key system and how to request a key. 
+ */ + @Argument(fullName = "gatk_key", shortName = "K", doc="GATK key file required to run with -et NO_ET", required = false) + public File gatkKeyFile = null; + + /** + * The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary tag that can be + * used to group together runs during later analysis. One use of this capability is to tag runs as GATK + * performance tests, so that the performance of the GATK over time can be assessed from the logs directly. + * + * Note that the tags do not conform to any ontology, so you are free to use any tags that you might find + * meaningful. + */ + @Argument(fullName = "tag", shortName = "tag", doc="Tag to identify this GATK run as part of a group of runs", required = false) + public String tag = "NA"; + + // -------------------------------------------------------------------------------------------------------------- + // + // General features + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Reads that fail the specified filters will not be used in the analysis. Multiple filters can be specified separately, + * e.g. you can do -rf MalformedRead -rf BadCigar and so on. Available read filters are listed in the online tool + * documentation. Note that the read name format is e.g. MalformedReadFilter, but at the command line the filter + * name should be given without the Filter suffix; e.g. -rf MalformedRead (NOT -rf MalformedReadFilter, which is not + * recognized by the program). Note also that some read filters are applied by default for some analysis tools; this + * is specified in each tool's documentation. The default filters cannot be disabled. 
+ */ + @Argument(fullName = "read_filter", shortName = "rf", doc = "Filters to apply to reads before analysis", required = false) + public final List readFilters = new ArrayList(); + + @ArgumentCollection + public IntervalArgumentCollection intervalArguments = new IntervalArgumentCollection(); + /** + * The reference genome against which the sequence data was mapped. The GATK requires an index file and a dictionary + * file accompanying the reference (please see the online documentation FAQs for more details on these files). Although + * this argument is indicated as being optional, almost all GATK tools require a reference in order to run. + * Note also that while GATK can in theory process genomes from any organism with any number of chromosomes or contigs, + * it is not designed to process draft genome assemblies and performance will decrease as the number of contigs in + * the reference increases. We strongly discourage the use of unfinished genome assemblies containing more than a few + * hundred contigs. Contig numbers in the thousands will most probably cause memory-related crashes. + */ + @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) + public File referenceFile = null; + /** + * If this flag is enabled, the random numbers generated will be different in every run, causing GATK to behave non-deterministically. + */ + @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Use a non-deterministic random seed", required = false) + public boolean nonDeterministicRandomSeed = false; + /** + * To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator. + */ + @Hidden + @Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests.") + public boolean disableDithering = false; + /** + * This will truncate the run but without exiting with a failure. 
By default the value is interpreted in minutes, but this can be changed with the maxRuntimeUnits argument. + */ + @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="Stop execution cleanly as soon as maxRuntime has been reached", required = false) + public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT; + + @Argument(fullName = "maxRuntimeUnits", shortName = "maxRuntimeUnits", doc="Unit of time used by maxRuntime", required = false) + public TimeUnit maxRuntimeUnits = TimeUnit.MINUTES; + + // -------------------------------------------------------------------------------------------------------------- + // + // Downsampling Arguments + // + // -------------------------------------------------------------------------------------------------------------- + /** + * There are several ways to downsample reads, i.e. to removed reads from the pile of reads that will be used for analysis. + * See the documentation of the individual downsampling options for details on how they work. Note that Many GATK tools + * specify a default downsampling type and target, but this behavior can be overridden from command line using the + * downsampling arguments. + */ + @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of read downsampling to employ at a given locus", required = false) + public DownsampleType downsamplingType = null; + /** + * Reads will be downsampled so the specified fraction remains; e.g. if you specify -dfrac 0.25, three-quarters of + * the reads will be removed, and the remaining one quarter will be used in the analysis. This method of downsampling + * is truly unbiased and random. It is typically used to simulate the effect of generating different amounts of + * sequence data for a given sample. For example, you can use this in a pilot experiment to evaluate how much target + * coverage you need to aim for in order to obtain enough coverage in all loci of interest. 
+ */ + @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction of reads to downsample to", required = false, minValue = 0.0, maxValue = 1.0) + public Double downsampleFraction = null; + + /** + * The principle of this downsampling type is to downsample reads to a given capping threshold coverage. Its purpose is to + * get rid of excessive coverage, because above a certain depth, having additional data is not informative and imposes + * unreasonable computational costs. The downsampling process takes two different forms depending on the type of + * analysis it is used with. + * + * For locus-based traversals (LocusWalkers like UnifiedGenotyper and ActiveRegionWalkers like HaplotypeCaller), + * downsample_to_coverage controls the maximum depth of coverage at each locus. For read-based traversals + * (ReadWalkers like BaseRecalibrator), it controls the maximum number of reads sharing the same alignment start + * position. For ReadWalkers you will typically need to use much lower dcov values than you would with LocusWalkers + * to see an effect. Note that this downsampling option does not produce an unbiased random sampling from all available + * reads at each locus: instead, the primary goal of the to-coverage downsampler is to maintain an even representation + * of reads from all alignment start positions when removing excess coverage. For a truly unbiased random sampling of + * reads, use -dfrac instead. Also note that the coverage target is an approximate goal that is not guaranteed to be + * met exactly: the downsampling algorithm will under some circumstances retain slightly more or less coverage than + * requested. + */ + @Argument(fullName = "downsample_to_coverage", shortName = "dcov", + doc = "Target coverage threshold for downsampling to coverage", + required = false, minValue = 0) + public Integer downsampleCoverage = null; + + /** + * Gets the downsampling method explicitly specified by the user. 
If the user didn't specify + * a default downsampling mechanism, return the default. + * @return The explicitly specified downsampling mechanism, or the default if none exists. + */ + public DownsamplingMethod getDownsamplingMethod() { + if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null ) + return null; + + return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction); + } + + /** + * Set the downsampling method stored in the argument collection so that it is read back out when interrogating the command line arguments. + * @param method The downsampling mechanism. + */ + public void setDownsamplingMethod(DownsamplingMethod method) { + if (method == null) + throw new IllegalArgumentException("method is null"); + + downsamplingType = method.type; + downsampleCoverage = method.toCoverage; + downsampleFraction = method.toFraction; + } + + // -------------------------------------------------------------------------------------------------------------- + // + // BAQ arguments + // + // -------------------------------------------------------------------------------------------------------------- + @Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false) + public BAQ.CalculationMode BAQMode = BAQ.CalculationMode.OFF; + /** + * Phred-scaled gap open penalty for BAQ calculation. Although the default value is 40, a value of 30 may be better for whole genome call sets. 
+ */ + @Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty", required = false, minValue = 0) + public double BAQGOP = BAQ.DEFAULT_GOP; + + // -------------------------------------------------------------------------------------------------------------- + // + // quality encoding checking arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * By default the GATK assumes that base quality scores start at Q0 == ASCII 33 according to the SAM specification. + * However, encoding in some datasets (especially older Illumina ones) starts at Q64. This argument will fix the + * encodings on the fly (as the data is read in) by subtracting 31 from every quality score. Note that this argument should + * NEVER be used by default; you should only use it when you have confirmed that the quality scores in your data are + * not in the correct encoding. + */ + @Argument(fullName = "fix_misencoded_quality_scores", shortName="fixMisencodedQuals", doc="Fix mis-encoded base quality scores", required = false) + public boolean FIX_MISENCODED_QUALS = false; + /** + * This flag tells GATK to ignore warnings when encountering base qualities that are too high and that seemingly + * indicate a problem with the base quality encoding of the BAM file. You should only use this if you really know + * what you are doing; otherwise you could seriously mess up your data and ruin your analysis. + */ + @Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Ignore warnings about base quality score encoding", required = false) + public boolean ALLOW_POTENTIALLY_MISENCODED_QUALS = false; + /** + * This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which + * are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. 
If no OQ + * tag is present for a read, the standard qual score will be used. + */ + @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "Use the base quality scores from the OQ tag", required=false) + public Boolean useOriginalBaseQualities = false; + /** + * If reads are missing some or all base quality scores, this value will be used for all base quality scores. + * By default this is set to -1 to disable default base quality assignment. + */ + @Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "Assign a default base quality", required=false, minValue = 0, maxValue = Byte.MAX_VALUE) + public byte defaultBaseQualities = -1; + + // -------------------------------------------------------------------------------------------------------------- + // + // performance log arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * The file name for the GATK performance log output, or null if you don't want to generate the + * detailed performance logging table. This table is suitable for importing into R or any + * other analysis software that can read tsv files. + */ + @Argument(fullName = "performanceLog", shortName="PF", doc="Write GATK runtime performance log to this file", required = false) + public File performanceLog = null; + + // -------------------------------------------------------------------------------------------------------------- + // + // BQSR arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Enables on-the-fly recalibrate of base qualities, intended primarily for use with BaseRecalibrator and PrintReads + * (see Best Practices workflow documentation). The covariates tables are produced by the BaseRecalibrator tool. + * Please be aware that you should only run recalibration with the covariates file created on the same input bam(s). 
+ */ + @Input(fullName="BQSR", shortName="BQSR", required=false, doc="Input covariates table file for on-the-fly base quality score recalibration") + public File BQSR_RECAL_FILE = null; + + /** + * Turns on the base quantization module. It requires a recalibration report (-BQSR). + * + * A value of 0 here means "do not quantize". + * Any value greater than zero will be used to recalculate the quantization using that many levels. + * Negative values mean that we should quantize using the recalibration report's quantization level. + */ + @Hidden + @Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels (with -BQSR)", required=false) + public int quantizationLevels = 0; + + /** + * Turns off printing of the base insertion and base deletion tags when using the -BQSR argument. Only the base substitution qualities will be produced. + */ + @Argument(fullName="disable_indel_quals", shortName = "DIQ", doc = "Disable printing of base insertion and deletion tags (with -BQSR)", required=false) + public boolean disableIndelQuals = false; + + /** + * By default, the OQ tag in not emitted when using the -BQSR argument. Use this flag to include OQ tags in the output BAM file. + * Note that this may results in significant file size increase. + */ + @Argument(fullName="emit_original_quals", shortName = "EOQ", doc = "Emit the OQ tag with the original base qualities (with -BQSR)", required=false) + public boolean emitOriginalQuals = false; + + /** + * This flag tells GATK not to modify quality scores less than this value. Instead they will be written out unmodified in the recalibrated BAM file. + * In general it's unsafe to change qualities scores below < 6, since base callers use these values to indicate random or bad bases. + * For example, Illumina writes Q2 bases when the machine has really gone wrong. 
This would be fine in and of itself, + * but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect, + * your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream. + */ + @Argument(fullName = "preserve_qscores_less_than", shortName = "preserveQ", doc = "Don't recalibrate bases with quality scores less than this threshold (with -BQSR)", required = false, minValue = 0, minRecommendedValue = QualityUtils.MIN_USABLE_Q_SCORE) + public int PRESERVE_QSCORES_LESS_THAN = QualityUtils.MIN_USABLE_Q_SCORE; + /** + * If specified, this value will be used as the prior for all mismatch quality scores instead of the actual reported quality score. + */ + @Argument(fullName = "globalQScorePrior", shortName = "globalQScorePrior", doc = "Global Qscore Bayesian prior to use for BQSR", required = false) + public double globalQScorePrior = -1.0; + + /** + * It is absolutely not recommended practice to run base quality score recalibration on BAM files that have been + * processed with ReduceReads. By default, the GATK will error out if it detects that you are trying to recalibrate + * a reduced BAM file. However, this flag allows you to disable the warning and proceed anyway. For the sake of your + * data, please only use this option if you really know what you are doing. 
+ */ + @Advanced + @Argument(fullName = "allow_bqsr_on_reduced_bams_despite_repeated_warnings", shortName="allowBqsrOnReducedBams", doc="Ignore all warnings about how it's a really bad idea to run BQSR on a reduced BAM file (AT YOUR OWN RISK!)", required = false) + public boolean ALLOW_BQSR_ON_REDUCED_BAMS = false; + + // -------------------------------------------------------------------------------------------------------------- + // + // Other utility arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Keep in mind that if you set this to LENIENT, we may refuse to provide you with support if anything goes wrong. + */ + @Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false) + public SAMFileReader.ValidationStringency strictnessLevel = SAMFileReader.ValidationStringency.SILENT; + /** + * Some tools keep program records in the SAM header by default. Use this argument to override that behavior and discard program records for the SAM header. + */ + @Argument(fullName = "remove_program_records", shortName = "rpr", doc = "Remove program records from the SAM header", required = false) + public boolean removeProgramRecords = false; + /** + * Some tools discard program records from the SAM header by default. Use this argument to override that behavior and keep program records in the SAM header. + */ + @Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Keep program records in the SAM header", required = false) + public boolean keepProgramRecords = false; + /** + * This option requires that each BAM file listed in the mapping file have only a single sample specified in its header + * (though there may be multiple read groups for that sample). 
Each line of the mapping file must contain the absolute + * path to a BAM file, followed by whitespace, followed by the new sample name for that BAM file. + */ + @Advanced + @Argument(fullName = "sample_rename_mapping_file", shortName = "sample_rename_mapping_file", doc = "Rename sample IDs on-the-fly at runtime using the provided mapping file", required = false) + public File sampleRenameMappingFile = null; + /** + * For expert users only who know what they are doing. We do not support usage of this argument, so we may refuse to help you if you use it and something goes wrong. + */ + @Argument(fullName = "unsafe", shortName = "U", doc = "Enable unsafe operations: nothing will be checked at runtime", required = false) + public ValidationExclusion.TYPE unsafe; + /** + * UNSAFE FOR GENERAL USE (FOR TEST SUITE USE ONLY). Disable both auto-generation of index files and index file locking + * when reading VCFs and other rods and an index isn't present or is out-of-date. The file locking necessary for auto index + * generation to work safely is prone to random failures/hangs on certain platforms, which makes it desirable to disable it + * for situations like test suite runs where the indices are already known to exist, however this option is unsafe in general + * because it allows reading from index files without first acquiring a lock. 
+ */ + @Hidden + @Advanced + @Argument(fullName = "disable_auto_index_creation_and_locking_when_reading_rods", shortName = "disable_auto_index_creation_and_locking_when_reading_rods", + doc = "Disable both auto-generation of index files and index file locking", + required = false) + public boolean disableAutoIndexCreationAndLockingWhenReadingRods = false; + + // -------------------------------------------------------------------------------------------------------------- + // + // Multi-threading arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Data threads contains N cpu threads per data thread, and act as completely data parallel processing, increasing + * the memory usage of GATK by M data threads. Data threads generally scale extremely effectively, up to 24 cores. + * See online documentation FAQs for more information. + */ + @Argument(fullName = "num_threads", shortName = "nt", doc = "Number of data threads to allocate to this analysis", required = false, minValue = 1) + public Integer numberOfDataThreads = 1; + + /** + * Each CPU thread operates the map cycle independently, but may run into earlier scaling problems with IO than + * data threads. Has the benefit of not requiring X times as much memory per thread as data threads do, but rather + * only a constant overhead. See online documentation FAQs for more information. 
+ */ + @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="Number of CPU threads to allocate per data thread", required = false, minValue = 1) + public int numberOfCPUThreadsPerDataThread = 1; + + @Argument(fullName="num_io_threads", shortName = "nit", doc="Number of given threads to allocate to IO", required = false, minValue = 0) + @Hidden + public int numberOfIOThreads = 0; + + /** + * Enable GATK to monitor its own threading efficiency, at an itsy-bitsy tiny + * cost (< 0.1%) in runtime because of turning on the JavaBean. This is largely for + * debugging purposes. Note that this argument is not compatible with -nt, it only works with -nct. + */ + @Argument(fullName = "monitorThreadEfficiency", shortName = "mte", doc = "Enable threading efficiency monitoring", required = false) + public Boolean monitorThreadEfficiency = false; + + @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="Total number of BAM file handles to keep open simultaneously", required=false, minValue = 1) + public Integer numberOfBAMFileHandles = null; + /** + * This will filter out read groups matching : (e.g. SM:sample1) or a .txt file containing the filter strings one per line. + */ + @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Exclude read groups based on tags", required = false) + public List readGroupBlackList = null; + + // -------------------------------------------------------------------------------------------------------------- + // + // PED (pedigree) support + // + // -------------------------------------------------------------------------------------------------------------- + + /** + *

Reads PED file-formatted tabular text files describing meta-data about the samples being + * processed in the GATK.

+ * + * + * + *

The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:

+ * + *
    + *
  • Family ID
  • + *
  • Individual ID
  • + *
  • Paternal ID
  • + *
  • Maternal ID
  • + *
  • Sex (1=male; 2=female; other=unknown)
  • + *
  • Phenotype
  • + *
+ * + *

The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. + * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a + * quantitative trait or an affection status column: GATK will automatically detect which type + * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).

+ * + *

If an individual's sex is unknown, then any character other than 1 or 2 can be used.

+ * + *

You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that + * line will be ignored. Do not start any family IDs with this character therefore.

+ * + *

Affection status should be coded:

+ * + *
    + *
  • -9 missing
  • + *
  • 0 missing
  • + *
  • 1 unaffected
  • + *
  • 2 affected
  • + *
+ * + *

If any value outside of -9,0,1,2 is detected than the samples are assumed + * to phenotype values are interpreted as string phenotype values. In this case -9 uniquely + * represents the missing value.

+ * + *

Genotypes (column 7 onwards) cannot be specified to the GATK.

+ * + *

For example, here are two individuals (one row = one person):

+ * + *
+     *   FAM001  1  0 0  1  2
+     *   FAM001  2  0 0  1  2
+     * 
+ * + *

Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to + * tell the GATK PED parser that the corresponding fields are missing from the ped file.

+ * + *

Note that most GATK walkers do not use pedigree information. Walkers that require pedigree + * data should clearly indicate so in their arguments and will throw errors if required pedigree + * information is missing.

+ */ + @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false) + public List pedigreeFiles = Collections.emptyList(); + + /** + * Inline PED records (see -ped argument). Each -pedString STRING can contain one or more + * valid PED records (see -ped) separated by semi-colons. Supports all tags for each pedString + * as -ped supports + */ + @Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false) + public List pedigreeStrings = Collections.emptyList(); + + /** + * How strict should we be in parsing the PED files? + */ + @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="Validation strictness for pedigree information",required=false) + public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT; + + // -------------------------------------------------------------------------------------------------------------- + // + // BAM indexing and sharding arguments + // + // -------------------------------------------------------------------------------------------------------------- + /** + * NO INTEGRATION TESTS are available. Use at your own risk. + */ + @Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM",required=false) + @Hidden + public boolean allowIntervalsWithUnindexedBAM = false; + + // -------------------------------------------------------------------------------------------------------------- + // + // testing BCF2 + // + // -------------------------------------------------------------------------------------------------------------- + /** + * If provided, whenever we create a VCFWriter we will also write out a BCF file alongside it, for testing purposes. 
+ */ + @Argument(fullName="generateShadowBCF",shortName = "generateShadowBCF",doc="Write a BCF copy of the output VCF",required=false) + @Hidden + public boolean generateShadowBCF = false; + // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed + + // -------------------------------------------------------------------------------------------------------------- + // + // VCF/BCF index parameters + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Specify the Tribble indexing strategy to use for VCFs. + * + * LINEAR creates a LinearIndex with bins of equal width, specified by the Bin Width parameter + * INTERVAL creates an IntervalTreeIndex with bins with an equal amount of features, specified by the Features Per Bin parameter + * DYNAMIC_SEEK attempts to optimize for minimal seek time by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) + * DYNAMIC_SIZE attempts to optimize for minimal index size by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) + */ + @Argument(fullName="variant_index_type",shortName = "variant_index_type",doc="Type of IndexCreator to use for VCF/BCF indices",required=false) + @Advanced + public GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE; + /** + * This is either the bin width or the number of features per bin, depending on the indexing strategy + */ + @Argument(fullName="variant_index_parameter",shortName = "variant_index_parameter",doc="Parameter to pass to the VCF/BCF IndexCreator",required=false) + @Advanced + public int variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; +} + diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/InvalidPositionException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/InvalidPositionException.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/InvalidPositionException.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/InvalidPositionException.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RODMetaDataContainer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/RODMetaDataContainer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RODMetaDataContainer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/RODMetaDataContainer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadShardDataProvider.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReadShardDataProvider.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadShardDataProvider.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReadShardDataProvider.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReadView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReadView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/View.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/View.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/View.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/View.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/package-info.java rename 
to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java rename 
to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexData.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexData.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexData.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexData.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShard.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/LocusShard.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShard.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/LocusShard.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java new file mode 100644 index 000000000..4f680ffc3 --- 
/dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -0,0 +1,1179 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.picard.sam.MergingSamRecordIterator; +import net.sf.picard.sam.SamFileHeaderMerger; +import net.sf.samtools.*; +import net.sf.samtools.util.CloseableIterator; +import net.sf.samtools.util.RuntimeIOException; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.ReadMetrics; +import org.broadinstitute.sting.gatk.ReadProperties; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.downsampling.*; +import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.iterators.*; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.baq.ReadTransformingIterator; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; + +import java.io.File; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.*; +import java.util.concurrent.Callable; + +/** + * User: aaron + * Date: Mar 26, 2009 + * Time: 2:36:16 PM + *

+ * Converts shards to SAM iterators over the specified region + */ +public class SAMDataSource { + final private static GATKSamRecordFactory factory = new GATKSamRecordFactory(); + + /** Backing support for reads. */ + protected final ReadProperties readProperties; + + /** + * Runtime metrics of reads filtered, etc. + */ + private final ReadMetrics readMetrics; + + /** + * Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering. + */ + protected final GenomeLocParser genomeLocParser; + + /** + * Identifiers for the readers driving this data source. + */ + private final Collection readerIDs; + + /** + * How strict are the readers driving this data source. + */ + private final SAMFileReader.ValidationStringency validationStringency; + + /** + * Do we want to remove the program records from this data source? + */ + private final boolean removeProgramRecords; + + /** + * Store BAM indices for each reader present. + */ + private final Map bamIndices = new HashMap(); + + /** + * The merged header. + */ + private final SAMFileHeader mergedHeader; + + /** + * The constituent headers of the unmerged files. + */ + private final Map headers = new HashMap(); + + /** + * The sort order of the BAM files. Files without a sort order tag are assumed to be + * in coordinate order. + */ + private SAMFileHeader.SortOrder sortOrder = null; + + /** + * Whether the read groups in overlapping files collide. + */ + private final boolean hasReadGroupCollisions; + + /** + * Maps the SAM readers' merged read group ids to their original ids. Since merged read group ids + * are always unique, we can simply use a map here, no need to stratify by reader. + */ + private final ReadGroupMapping mergedToOriginalReadGroupMappings = new ReadGroupMapping(); + + /** + * Maps the SAM readers' original read group ids to their revised ids. 
This mapping must be stratified + * by readers, since there can be readgroup id collision: different bam files (readers) can list the + * same read group id, which will be disambiguated when these input streams are merged. + */ + private final Map originalToMergedReadGroupMappings = new HashMap(); + + /** + * Mapping from bam file ID to new sample name. Used only when doing on-the-fly sample renaming. + */ + private Map sampleRenameMap = null; + + /** our log, which we want to capture anything from this class */ + private static Logger logger = Logger.getLogger(SAMDataSource.class); + + /** + * A collection of readers driving the merging process. + */ + private final SAMResourcePool resourcePool; + + /** + * Asynchronously loads BGZF blocks. + */ + private final BGZFBlockLoadingDispatcher dispatcher; + + /** + * How are threads allocated. + */ + private final ThreadAllocation threadAllocation; + + /** + * Create a new SAM data source given the supplied read metadata. + * + * For testing purposes + * + * @param samFiles list of reads files. + */ + public SAMDataSource(Collection samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) { + this( + samFiles, + threadAllocation, + numFileHandles, + genomeLocParser, + false, + SAMFileReader.ValidationStringency.STRICT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false); + } + + /** + * See complete constructor. Does not enable BAQ by default. 
+ * + * For testing purposes + */ + public SAMDataSource( + Collection samFiles, + ThreadAllocation threadAllocation, + Integer numFileHandles, + GenomeLocParser genomeLocParser, + boolean useOriginalBaseQualities, + SAMFileReader.ValidationStringency strictness, + Integer readBufferSize, + DownsamplingMethod downsamplingMethod, + ValidationExclusion exclusionList, + Collection supplementalFilters, + boolean includeReadsWithDeletionAtLoci) { + this( samFiles, + threadAllocation, + numFileHandles, + genomeLocParser, + useOriginalBaseQualities, + strictness, + readBufferSize, + downsamplingMethod, + exclusionList, + supplementalFilters, + Collections.emptyList(), + includeReadsWithDeletionAtLoci, + (byte) -1, + false, + false, + null); + } + + /** + * Create a new SAM data source given the supplied read metadata. + * @param samFiles list of reads files. + * @param useOriginalBaseQualities True if original base qualities should be used. + * @param strictness Stringency of reads file parsing. + * @param readBufferSize Number of reads to hold in memory per BAM. + * @param downsamplingMethod Method for downsampling reads at a given locus. + * @param exclusionList what safety checks we're willing to let slide + * @param supplementalFilters additional filters to dynamically apply. + * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method + * will explicitly list reads with deletion over the current reference base; otherwise, only observed + * bases will be seen in the pileups, and the deletions will be skipped silently. + * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. + * @param keepReadsInLIBS should we keep a unique list of reads in LIBS? + * @param sampleRenameMap Map of BAM file to new sample ID used during on-the-fly runtime sample renaming. + * Will be null if we're not doing sample renaming. 
+ */ + public SAMDataSource( + Collection samFiles, + ThreadAllocation threadAllocation, + Integer numFileHandles, + GenomeLocParser genomeLocParser, + boolean useOriginalBaseQualities, + SAMFileReader.ValidationStringency strictness, + Integer readBufferSize, + DownsamplingMethod downsamplingMethod, + ValidationExclusion exclusionList, + Collection supplementalFilters, + List readTransformers, + boolean includeReadsWithDeletionAtLoci, + byte defaultBaseQualities, + boolean removeProgramRecords, + final boolean keepReadsInLIBS, + final Map sampleRenameMap) { + + this.readMetrics = new ReadMetrics(); + this.genomeLocParser = genomeLocParser; + + readerIDs = samFiles; + + this.threadAllocation = threadAllocation; + // TODO: Consider a borrowed-thread dispatcher implementation. + if(this.threadAllocation.getNumIOThreads() > 0) { + logger.info("Running in asynchronous I/O mode; number of threads = " + this.threadAllocation.getNumIOThreads()); + dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1); + } + else + dispatcher = null; + + validationStringency = strictness; + this.removeProgramRecords = removeProgramRecords; + if(readBufferSize != null) + ReadShard.setReadBufferSize(readBufferSize); // TODO: use of non-final static variable here is just awful, especially for parallel tests + else { + // Choose a sensible default for the read buffer size. + // Previously we we're picked 100000 reads per BAM per shard with a max cap of 250K reads in memory at once. + // Now we are simply setting it to 100K reads + ReadShard.setReadBufferSize(100000); + } + + this.sampleRenameMap = sampleRenameMap; + + resourcePool = new SAMResourcePool(Integer.MAX_VALUE); + SAMReaders readers = resourcePool.getAvailableReaders(); + + // Determine the sort order. + for(SAMReaderID readerID: readerIDs) { + if (! 
readerID.samFile.canRead() ) + throw new UserException.CouldNotReadInputFile(readerID.samFile,"file is not present or user does not have appropriate permissions. " + + "Please check that the file is present and readable and try again."); + + // Get the sort order, forcing it to coordinate if unsorted. + SAMFileReader reader = readers.getReader(readerID); + SAMFileHeader header = reader.getFileHeader(); + + headers.put(readerID,header); + + if ( header.getReadGroups().isEmpty() ) { + throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile, + "SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups"); + } + + SAMFileHeader.SortOrder sortOrder = header.getSortOrder() != SAMFileHeader.SortOrder.unsorted ? header.getSortOrder() : SAMFileHeader.SortOrder.coordinate; + + // Validate that all input files are sorted in the same order. + if(this.sortOrder != null && this.sortOrder != sortOrder) + throw new UserException.MissortedBAM(String.format("Attempted to process mixed of files sorted as %s and %s.",this.sortOrder,sortOrder)); + + // Update the sort order. + this.sortOrder = sortOrder; + } + + mergedHeader = readers.getMergedHeader(); + hasReadGroupCollisions = readers.hasReadGroupCollisions(); + + readProperties = new ReadProperties( + samFiles, + mergedHeader, + sortOrder, + useOriginalBaseQualities, + strictness, + downsamplingMethod, + exclusionList, + supplementalFilters, + readTransformers, + includeReadsWithDeletionAtLoci, + defaultBaseQualities, + keepReadsInLIBS); + + // cache the read group id (original) -> read group id (merged) + // and read group id (merged) -> read group id (original) mappings. 
+ for(SAMReaderID id: readerIDs) { + SAMFileReader reader = readers.getReader(id); + checkForReducedBamFile(reader.getFileHeader()); + + ReadGroupMapping mappingToMerged = new ReadGroupMapping(); + + List readGroups = reader.getFileHeader().getReadGroups(); + for(SAMReadGroupRecord readGroup: readGroups) { + if(hasReadGroupCollisions) { + mappingToMerged.put(readGroup.getReadGroupId(),readers.getReadGroupId(id,readGroup.getReadGroupId())); + mergedToOriginalReadGroupMappings.put(readers.getReadGroupId(id,readGroup.getReadGroupId()),readGroup.getReadGroupId()); + } else { + mappingToMerged.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); + mergedToOriginalReadGroupMappings.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); + } + } + + originalToMergedReadGroupMappings.put(id,mappingToMerged); + } + + for(SAMReaderID id: readerIDs) { + File indexFile = findIndexFile(id.samFile); + if(indexFile != null) + bamIndices.put(id,new GATKBAMIndex(indexFile)); + } + + resourcePool.releaseReaders(readers); + } + + /** + * Checks whether the provided SAM header if from a reduced bam file. 
+ * @param header the SAM header for a given file + * @throws UserException if the header is from a reduced bam + */ + private void checkForReducedBamFile(final SAMFileHeader header) { + if ( header.getProgramRecord("GATK ReduceReads") != null ) + throw new UserException("The GATK no longer supports running off of BAMs produced by ReduceReads"); + } + + public void close() { + SAMReaders readers = resourcePool.getAvailableReaders(); + for(SAMReaderID readerID: readerIDs) { + SAMFileReader reader = readers.getReader(readerID); + reader.close(); + } + } + + /** + * Returns Reads data structure containing information about the reads data sources placed in this pool as well as + * information about how they are downsampled, sorted, and filtered + * @return + */ + public ReadProperties getReadsInfo() { return readProperties; } + + /** + * Checks to see whether any reads files are supplying data. + * @return True if no reads files are supplying data to the traversal; false otherwise. + */ + public boolean isEmpty() { + return readProperties.getSAMReaderIDs().size() == 0; + } + + /** + * Gets the SAM file associated with a given reader ID. + * @param id The reader for which to retrieve the source file. + * @return the file actually associated with the id. + */ + public File getSAMFile(SAMReaderID id) { + return id.samFile; + } + + /** + * Returns readers used by this data source. + * @return A list of SAM reader IDs. + */ + public Collection getReaderIDs() { + return readerIDs; + } + + /** + * Retrieves the id of the reader which built the given read. + * @param read The read to test. + * @return ID of the reader. + */ + public SAMReaderID getReaderID(SAMRecord read) { + return resourcePool.getReaderID(read.getFileSource().getReader()); + } + + /** + * Gets the merged header from the SAM file. + * @return The merged header. 
+     */
+    public SAMFileHeader getHeader() {
+        return mergedHeader;
+    }
+
+    /**
+     * Gets the (unmerged) header cached for a single underlying reader.
+     * @param id ID of the reader whose header to fetch.
+     * @return the header cached for that reader.
+     */
+    public SAMFileHeader getHeader(SAMReaderID id) {
+        return headers.get(id);
+    }
+
+    /**
+     * Gets the revised read group id mapped to this 'original' read group id.
+     * @param reader Reader for which to grab a read group.
+     * @param originalReadGroupId ID of the original read group.
+     * @return Merged read group ID.
+     */
+    public String getReadGroupId(final SAMReaderID reader, final String originalReadGroupId) {
+        return originalToMergedReadGroupMappings.get(reader).get(originalReadGroupId);
+    }
+
+    /**
+     * Gets the original read group id (as it was specified in the original input bam file) that maps onto
+     * this 'merged' read group id.
+     * @param mergedReadGroupId 'merged' ID of the read group (as it is presented by the read received from merged input stream).
+     * @return Original read group ID.
+     */
+    public String getOriginalReadGroupId(final String mergedReadGroupId) {
+        return mergedToOriginalReadGroupMappings.get(mergedReadGroupId);
+    }
+
+    /**
+     * True if all readers have an index.
+     * @return True if all readers have an index.
+     */
+    public boolean hasIndex() {
+        // An index is registered per reader at initialization, so equal sizes imply
+        // every reader had an index file on disk.
+        return readerIDs.size() == bamIndices.size();
+    }
+
+    /**
+     * Gets the index for a particular reader. Always preloaded.
+     * @param id Id of the reader.
+     * @return The index. Will preload the index if necessary.
+     */
+    public GATKBAMIndex getIndex(final SAMReaderID id) {
+        return bamIndices.get(id);
+    }
+
+    /**
+     * Retrieves the sort order of the readers.
+     * @return Sort order. Can be unsorted, coordinate order, or query name order.
+     */
+    public SAMFileHeader.SortOrder getSortOrder() {
+        return sortOrder;
+    }
+
+    /**
+     * Gets the cumulative read metrics for shards already processed.
+     * @return Cumulative read metrics.
+     */
+    public ReadMetrics getCumulativeReadMetrics() {
+        // don't return a clone here because the engine uses a pointer to this object
+        return readMetrics;
+    }
+
+    /**
+     * Incorporate the given read metrics into the cumulative read metrics.
+     * @param readMetrics The 'incremental' read metrics, to be incorporated into the cumulative metrics.
+     */
+    public void incorporateReadMetrics(final ReadMetrics readMetrics) {
+        this.readMetrics.incrementMetrics(readMetrics);
+    }
+
+    /**
+     * Produces an iterator over the reads in the given shard: shards that buffer
+     * their own reads supply their own iterator; otherwise reads are pulled from
+     * the reader pool.
+     * @param shard shard to iterate over.
+     * @return an iterator over the shard's reads.
+     */
+    public StingSAMIterator seek(Shard shard) {
+        if(shard.buffersReads()) {
+            return shard.iterator();
+        }
+        else {
+            return getIterator(shard);
+        }
+    }
+
+    /**
+     * Gets the reader associated with the given read.
+     * @param readers Available readers.
+     * @param read Read whose source reader to look up.
+     * @return ID of the reader that produced the read.
+     */
+    private SAMReaderID getReaderID(SAMReaders readers, SAMRecord read) {
+        for(SAMReaderID id: getReaderIDs()) {
+            if(readers.getReader(id) == read.getFileSource().getReader())
+                return id;
+        }
+        throw new ReviewedStingException("Unable to find id for reader associated with read " + read.getReadName());
+    }
+
+    /**
+     * Get the initial reader positions across all BAM files
+     *
+     * @return the start positions of the first chunk of reads for all BAM files
+     */
+    protected Map getInitialReaderPositions() {
+        Map initialPositions = new HashMap();
+        SAMReaders readers = resourcePool.getAvailableReaders();
+
+        for ( SAMReaderID id: getReaderIDs() ) {
+            initialPositions.put(id, new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads()));
+        }
+
+        resourcePool.releaseReaders(readers);
+        return initialPositions;
+    }
+
+    /**
+     * Get an iterator over the data types specified in the shard.
+     *
+     * @param shard The shard specifying the data limits.
+     * @return An iterator over the selected data.
+     */
+    protected StingSAMIterator getIterator( Shard shard ) {
+        return getIterator(resourcePool.getAvailableReaders(), shard, shard instanceof ReadShard);
+    }
+
+    /**
+     * Get an iterator over the data types specified in the shard.
+     * @param readers Readers from which to load data.
+     * @param shard The shard specifying the data limits.
+     * @param enableVerification True to verify. For compatibility with old sharding strategy.
+     * @return An iterator over the selected data.
+     */
+    private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) {
+        // Set up merging to dynamically merge together multiple BAMs.
+        // NOTE(review): generic type parameters (e.g. Map<...>) appear to have been
+        // stripped from this patch text; restore them before compiling.
+        Map> iteratorMap = new HashMap>();
+
+        for(SAMReaderID id: getReaderIDs()) {
+            CloseableIterator iterator = null;
+
+            // TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin.
+            // TODO: Kill this check once we've proven that the design elements are gone.
+            if(shard.getFileSpans().get(id) == null)
+                throw new ReviewedStingException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported.");
+
+            try {
+                if(threadAllocation.getNumIOThreads() > 0) {
+                    // Asynchronous I/O path: decode records directly off the shared block
+                    // input stream, restricted to the shard's file span.
+                    BlockInputStream inputStream = readers.getInputStream(id);
+                    inputStream.submitAccessPlan(new BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) shard.getFileSpans().get(id)));
+                    BAMRecordCodec codec = new BAMRecordCodec(getHeader(id),factory);
+                    codec.setInputStream(inputStream);
+                    iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec);
+                }
+                else {
+                    // Synchronous path: let the reader iterate the shard's span itself.
+                    iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
+                }
+            } catch ( RuntimeException e ) { // we need to catch RuntimeExceptions here because the Picard code is throwing them (among SAMFormatExceptions) sometimes
+                throw new UserException.MalformedBAM(id.samFile, e.getMessage());
+            }
+
+            iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator);
+            if(shard.getGenomeLocs().size() > 0)
+                iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
+
+            iteratorMap.put(readers.getReader(id), iterator);
+        }
+
+        MergingSamRecordIterator mergingIterator = readers.createMergingIterator(iteratorMap);
+
+        // The readMetrics object being passed in should be that of this dataSource and NOT the shard: the dataSource's
+        // metrics is intended to keep track of the reads seen (and hence passed to the CountingFilteringIterator when
+        // we apply the decorators), whereas the shard's metrics is used to keep track the "records" seen.
+        return applyDecoratingIterators(readMetrics,
+                enableVerification,
+                readProperties.useOriginalBaseQualities(),
+                new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)),
+                readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
+                readProperties.getSupplementalFilters(),
+                readProperties.getReadTransformers(),
+                readProperties.defaultBaseQualities(),
+                shard instanceof LocusShard);
+    }
+
+    /**
+     * Iterator that decodes BAM records straight from a BlockInputStream via a
+     * BAMRecordCodec, tagging each decoded read with the file span it occupied.
+     */
+    private class BAMCodecIterator implements CloseableIterator {
+        private final BlockInputStream inputStream;
+        private final SAMFileReader reader;
+        private final BAMRecordCodec codec;
+        private SAMRecord nextRead;
+
+        private BAMCodecIterator(final BlockInputStream inputStream, final SAMFileReader reader, final BAMRecordCodec codec) {
+            this.inputStream = inputStream;
+            this.reader = reader;
+            this.codec = codec;
+            // Prime the one-read lookahead used by hasNext()/next().
+            advance();
+        }
+
+        public boolean hasNext() {
+            return nextRead != null;
+        }
+
+        public SAMRecord next() {
+            if(!hasNext())
+                throw new NoSuchElementException("Unable to retrieve next record from BAMCodecIterator; input stream is empty");
+            SAMRecord currentRead = nextRead;
+            advance();
+            return currentRead;
+        }
+
+        public void close() {
+            // NO-OP.
+        }
+
+        public void remove() {
+            throw new UnsupportedOperationException("Unable to remove from BAMCodecIterator");
+        }
+
+        /**
+         * Decode the next read and record the file span it occupied, so downstream
+         * consumers can map the read back to its position in the source BAM.
+         */
+        private void advance() {
+            final long startCoordinate = inputStream.getFilePointer();
+            nextRead = codec.decode();
+            final long stopCoordinate = inputStream.getFilePointer();
+
+            if(reader != null && nextRead != null)
+                PicardNamespaceUtils.setFileSource(nextRead,new SAMFileSource(reader,new GATKBAMFileSpan(new GATKChunk(startCoordinate,stopCoordinate))));
+        }
+    }
+
+    /**
+     * Filter reads based on user-specified criteria.
+     *
+     * @param readMetrics metrics to track when using this iterator.
+     * @param enableVerification Verify the order of reads.
+     * @param useOriginalBaseQualities True if original base qualities should be used.
+     * @param wrappedIterator the raw data source.
+     * @param noValidationOfReadOrder Another trigger for the verifying iterator? TODO: look into this.
+     * @param supplementalFilters additional filters to apply to the reads.
+     * @param readTransformers read transformers to apply to the reads on input.
+     * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality.
+     * @param isLocusBasedTraversal true if we're dealing with a read stream from a LocusShard
+     * @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null.
+     */
+    protected StingSAMIterator applyDecoratingIterators(ReadMetrics readMetrics,
+                                                        boolean enableVerification,
+                                                        boolean useOriginalBaseQualities,
+                                                        StingSAMIterator wrappedIterator,
+                                                        Boolean noValidationOfReadOrder,
+                                                        Collection supplementalFilters,
+                                                        List readTransformers,
+                                                        byte defaultBaseQualities,
+                                                        boolean isLocusBasedTraversal ) {
+
+        // Always apply the ReadFormattingIterator before both ReadFilters and ReadTransformers. At a minimum,
+        // this will consolidate the cigar strings into canonical form. This has to be done before the read
+        // filtering, because not all read filters will behave correctly with things like zero-length cigar
+        // elements. If useOriginalBaseQualities is true or defaultBaseQualities >= 0, this iterator will also
+        // modify the base qualities.
+        wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
+
+        // Read Filters: these are applied BEFORE downsampling, so that we downsample within the set of reads
+        // that actually survive filtering. Otherwise we could get much less coverage than requested.
+        wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
+
+        // Downsampling:
+
+        // For locus traversals where we're downsampling to coverage by sample, assume that the downsamplers
+        // will be invoked downstream from us in LocusIteratorByState. This improves performance by avoiding
+        // splitting/re-assembly of the read stream at this stage, and also allows for partial downsampling
+        // of individual reads.
+        boolean assumeDownstreamLIBSDownsampling = isLocusBasedTraversal &&
+                readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
+                readProperties.getDownsamplingMethod().toCoverage != null;
+
+        // Apply downsampling iterators here only in cases where we know that LocusIteratorByState won't be
+        // doing any downsampling downstream of us
+        if ( ! assumeDownstreamLIBSDownsampling ) {
+            wrappedIterator = applyDownsamplingIterator(wrappedIterator);
+        }
+
+        // unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification,
+        // verify the read ordering by applying a sort order iterator
+        if (!noValidationOfReadOrder && enableVerification)
+            wrappedIterator = new VerifyingSamIterator(wrappedIterator);
+
+        // Read transformers: these are applied last, so that we don't bother transforming reads that get discarded
+        // by the read filters or downsampler.
+        for ( final ReadTransformer readTransformer : readTransformers ) {
+            // Only apply transformers that are enabled and scheduled to run on input.
+            if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT )
+                wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer);
+        }
+
+        return wrappedIterator;
+    }
+
+    /**
+     * Wraps the given iterator with the downsampling iterator appropriate for the
+     * configured downsampling method, or returns it unchanged when downsampling
+     * is disabled.
+     * @param wrappedIterator iterator to wrap.
+     * @return a (possibly) downsampling iterator over the same reads.
+     */
+    protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) {
+        if ( readProperties.getDownsamplingMethod() == null ||
+             readProperties.getDownsamplingMethod().type == DownsampleType.NONE ) {
+            return wrappedIterator;
+        }
+
+        if ( readProperties.getDownsamplingMethod().toFraction != null ) {
+
+            // If we're downsampling to a fraction of reads, there's no point in paying the cost of
+            // splitting/re-assembling the read stream by sample to run the FractionalDownsampler on
+            // reads from each sample separately, since the result would be the same as running the
+            // FractionalDownsampler on the entire stream. So, ALWAYS use the DownsamplingReadsIterator
+            // rather than the PerSampleDownsamplingReadsIterator, even if BY_SAMPLE downsampling
+            // was requested.
+
+            return new DownsamplingReadsIterator(wrappedIterator,
+                    new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction));
+        }
+        else if ( readProperties.getDownsamplingMethod().toCoverage != null ) {
+
+            // If we're downsampling to coverage, we DO need to pay the cost of splitting/re-assembling
+            // the read stream to run the downsampler on the reads for each individual sample separately if
+            // BY_SAMPLE downsampling was requested.
+
+            if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) {
+                return new PerSampleDownsamplingReadsIterator(wrappedIterator,
+                        new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage));
+            }
+            else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) {
+                return new DownsamplingReadsIterator(wrappedIterator,
+                        new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage));
+            }
+        }
+
+        return wrappedIterator;
+    }
+
+
+    /**
+     * A bounded pool of SAMReaders sets, handed out to consumers and returned
+     * when iteration completes.
+     */
+    private class SAMResourcePool {
+        /**
+         * How many entries can be cached in this resource pool?
+         */
+        private final int maxEntries;
+
+        /**
+         * All iterators of this reference-ordered data.
+         */
+        private List allResources = new ArrayList();
+
+        /**
+         * All iterators that are not currently in service.
+         */
+        private List availableResources = new ArrayList();
+
+        public SAMResourcePool(final int maxEntries) {
+            this.maxEntries = maxEntries;
+        }
+
+        /**
+         * Choose a set of readers from the pool to use for this query. A new set
+         * is created on demand when none is available.
+         * @return an available set of readers, removed from the available list.
+         */
+        public synchronized SAMReaders getAvailableReaders() {
+            if(availableResources.size() == 0)
+                createNewResource();
+            SAMReaders readers = availableResources.get(0);
+            availableResources.remove(readers);
+            return readers;
+        }
+
+        /**
+         * Return a set of readers to the pool; the set must have originated here.
+         * @param readers readers to release.
+         */
+        public synchronized void releaseReaders(SAMReaders readers) {
+            if(!allResources.contains(readers))
+                throw new ReviewedStingException("Tried to return readers from the pool that didn't originate in the pool.");
+            availableResources.add(readers);
+        }
+
+        /**
+         * Gets the reader id for the given reader.
+         * @param reader Reader for which to determine the id.
+         * @return id of the given reader.
+         */
+        protected synchronized SAMReaderID getReaderID(SAMFileReader reader) {
+            for(SAMReaders readers: allResources) {
+                SAMReaderID id = readers.getReaderID(reader);
+                if(id != null)
+                    return id;
+            }
+            throw new ReviewedStingException("No such reader id is available");
+        }
+
+        /**
+         * Open a fresh set of readers and register it with the pool.
+         * NOTE(review): the capacity check uses '>' rather than '>=', so the pool
+         * can grow to maxEntries + 1 resources -- confirm whether that is intended.
+         */
+        private synchronized void createNewResource() {
+            if(allResources.size() > maxEntries)
+                throw new ReviewedStingException("Cannot create a new resource pool. All resources are in use.");
+            SAMReaders readers = new SAMReaders(readerIDs, validationStringency, removeProgramRecords);
+            allResources.add(readers);
+            availableResources.add(readers);
+        }
+
+    }
+
+    /**
+     * A collection of readers derived from a reads metadata structure.
+     */
+    private class SAMReaders implements Iterable {
+        /**
+         * Cached representation of the merged header used to generate a merging iterator.
+         */
+        private final SamFileHeaderMerger headerMerger;
+
+        /**
+         * Internal storage for a map of id -> reader.
+         */
+        private final Map readers = new LinkedHashMap();
+
+        /**
+         * The input streams backing each reader; populated only when dedicated
+         * I/O threads are in use.
+         */
+        private final Map inputStreams = new LinkedHashMap();
+
+        /**
+         * Derive a new set of readers from the Reads metadata.
+         * @param readerIDs reads to load.
+         * TODO: validationStringency is not used here
+         * @param validationStringency validation stringency.
+         * @param removeProgramRecords indicate whether to clear program records from the readers
+         */
+        public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency, boolean removeProgramRecords) {
+            final int totalNumberOfFiles = readerIDs.size();
+            int readerNumber = 1;
+            final SimpleTimer timer = new SimpleTimer().start();
+
+            if ( totalNumberOfFiles > 0 ) logger.info("Initializing SAMRecords in serial");
+            // Emit a progress/performance line every tickSize files initialized.
+            final int tickSize = 50;
+            int nExecutedTotal = 0;
+            long lastTick = timer.currentTime();
+            for(final SAMReaderID readerID: readerIDs) {
+                final ReaderInitializer init = new ReaderInitializer(readerID).call();
+
+                if (removeProgramRecords) {
+                    init.reader.getFileHeader().setProgramRecords(new ArrayList());
+                }
+
+                if (threadAllocation.getNumIOThreads() > 0) {
+                    inputStreams.put(init.readerID, init.blockInputStream); // get from initializer
+                }
+
+                logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.samFile));
+                readers.put(init.readerID,init.reader);
+                if ( ++nExecutedTotal % tickSize == 0) {
+                    double tickInSec = (timer.currentTime() - lastTick) / 1000.0;
+                    printReaderPerformance(nExecutedTotal, tickSize, totalNumberOfFiles, timer, tickInSec);
+                    lastTick = timer.currentTime();
+                }
+            }
+
+            if ( totalNumberOfFiles > 0 ) logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime()));
+
+            Collection headers = new LinkedList();
+
+            // Examine the bam headers, perform any requested sample renaming on them, and add
+            // them to the list of headers to pass to the Picard SamFileHeaderMerger:
+            for ( final Map.Entry readerEntry : readers.entrySet() ) {
+                final SAMReaderID readerID = readerEntry.getKey();
+                final SAMFileReader reader = readerEntry.getValue();
+                final SAMFileHeader header = reader.getFileHeader();
+
+                // The remappedSampleName will be null if either no on-the-fly sample renaming was requested,
+                // or the user's sample rename map file didn't contain an entry for this bam file:
+                final String remappedSampleName = sampleRenameMap != null ? sampleRenameMap.get(readerID) : null;
+
+                // If we've been asked to rename the sample for this bam file, do so now. We'll check to
+                // make sure this bam only contains reads from one sample before proceeding.
+                //
+                // IMPORTANT: relies on the fact that the Picard SamFileHeaderMerger makes a copy of
+                //            the existing read group attributes (including sample name) when merging
+                //            headers, regardless of whether there are read group collisions or not.
+                if ( remappedSampleName != null ) {
+                    remapSampleName(readerID, header, remappedSampleName);
+                }
+
+                headers.add(header);
+            }
+
+            headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,headers,true);
+
+            // update all read groups to GATKSAMRecordReadGroups
+            final List gatkReadGroups = new LinkedList();
+            for ( final SAMReadGroupRecord rg : headerMerger.getMergedHeader().getReadGroups() ) {
+                gatkReadGroups.add(new GATKSAMReadGroupRecord(rg));
+            }
+            headerMerger.getMergedHeader().setReadGroups(gatkReadGroups);
+        }
+
+        /**
+         * Changes the sample name in the read groups for the provided bam file header to match the
+         * remappedSampleName. Blows up with a UserException if the header contains more than one
+         * sample name.
+         *
+         * @param readerID ID for the bam file from which the provided header came from
+         * @param header The bam file header. Will be modified by this call.
+         * @param remappedSampleName New sample name to replace the existing sample attribute in the
+         *                           read groups for the header.
+         */
+        private void remapSampleName( final SAMReaderID readerID, final SAMFileHeader header, final String remappedSampleName ) {
+            String firstEncounteredSample = null;
+
+            for ( final SAMReadGroupRecord readGroup : header.getReadGroups() ) {
+                final String thisReadGroupSample = readGroup.getSample();
+
+                if ( thisReadGroupSample == null ) {
+                    throw new UserException(String.format("On-the fly sample renaming was requested for bam file %s, however this " +
+                                                          "bam file contains a read group (id: %s) with a null sample attribute",
+                                                          readerID.getSamFilePath(), readGroup.getId()));
+                }
+                else if ( firstEncounteredSample == null ) {
+                    firstEncounteredSample = thisReadGroupSample;
+                }
+                else if ( ! firstEncounteredSample.equals(thisReadGroupSample) ) {
+                    throw new UserException(String.format("On-the-fly sample renaming was requested for bam file %s, " +
+                                                          "however this bam file contains reads from more than one sample " +
+                                                          "(encountered samples %s and %s in the bam header). The GATK requires that " +
+                                                          "all bams for which on-the-fly sample renaming is requested " +
+                                                          "contain reads from only a single sample per bam.",
+                                                          readerID.getSamFilePath(), firstEncounteredSample, thisReadGroupSample));
+                }
+
+                readGroup.setSample(remappedSampleName);
+            }
+        }
+
+        /**
+         * Log incremental reader-initialization throughput and an estimated time to completion.
+         * @param nExecutedTotal total files initialized so far.
+         * @param nExecutedInTick files initialized since the previous tick.
+         * @param totalNumberOfFiles total number of files to initialize.
+         * @param timer timer started at the beginning of initialization.
+         * @param tickDurationInSec wall time of the last tick, in seconds.
+         */
+        final private void printReaderPerformance(final int nExecutedTotal,
+                                                  final int nExecutedInTick,
+                                                  final int totalNumberOfFiles,
+                                                  final SimpleTimer timer,
+                                                  final double tickDurationInSec) {
+            final int pendingSize = totalNumberOfFiles - nExecutedTotal;
+            final double totalTimeInSeconds = timer.getElapsedTime();
+            final double nTasksPerSecond = nExecutedTotal / (1.0*totalTimeInSeconds);
+            final int nRemaining = pendingSize;
+            final double estTimeToComplete = pendingSize / nTasksPerSecond;
+            logger.info(String.format("Init %d BAMs in last %.2f s, %d of %d in %.2f s / %.2f m (%.2f tasks/s). %d remaining with est. completion in %.2f s / %.2f m",
+                    nExecutedInTick, tickDurationInSec,
+                    nExecutedTotal, totalNumberOfFiles, totalTimeInSeconds, totalTimeInSeconds / 60, nTasksPerSecond,
+                    nRemaining, estTimeToComplete, estTimeToComplete / 60));
+        }
+
+        /**
+         * Return the header derived from the merging of these BAM files.
+         * @return the merged header.
+         */
+        public SAMFileHeader getMergedHeader() {
+            return headerMerger.getMergedHeader();
+        }
+
+        /**
+         * Do multiple read groups collide in this dataset?
+         * @return True if multiple read groups collide; false otherwise.
+         */
+        public boolean hasReadGroupCollisions() {
+            return headerMerger.hasReadGroupCollisions();
+        }
+
+        /**
+         * Get the newly mapped read group ID for the given read group.
+         * @param readerID Reader for which to discern the transformed ID.
+         * @param originalReadGroupID Original read group.
+         * @return Remapped read group.
+         */
+        public String getReadGroupId(final SAMReaderID readerID, final String originalReadGroupID) {
+            SAMFileHeader header = readers.get(readerID).getFileHeader();
+            return headerMerger.getReadGroupId(header,originalReadGroupID);
+        }
+
+        /**
+         * Creates a new merging iterator from the given map, with the given header.
+         * @param iteratorMap A map of readers to iterators.
+         * @return An iterator which will merge those individual iterators.
+         */
+        public MergingSamRecordIterator createMergingIterator(final Map> iteratorMap) {
+            return new MergingSamRecordIterator(headerMerger,iteratorMap,true);
+        }
+
+        /**
+         * Retrieve the reader from the data structure.
+         * @param id The ID of the reader to retrieve.
+         * @return the reader associated with the given id.
+         */
+        public SAMFileReader getReader(SAMReaderID id) {
+            if(!readers.containsKey(id))
+                throw new NoSuchElementException("No reader is associated with id " + id);
+            return readers.get(id);
+        }
+
+        /**
+         * Retrieve the input stream backing a reader.
+         * @param id The ID of the reader to retrieve.
+         * @return the reader associated with the given id.
+         */
+        public BlockInputStream getInputStream(final SAMReaderID id) {
+            return inputStreams.get(id);
+        }
+
+        /**
+         * Searches for the reader id of this reader.
+         * @param reader Reader for which to search.
+         * @return The id associated the given reader, or null if the reader is not present in this collection.
+         */
+        protected SAMReaderID getReaderID(SAMFileReader reader) {
+            for(Map.Entry entry: readers.entrySet()) {
+                if(reader == entry.getValue())
+                    return entry.getKey();
+            }
+            // Not found? return null.
+            return null;
+        }
+
+        /**
+         * Returns an iterator over all readers in this structure.
+         * @return An iterator over readers.
+         */
+        public Iterator iterator() {
+            return readers.values().iterator();
+        }
+
+        /**
+         * Returns whether any readers are present in this structure.
+         * @return true if no readers are registered; false otherwise.
+         */
+        public boolean isEmpty() {
+            return readers.isEmpty();
+        }
+    }
+
+    /**
+     * Opens a single BAM reader (and, when I/O threads are enabled, its backing
+     * block input stream), translating low-level failures into UserExceptions.
+     */
+    class ReaderInitializer implements Callable {
+        final SAMReaderID readerID;
+        BlockInputStream blockInputStream = null;
+        SAMFileReader reader;
+
+        public ReaderInitializer(final SAMReaderID readerID) {
+            this.readerID = readerID;
+        }
+
+        /**
+         * Open the reader (and optional block input stream) for this reader ID.
+         * @return this initializer, with the reader (and possibly blockInputStream) populated.
+         */
+        public ReaderInitializer call() {
+            final File indexFile = findIndexFile(readerID.samFile);
+            try {
+                if (threadAllocation.getNumIOThreads() > 0)
+                    blockInputStream = new BlockInputStream(dispatcher,readerID,false);
+                reader = new SAMFileReader(readerID.samFile,indexFile,false);
+            } catch ( RuntimeIOException e ) {
+                throw new UserException.CouldNotReadInputFile(readerID.samFile, e);
+            } catch ( SAMFormatException e ) {
+                throw new UserException.MalformedBAM(readerID.samFile, e.getMessage());
+            }
+            // Picard is throwing a RuntimeException here when BAMs are malformed with bad headers (and so look like SAM files).
+            // Let's keep this separate from the SAMFormatException (which ultimately derives from RuntimeException) case,
+            // just in case we want to change this behavior later.
+            catch ( RuntimeException e ) {
+                throw new UserException.MalformedBAM(readerID.samFile, e.getMessage());
+            }
+            reader.setSAMRecordFactory(factory);
+            reader.enableFileSource(true);
+            reader.setValidationStringency(validationStringency);
+            return this;
+        }
+    }
+
+    /**
+     * Iterator wrapper that, on close, releases its reader set back to the
+     * enclosing data source's resource pool.
+     */
+    private class ReleasingIterator implements StingSAMIterator {
+        /**
+         * The resource acting as the source of the data.
+         */
+        private final SAMReaders resource;
+
+        /**
+         * The iterator to wrap.
+         */
+        private final StingSAMIterator wrappedIterator;
+
+        public ReleasingIterator(SAMReaders resource, StingSAMIterator wrapped) {
+            this.resource = resource;
+            this.wrappedIterator = wrapped;
+        }
+
+        public ReleasingIterator iterator() {
+            return this;
+        }
+
+        public void remove() {
+            throw new UnsupportedOperationException("Can't remove from a StingSAMIterator");
+        }
+
+        public void close() {
+            wrappedIterator.close();
+            resourcePool.releaseReaders(resource);
+        }
+
+        public boolean hasNext() {
+            return wrappedIterator.hasNext();
+        }
+
+        public SAMRecord next() {
+            return wrappedIterator.next();
+        }
+    }
+
+    /**
+     * Maps read groups in the original SAMFileReaders to read groups in
+     * the merged stream (NOTE(review): original comment was truncated here -- confirm).
+     */
+    private class ReadGroupMapping extends HashMap {}
+
+    /**
+     * Locates the index file alongside the given BAM, if present.
+     * TODO: This is currently a hatchet job that reaches into Picard and pulls out its index file locator. Replace with something more permanent.
+     * @param bamFile The data file to use.
+     * @return A File object if the index file is present; null otherwise.
+     */
+    private File findIndexFile(File bamFile) {
+        File indexFile;
+
+        try {
+            // Reflectively invoke net.sf.samtools.BAMFileReader.findIndexFile(File),
+            // which is not part of Picard's public API.
+            Class bamFileReaderClass = Class.forName("net.sf.samtools.BAMFileReader");
+            Method indexFileLocator = bamFileReaderClass.getDeclaredMethod("findIndexFile",File.class);
+            indexFileLocator.setAccessible(true);
+            indexFile = (File)indexFileLocator.invoke(null,bamFile);
+        }
+        catch(ClassNotFoundException ex) {
+            throw new ReviewedStingException("Unable to locate BAMFileReader class, used to check for index files");
+        }
+        catch(NoSuchMethodException ex) {
+            throw new ReviewedStingException("Unable to locate Picard index file locator.");
+        }
+        catch(IllegalAccessException ex) {
+            throw new ReviewedStingException("Unable to access Picard index file locator.");
+        }
+        catch(InvocationTargetException ex) {
+            throw new ReviewedStingException("Unable to invoke Picard index file locator.");
+        }
+
+        return indexFile;
+    }
+
+    /**
+     * Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped. The outgoing stream
+     * will be as granular as possible given our current knowledge of the best ways to split up BAM files.
+     * @return An iterator that spans all reads in all BAM files.
+     */
+    public Iterable createShardIteratorOverAllReads(final ShardBalancer shardBalancer) {
+        shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser);
+        return shardBalancer;
+    }
+
+    /**
+     * Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any
+     * read that has been assigned (NOTE(review): original comment was truncated here -- confirm).
+     *
+     * @param shardBalancer shard balancer object
+     * @return non-null initialized version of the shard balancer
+     */
+    public Iterable createShardIteratorOverMappedReads(final ShardBalancer shardBalancer) {
+        shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,genomeLocParser),genomeLocParser);
+        return shardBalancer;
+    }
+
+    /**
+     * Create a schedule for processing the initialized BAM file using the given interval list.
+     * The returned schedule should be as granular as possible.
+     * @param intervals The list of intervals for which to create the schedule.
+     * @param shardBalancer balancer which will produce the shards over the intervals.
+     * @return A granular iterator over file pointers.
+     */
+    public Iterable createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) {
+        if(intervals == null)
+            throw new ReviewedStingException("Unable to create schedule from intervals; no intervals were provided.");
+        shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals),genomeLocParser);
+        return shardBalancer;
+    }
+}
+
+
+
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java
similarity index 100%
rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java
rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/Shard.java
similarity index 100%
rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java
rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/Shard.java
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java
similarity index 100%
rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java
rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/package-info.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMFileStat.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMFileStat.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMFileStat.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMFileStat.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMTagRenamer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMTagRenamer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMTagRenamer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMTagRenamer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBAMRegion.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBAMRegion.java similarity index 100% rename 
from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBAMRegion.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBAMRegion.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBGZFBounds.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBGZFBounds.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBGZFBounds.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBGZFBounds.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/UnzipSingleBlock.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/UnzipSingleBlock.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/UnzipSingleBlock.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/UnzipSingleBlock.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java 
rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reference/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reference/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reference/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/DataStreamSegment.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/DataStreamSegment.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/DataStreamSegment.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/DataStreamSegment.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/EntireStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/EntireStream.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/EntireStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/EntireStream.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/MappedStreamSegment.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/MappedStreamSegment.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/MappedStreamSegment.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/MappedStreamSegment.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPool.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPool.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPool.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPool.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ResourcePool.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/ResourcePool.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ResourcePool.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/ResourcePool.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/package-info.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java new file mode 100644 index 000000000..56c370276 --- /dev/null +++ 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -0,0 +1,368 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.collections.DefaultHashMap; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.*; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.text.XReadLines; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +import org.apache.log4j.Logger; + +public class AlleleBiasedDownsamplingUtils { + + // define this class so that we can use Java generics below + private final static class PileupElementList extends ArrayList {} + + /** + * Computes an allele biased version of the given pileup + * + * @param pileup the original pileup + * @param downsamplingFraction the fraction of total reads to remove per allele + * @return allele biased pileup + */ + public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) { + // special case removal of all or no reads + if ( downsamplingFraction <= 0.0 ) + return pileup; + if ( downsamplingFraction >= 1.0 ) + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList()); + + final PileupElementList[] alleleStratifiedElements = new PileupElementList[4]; + for ( int i = 0; i < 4; i++ ) + alleleStratifiedElements[i] = new PileupElementList(); + + // start by stratifying the reads by the alleles they represent at this position + for ( final PileupElement pe : pileup ) { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); + if ( baseIndex != -1 ) + alleleStratifiedElements[baseIndex].add(pe); + } + + // make a listing of allele counts and calculate the total count + final int[] alleleCounts = 
calculateAlleleCounts(alleleStratifiedElements); + final int totalAlleleCount = (int)MathUtils.sum(alleleCounts); + + // do smart down-sampling + final int numReadsToRemove = (int)(totalAlleleCount * downsamplingFraction); // floor + final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); + + final HashSet readsToRemove = new HashSet(numReadsToRemove); + for ( int i = 0; i < 4; i++ ) { + final PileupElementList alleleList = alleleStratifiedElements[i]; + // if we don't need to remove any reads, then don't + if ( alleleCounts[i] > targetAlleleCounts[i] ) + readsToRemove.addAll(downsampleElements(alleleList, alleleCounts[i], alleleCounts[i] - targetAlleleCounts[i])); + } + + // we need to keep the reads sorted because the FragmentUtils code will expect them in coordinate order and will fail otherwise + final List readsToKeep = new ArrayList(totalAlleleCount - numReadsToRemove); + for ( final PileupElement pe : pileup ) { + if ( !readsToRemove.contains(pe) ) { + readsToKeep.add(pe); + } + } + + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(readsToKeep)); + } + + /** + * Calculates actual allele counts for each allele (which can be different than the list size when reduced reads are present) + * + * @param alleleStratifiedElements pileup elements stratified by allele + * @return non-null int array representing allele counts + */ + private static int[] calculateAlleleCounts(final PileupElementList[] alleleStratifiedElements) { + final int[] alleleCounts = new int[alleleStratifiedElements.length]; + for ( int i = 0; i < alleleStratifiedElements.length; i++ ) { + alleleCounts[i] = alleleStratifiedElements[i].size(); + } + return alleleCounts; + } + + private static int scoreAlleleCounts(final int[] alleleCounts) { + if ( alleleCounts.length < 2 ) + return 0; + + // sort the counts (in ascending order) + final int[] alleleCountsCopy = alleleCounts.clone(); + Arrays.sort(alleleCountsCopy); + + final int maxCount = 
alleleCountsCopy[alleleCounts.length - 1]; + final int nextBestCount = alleleCountsCopy[alleleCounts.length - 2]; + + int remainderCount = 0; + for ( int i = 0; i < alleleCounts.length - 2; i++ ) + remainderCount += alleleCountsCopy[i]; + + // try to get the best score: + // - in the het case the counts should be equal with nothing else + // - in the hom case the non-max should be zero + return Math.min(maxCount - nextBestCount + remainderCount, Math.abs(nextBestCount + remainderCount)); + } + + /** + * Computes an allele biased version of the allele counts for a given pileup + * + * @param alleleCounts the allele counts for the original pileup + * @param numReadsToRemove number of total reads to remove per allele + * @return non-null array of new counts needed per allele + */ + protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) { + final int numAlleles = alleleCounts.length; + + int maxScore = scoreAlleleCounts(alleleCounts); + int[] alleleCountsOfMax = alleleCounts; + + final int numReadsToRemovePerAllele = numReadsToRemove / 2; + + for ( int i = 0; i < numAlleles; i++ ) { + for ( int j = i; j < numAlleles; j++ ) { + final int[] newCounts = alleleCounts.clone(); + + // split these cases so we don't lose on the floor (since we divided by 2) + if ( i == j ) { + newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemove); + } else { + newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemovePerAllele); + newCounts[j] = Math.max(0, newCounts[j] - numReadsToRemovePerAllele); + } + + final int score = scoreAlleleCounts(newCounts); + + if ( score < maxScore ) { + maxScore = score; + alleleCountsOfMax = newCounts; + } + } + } + + return alleleCountsOfMax; + } + + /** + * Performs allele biased down-sampling on a pileup and computes the list of elements to remove + * + * @param elements original list of pileup elements + * @param originalElementCount original count of elements (taking reduced reads into account) + * @param 
numElementsToRemove the number of records to remove + * @return the list of pileup elements TO REMOVE + */ + protected static List downsampleElements(final List elements, final int originalElementCount, final int numElementsToRemove) { + // are there no elements to remove? + if ( numElementsToRemove == 0 ) + return Collections.emptyList(); + + final ArrayList elementsToRemove = new ArrayList(numElementsToRemove); + + // should we remove all of the elements? + if ( numElementsToRemove >= originalElementCount ) { + elementsToRemove.addAll(elements); + return elementsToRemove; + } + + // create a bitset describing which elements to remove + final BitSet itemsToRemove = new BitSet(originalElementCount); + for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) { + itemsToRemove.set(selectedIndex); + } + + int currentBitSetIndex = 0; + for ( final PileupElement element : elements ) { + if ( itemsToRemove.get(currentBitSetIndex++) ) { + elementsToRemove.add(element); + } + } + + return elementsToRemove; + } + + /** + * Computes reads to remove based on an allele biased down-sampling + * + * @param alleleReadMap original list of records per allele + * @param downsamplingFraction the fraction of total reads to remove per allele + * @return list of reads TO REMOVE from allele biased down-sampling + */ + public static List selectAlleleBiasedReads(final Map> alleleReadMap, final double downsamplingFraction) { + int totalReads = 0; + for ( final List reads : alleleReadMap.values() ) + totalReads += reads.size(); + + int numReadsToRemove = (int)(totalReads * downsamplingFraction); + + // make a listing of allele counts + final List alleles = new ArrayList(alleleReadMap.keySet()); + alleles.remove(Allele.NO_CALL); // ignore the no-call bin + final int numAlleles = alleles.size(); + + final int[] alleleCounts = new int[numAlleles]; + for ( int i = 0; i < numAlleles; i++ ) + alleleCounts[i] = 
alleleReadMap.get(alleles.get(i)).size(); + + // do smart down-sampling + final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); + + final List readsToRemove = new ArrayList(numReadsToRemove); + for ( int i = 0; i < numAlleles; i++ ) { + if ( alleleCounts[i] > targetAlleleCounts[i] ) { + readsToRemove.addAll(downsampleElements(alleleReadMap.get(alleles.get(i)), alleleCounts[i] - targetAlleleCounts[i])); + } + } + + return readsToRemove; + } + + /** + * Performs allele biased down-sampling on a pileup and computes the list of elements to remove + * + * @param reads original list of records + * @param numElementsToRemove the number of records to remove + * @return the list of pileup elements TO REMOVE + */ + protected static List downsampleElements(final List reads, final int numElementsToRemove) { + // are there no elements to remove? + if ( numElementsToRemove == 0 ) + return Collections.emptyList(); + + final ArrayList elementsToRemove = new ArrayList(numElementsToRemove); + final int originalElementCount = reads.size(); + + // should we remove all of the elements? 
+ if ( numElementsToRemove >= originalElementCount ) { + elementsToRemove.addAll(reads); + return elementsToRemove; + } + + // create a bitset describing which elements to remove + final BitSet itemsToRemove = new BitSet(originalElementCount); + for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) { + itemsToRemove.set(selectedIndex); + } + + int currentBitSetIndex = 0; + for ( final GATKSAMRecord read : reads ) { + if ( itemsToRemove.get(currentBitSetIndex++) ) + elementsToRemove.add(read); + } + + return elementsToRemove; + } + + /** + * Create sample-contamination maps from file + * + * @param ContaminationFractionFile Filename containing two columns: SampleID and Contamination + * @param AvailableSampleIDs Set of Samples of interest (no reason to include every sample in file) or null to turn off checking + * @param logger for logging output + * @return sample-contamination Map + */ + + public static DefaultHashMap loadContaminationFile(File ContaminationFractionFile, final Double defaultContaminationFraction, final Set AvailableSampleIDs, Logger logger) throws StingException { + DefaultHashMap sampleContamination = new DefaultHashMap(defaultContaminationFraction); + Set nonSamplesInContaminationFile = new HashSet(sampleContamination.keySet()); + try { + + XReadLines reader = new XReadLines(ContaminationFractionFile, true); + for (String line : reader) { + + if (line.length() == 0) { + continue; + } + + StringTokenizer st = new StringTokenizer(line,"\t"); + + String fields[] = new String[2]; + try { + fields[0] = st.nextToken(); + fields[1] = st.nextToken(); + } catch(NoSuchElementException e){ + throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. Offending line:\n" + line); + } + if(st.hasMoreTokens()) { + throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. 
Offending line:\n" + line); + } + + if (fields[0].length() == 0 || fields[1].length() == 0) { + throw new UserException.MalformedFile("Contamination file can not have empty strings in either column. Offending line:\n" + line); + } + + if (sampleContamination.containsKey(fields[0])) { + throw new UserException.MalformedFile("Contamination file contains duplicate entries for input name " + fields[0]); + } + + try { + final Double contamination = Double.valueOf(fields[1]); + if (contamination < 0 || contamination > 1){ + throw new UserException.MalformedFile("Contamination file contains unacceptable contamination value (must be 0<=x<=1): " + line); + } + if (AvailableSampleIDs==null || AvailableSampleIDs.contains(fields[0])) {// only add samples if they are in the sampleSet (or if it is null) + sampleContamination.put(fields[0], contamination); + } + else { + nonSamplesInContaminationFile.add(fields[0]); + } + } catch (NumberFormatException e) { + throw new UserException.MalformedFile("Contamination file contains unparsable double in the second field. 
Offending line: " + line); + } + } + + + //output to the user info lines telling which samples are in the Contamination File + if (sampleContamination.size() > 0) { + logger.info(String.format("The following samples were found in the Contamination file and will be processed at the contamination level therein: %s", sampleContamination.keySet().toString())); + + //output to the user info lines telling which samples are NOT in the Contamination File + if(AvailableSampleIDs!=null){ + Set samplesNotInContaminationFile = new HashSet(AvailableSampleIDs); + samplesNotInContaminationFile.removeAll(sampleContamination.keySet()); + if (samplesNotInContaminationFile.size() > 0) + logger.info(String.format("The following samples were NOT found in the Contamination file and will be processed at the default contamination level: %s", samplesNotInContaminationFile.toString())); + } + } + + //output to the user Samples that do not have lines in the Contamination File + if (nonSamplesInContaminationFile.size() > 0) { + logger.info(String.format("The following entries were found in the Contamination file but were not SAMPLEIDs. 
They will be ignored: %s", nonSamplesInContaminationFile.toString())); + } + + return sampleContamination; + + } catch (IOException e) { + throw new StingException("I/O Error while reading sample-contamination file " + ContaminationFractionFile.getName() + ": " + e.getMessage()); + } + + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/Downsampler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/Downsampler.java new file mode 100644 index 000000000..7b42f75f9 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/Downsampler.java @@ -0,0 +1,161 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.downsampling; + +import java.util.Collection; +import java.util.List; + +/** + * The basic downsampler API, with no reads-specific operations. + * + * Downsamplers that extend this class rather than the ReadsDownsampler class can handle + * any kind of item, however they cannot be wrapped within a DownsamplingReadsIterator or a + * PerSampleDownsamplingReadsIterator. + * + * @author David Roazen + */ +public abstract class Downsampler { + + /** + * Number of items discarded by this downsampler since the last call to resetStats() + */ + protected int numDiscardedItems = 0; + + /** + * Submit one item to the downsampler for consideration. Some downsamplers will be able to determine + * immediately whether the item survives the downsampling process, while others will need to see + * more items before making that determination. + * + * @param item the individual item to submit to the downsampler for consideration + */ + public abstract void submit( final T item ); + + /** + * Submit a collection of items to the downsampler for consideration. Should be equivalent to calling + * submit() on each individual item in the collection. + * + * @param items the collection of items to submit to the downsampler for consideration + */ + public void submit( final Collection items ) { + if ( items == null ) { + throw new IllegalArgumentException("submitted items must not be null"); + } + + for ( final T item : items ) { + submit(item); + } + } + + /** + * Are there items that have survived the downsampling process waiting to be retrieved? 
+ * + * @return true if this downsampler has > 0 finalized items, otherwise false + */ + public abstract boolean hasFinalizedItems(); + + /** + * Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved. + * + * @return a list of all finalized items this downsampler contains, or an empty list if there are none + */ + public abstract List consumeFinalizedItems(); + + /** + * Are there items stored in this downsampler that it doesn't yet know whether they will + * ultimately survive the downsampling process? + * + * @return true if this downsampler has > 0 pending items, otherwise false + */ + public abstract boolean hasPendingItems(); + + /** + * Peek at the first finalized item stored in this downsampler (or null if there are no finalized items) + * + * @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call), + * or null if there are none + */ + public abstract T peekFinalized(); + + /** + * Peek at the first pending item stored in this downsampler (or null if there are no pending items) + * + * @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call), + * or null if there are none + */ + public abstract T peekPending(); + + /** + * Get the current number of items in this downsampler + * + * This should be the best estimate of the total number of elements that will come out of the downsampler + * were consumeFinalizedItems() to be called immediately after this call. In other words it should + * be number of finalized items + estimate of number of pending items that will ultimately be included as well. 
+ * + * @return a positive integer + */ + public abstract int size(); + + /** + * Returns the number of items discarded (so far) during the downsampling process + * + * @return the number of items that have been submitted to this downsampler and discarded in the process of + * downsampling + */ + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } + + /** + * Used to tell the downsampler that no more items will be submitted to it, and that it should + * finalize any pending items. + */ + public abstract void signalEndOfInput(); + + /** + * Empty the downsampler of all finalized/pending items + */ + public abstract void clearItems(); + + /** + * Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items + */ + public void resetStats() { + numDiscardedItems = 0; + } + + /** + * Indicates whether an item should be excluded from elimination during downsampling. By default, + * all items representing reduced reads are excluded from downsampling, but individual downsamplers + * may override if they are able to handle reduced reads correctly. Downsamplers should check + * the return value of this method before discarding an item. 
+ * + * @param item The item to test + * @return true if the item should not be subject to elimination during downsampling, otherwise false + */ + protected boolean doNotDiscardItem( final Object item ) { + return false; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/Accumulator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/Accumulator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/Accumulator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/Accumulator.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/MicroScheduler.java new file mode 100644 index 000000000..405c07392 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -0,0 +1,463 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without 
limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.executive; + +import com.google.java.contract.Ensures; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.ReadMetrics; +import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.io.OutputTracker; +import org.broadinstitute.sting.gatk.iterators.NullSAMIterator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.gatk.traversals.*; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.AutoFormattingTime; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import 
org.broadinstitute.sting.utils.progressmeter.ProgressMeter; +import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; + +import javax.management.JMException; +import javax.management.MBeanServer; +import javax.management.ObjectName; +import java.io.File; +import java.lang.management.ManagementFactory; +import java.util.*; + + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Apr 26, 2009 + * Time: 12:37:23 PM + * + * General base class for all scheduling algorithms + * Shards and schedules data in manageable chunks. + * + * Creates N TraversalEngines for each data thread for the MicroScheduler. This is necessary + * because in the HMS case you have multiple threads executing a traversal engine independently, and + * these engines may need to create separate resources for efficiency or implementation reasons. For example, + * the nanoScheduler creates threads to implement the traversal, and this creation is instance specific. + * So each HMS thread needs to have it's own distinct copy of the traversal engine if it wants to have + * N data threads x M nano threads => N * M threads total. These are borrowed from this microscheduler + * and returned when done. Also allows us to tracks all created traversal engines so this microscheduler + * can properly shut them all down when the scheduling is done. + * + */ +public abstract class MicroScheduler implements MicroSchedulerMBean { + protected static final Logger logger = Logger.getLogger(MicroScheduler.class); + + /** + * The list of all Traversal engines we've created in this micro scheduler + */ + final List allCreatedTraversalEngines = new LinkedList(); + + /** + * All available engines. Engines are borrowed and returned when a subclass is actually + * going to execute the engine on some data. This allows us to have N copies for + * N data parallel executions, but without the dangerous code of having local + * ThreadLocal variables. 
+ */ + final LinkedList availableTraversalEngines = new LinkedList(); + + /** + * Engines that have been allocated to a key already. + */ + final HashMap allocatedTraversalEngines = new HashMap(); + + /** + * Counts the number of instances of the class that are currently alive. + */ + private static int instanceNumber = 0; + + /** + * The engine invoking this scheduler. + */ + protected final GenomeAnalysisEngine engine; + + protected final IndexedFastaSequenceFile reference; + + private final SAMDataSource reads; + protected final Collection rods; + + private final MBeanServer mBeanServer; + private final ObjectName mBeanName; + + /** + * Threading efficiency monitor for tracking the resource utilization of the GATK + * + * may be null + */ + ThreadEfficiencyMonitor threadEfficiencyMonitor = null; + + /** + * MicroScheduler factory function. Create a microscheduler appropriate for reducing the + * selected walker. + * + * @param walker Which walker to use. + * @param reads the informations associated with the reads + * @param reference the reference file + * @param rods the rods to include in the traversal + * @param threadAllocation Number of threads to utilize. + * + * @return The best-fit microscheduler. 
+ */ + public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { + if ( threadAllocation.isRunningInParallelMode() ) { + logger.info(String.format("Running the GATK in parallel mode with %d total threads, " + + "%d CPU thread(s) for each of %d data thread(s), of %d processors available on this machine", + threadAllocation.getTotalNumThreads(), + threadAllocation.getNumCPUThreadsPerDataThread(), + threadAllocation.getNumDataThreads(), + Runtime.getRuntime().availableProcessors())); + if ( threadAllocation.getTotalNumThreads() > Runtime.getRuntime().availableProcessors() ) + logger.warn(String.format("Number of requested GATK threads %d is more than the number of " + + "available processors on this machine %d", threadAllocation.getTotalNumThreads(), + Runtime.getRuntime().availableProcessors())); + } + + if ( threadAllocation.getNumDataThreads() > 1 ) { + if (walker.isReduceByInterval()) + throw new UserException.BadArgumentValue("nt", String.format("This run of %s is set up to aggregate results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option or check if this tool has an option to disable per-interval calculations.", engine.getWalkerName(walker.getClass()))); + + if ( ! (walker instanceof TreeReducible) ) { + throw badNT("nt", engine, walker); + } + } + + if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! 
(walker instanceof NanoSchedulable) ) { + throw badNT("nct", engine, walker); + } + + if ( threadAllocation.getNumDataThreads() > 1 ) { + return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); + } else { + return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); + } + } + + private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) { + throw new UserException.BadArgumentValue(parallelArg, + String.format("The analysis %s currently does not support parallel execution with %s. " + + "Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg)); + } + + /** + * Create a microscheduler given the reads and reference. + * + * @param walker the walker to execute with + * @param reads The reads. + * @param reference The reference. + * @param rods the rods to include in the traversal + * @param threadAllocation the allocation of threads to use in the underlying traversal + */ + protected MicroScheduler(final GenomeAnalysisEngine engine, + final Walker walker, + final SAMDataSource reads, + final IndexedFastaSequenceFile reference, + final Collection rods, + final ThreadAllocation threadAllocation) { + this.engine = engine; + this.reads = reads; + this.reference = reference; + this.rods = rods; + + final File progressLogFile = engine.getArguments() == null ? null : engine.getArguments().performanceLog; + + // Creates uninitialized TraversalEngines appropriate for walker and threadAllocation, + // and adds it to the list of created engines for later shutdown. 
+ for ( int i = 0; i < threadAllocation.getNumDataThreads(); i++ ) { + final TraversalEngine traversalEngine = createTraversalEngine(walker, threadAllocation); + allCreatedTraversalEngines.add(traversalEngine); + availableTraversalEngines.add(traversalEngine); + } + + // Create the progress meter, and register it with the analysis engine + engine.registerProgressMeter(new ProgressMeter(progressLogFile, + availableTraversalEngines.peek().getTraversalUnits(), + engine.getRegionsOfGenomeBeingProcessed())); + + // Now that we have a progress meter, go through and initialize the traversal engines + for ( final TraversalEngine traversalEngine : allCreatedTraversalEngines ) + traversalEngine.initialize(engine, walker, engine.getProgressMeter()); + + // JMX does not allow multiple instances with the same ObjectName to be registered with the same platform MXBean. + // To get around this limitation and since we have no job identifier at this point, register a simple counter that + // will count the number of instances of this object that have been created in this JVM. 
+ int thisInstance = instanceNumber++; + mBeanServer = ManagementFactory.getPlatformMBeanServer(); + try { + mBeanName = new ObjectName("org.broadinstitute.sting.gatk.executive:type=MicroScheduler,instanceNumber="+thisInstance); + mBeanServer.registerMBean(this, mBeanName); + } + catch (JMException ex) { + throw new ReviewedStingException("Unable to register microscheduler with JMX", ex); + } + } + + /** + * Really make us a traversal engine of the appropriate type for walker and thread allocation + * + * @return a non-null uninitialized traversal engine + */ + @Ensures("result != null") + private TraversalEngine createTraversalEngine(final Walker walker, final ThreadAllocation threadAllocation) { + if (walker instanceof ReadWalker) { + return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); + } else if (walker instanceof LocusWalker) { + return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); + } else if (walker instanceof DuplicateWalker) { + return new TraverseDuplicates(); + } else if (walker instanceof ReadPairWalker) { + return new TraverseReadPairs(); + } else if (walker instanceof ActiveRegionWalker) { + return new TraverseActiveRegions(threadAllocation.getNumCPUThreadsPerDataThread()); + } else { + throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); + } + } + + + /** + * Return the ThreadEfficiencyMonitor we are using to track our resource utilization, if there is one + * + * @return the monitor, or null if none is active + */ + public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { + return threadEfficiencyMonitor; + } + + /** + * Inform this Microscheduler to use the efficiency monitor used to create threads in subclasses + * + * @param threadEfficiencyMonitor + */ + public void setThreadEfficiencyMonitor(final ThreadEfficiencyMonitor threadEfficiencyMonitor) { + this.threadEfficiencyMonitor = threadEfficiencyMonitor; + } + + /** + * Should we 
stop all execution work and exit gracefully? + * + * Returns true in the case where some external signal or time limit has been received, indicating + * that this GATK shouldn't continue executing. This isn't a kill signal, it is really a "shutdown + * gracefully at the next opportunity" signal. Concrete implementations of the MicroScheduler + * examine this value as often as reasonable and, if it returns true, stop what they are doing + * at the next available opportunity, shutdown their resources, call notify done, and return. + * + * @return true if we should abort execution, or false otherwise + */ + protected boolean abortExecution() { + final boolean abort = engine.exceedsRuntimeLimit(); + if ( abort ) { + final AutoFormattingTime aft = new AutoFormattingTime(engine.getRuntimeLimitInNanoseconds(), -1, 4); + logger.info("Aborting execution (cleanly) because the runtime has exceeded the requested maximum " + aft); + } + return abort; + } + + /** + * Walks a walker over the given list of intervals. + * + * @param walker Computation to perform over dataset. + * @param shardStrategy A strategy for sharding the data. + * + * @return the return type of the walker + */ + public abstract Object execute(Walker walker, Iterable shardStrategy); + + /** + * Tells this MicroScheduler that the execution of one of the subclass of this object as started + * + * Must be called when the implementation of execute actually starts up + * + * Currently only starts the progress meter timer running, but other start up activities could be incorporated + */ + protected void startingExecution() { + engine.getProgressMeter().start(); + } + + /** + * Retrieves the object responsible for tracking and managing output. + * @return An output tracker, for loading data in and extracting results. Will not be null. + */ + public abstract OutputTracker getOutputTracker(); + + /** + * Gets the an iterator over the given reads, which will iterate over the reads in the given shard. 
+ * @param shard the shard to use when querying reads. + * @return an iterator over the reads specified in the shard. + */ + protected StingSAMIterator getReadIterator(Shard shard) { + return (!reads.isEmpty()) ? reads.seek(shard) : new NullSAMIterator(); + } + + /** + * Must be called by subclasses when execute is done + */ + protected void executionIsDone() { + engine.getProgressMeter().notifyDone(engine.getCumulativeMetrics().getNumIterations()); + printReadFilteringStats(); + shutdownTraversalEngines(); + + // Print out the threading efficiency of this HMS, if state monitoring is enabled + if ( threadEfficiencyMonitor != null ) { + // include the master thread information + threadEfficiencyMonitor.threadIsDone(Thread.currentThread()); + threadEfficiencyMonitor.printUsageInformation(logger); + } + } + + /** + * Shutdown all of the created engines, and clear the list of created engines, dropping + * pointers to the traversal engines + */ + public synchronized void shutdownTraversalEngines() { + for ( final TraversalEngine te : allCreatedTraversalEngines) + te.shutdown(); + + allCreatedTraversalEngines.clear(); + availableTraversalEngines.clear(); + } + + /** + * Prints out information about number of reads observed and filtering, if any reads were used in the traversal + * + * Looks like: + * + * INFO 10:40:47,370 MicroScheduler - 22 reads were filtered out during traversal out of 101 total (21.78%) + * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing BadMateFilter + * INFO 10:40:47,370 MicroScheduler - -> 20 reads (19.80% of total) failing DuplicateReadFilter + * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing FailsVendorQualityCheckFilter + */ + private void printReadFilteringStats() { + final ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics(); + if ( cumulativeMetrics.getNumReadsSeen() > 0 ) { + // count up the number of skipped reads by summing over all filters + long nSkippedReads = 0L; + for ( final 
long countsByFilter : cumulativeMetrics.getCountsByFilter().values()) + nSkippedReads += countsByFilter; + + logger.info(String.format("%d reads were filtered out during the traversal out of approximately %d total reads (%.2f%%)", + nSkippedReads, + cumulativeMetrics.getNumReadsSeen(), + 100.0 * MathUtils.ratio(nSkippedReads, cumulativeMetrics.getNumReadsSeen()))); + + for ( final Map.Entry filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) { + long count = filterCounts.getValue(); + logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s", + count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), filterCounts.getKey())); + } + } + } + + /** + * Gets the engine that created this microscheduler. + * @return The engine owning this microscheduler. + */ + public GenomeAnalysisEngine getEngine() { return engine; } + + /** + * Returns data source maintained by this scheduler + * @return + */ + public SAMDataSource getSAMDataSource() { return reads; } + + /** + * Returns the reference maintained by this scheduler. + * @return The reference maintained by this scheduler. + */ + public IndexedFastaSequenceFile getReference() { return reference; } + + protected void cleanup() { + try { + mBeanServer.unregisterMBean(mBeanName); + } + catch (JMException ex) { + throw new ReviewedStingException("Unable to unregister microscheduler with JMX", ex); + } + } + + /** + * Returns a traversal engine suitable for use, associated with key + * + * Key is an arbitrary object that is used to retrieve the same traversal + * engine over and over. This can be important in the case where the + * traversal engine has data associated with it in some other context, + * and we need to ensure that the context always sees the same traversal + * engine. This happens in the HierarchicalMicroScheduler, where you want + * the a thread executing traversals to retrieve the same engine each time, + * as outputs are tracked w.r.t. that engine. 
+ * + * If no engine is associated with key yet, pops the next available engine + * from the available ones maintained by this + * microscheduler. Note that it's a runtime error to pop a traversal engine + * from this scheduler if there are none available. Callers that + * once pop'd an engine for use must return it with returnTraversalEngine + * + * @param key the key to associate with this engine + * @return a non-null TraversalEngine suitable for execution in this scheduler + */ + @Ensures("result != null") + protected synchronized TraversalEngine borrowTraversalEngine(final Object key) { + if ( key == null ) throw new IllegalArgumentException("key cannot be null"); + + final TraversalEngine engine = allocatedTraversalEngines.get(key); + if ( engine == null ) { + if ( availableTraversalEngines.isEmpty() ) + throw new IllegalStateException("no traversal engines were available"); + allocatedTraversalEngines.put(key, availableTraversalEngines.pop()); + return allocatedTraversalEngines.get(key); + } else { + return engine; + } + } + + /** + * Return a borrowed traversal engine to this MicroScheduler, for later use + * in another traversal execution + * + * @param key the key used to id the engine, provided to the borrowTraversalEngine function + * @param traversalEngine the borrowed traversal engine. Must have been previously borrowed. + */ + protected synchronized void returnTraversalEngine(final Object key, final TraversalEngine traversalEngine) { + if ( traversalEngine == null ) + throw new IllegalArgumentException("Attempting to push a null traversal engine"); + if ( ! allCreatedTraversalEngines.contains(traversalEngine) ) + throw new IllegalArgumentException("Attempting to push a traversal engine not created by this MicroScheduler" + engine); + if ( ! 
allocatedTraversalEngines.containsKey(key) ) + throw new IllegalArgumentException("No traversal engine was never checked out with key " + key); + + // note there's nothing to actually do here, but a function implementation + // might want to do something + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/OutputMergeTask.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/OutputMergeTask.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/OutputMergeTask.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/OutputMergeTask.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ReduceTree.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/ReduceTree.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/ReduceTree.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/ReduceTree.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/ShardTraverser.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/ShardTraverser.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/TreeReducer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/TreeReducer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/WindowMaker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/WindowMaker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/BadMateFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/BadMateFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/BadMateFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/BadMateFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/DuplicateReadFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/DuplicateReadFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/DuplicateReadFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/DuplicateReadFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/FilterManager.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/FilterManager.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/LibraryReadFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/LibraryReadFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/LibraryReadFilter.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/LibraryReadFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MateSameStrandFilter.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MateSameStrandFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MateSameStrandFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MateSameStrandFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MaxInsertSizeFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MaxInsertSizeFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MaxInsertSizeFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MaxInsertSizeFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MissingReadGroupFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MissingReadGroupFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MissingReadGroupFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MissingReadGroupFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/NoOriginalQualityScoresFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/NoOriginalQualityScoresFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/NoOriginalQualityScoresFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/NoOriginalQualityScoresFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/Platform454Filter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/Platform454Filter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/Platform454Filter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/Platform454Filter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/PlatformFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/PlatformFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformUnitFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/PlatformUnitFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/PlatformUnitFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/PlatformUnitFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformUnitFilterHelper.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/PlatformUnitFilterHelper.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/PlatformUnitFilterHelper.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/PlatformUnitFilterHelper.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadFilter.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/filters/ReadFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadLengthFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadLengthFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/ReadLengthFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadLengthFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadNameFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadNameFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/ReadNameFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadNameFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadStrandFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadStrandFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/ReadStrandFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadStrandFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/SampleFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/SampleFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/SampleFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/SampleFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/SingleReadGroupFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/SingleReadGroupFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/SingleReadGroupFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/SingleReadGroupFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/DirectOutputTracker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/DirectOutputTracker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/DirectOutputTracker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/DirectOutputTracker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/FastqFileWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/FastqFileWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/FastqFileWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/FastqFileWriter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/OutputTracker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/OutputTracker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/OutputTracker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/OutputTracker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/ThreadGroupOutputTracker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/ThreadGroupOutputTracker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/ThreadGroupOutputTracker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/ThreadGroupOutputTracker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/OutputStreamStorage.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/OutputStreamStorage.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/storage/OutputStreamStorage.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/OutputStreamStorage.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/Storage.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/Storage.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/storage/Storage.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/Storage.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamStub.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/OutputStreamStub.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamStub.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/OutputStreamStub.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/Stub.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/Stub.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/GATKSAMIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/GATKSAMIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/GATKSAMIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/GATKSAMIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/GenomeLocusIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/GenomeLocusIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/GenomeLocusIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/GenomeLocusIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/IterableIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/IterableIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/IterableIterator.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/IterableIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/MalformedBAMErrorReformatingIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/MalformedBAMErrorReformatingIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/MalformedBAMErrorReformatingIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/MalformedBAMErrorReformatingIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/NullSAMIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/NullSAMIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/NullSAMIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/NullSAMIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/PeekingIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/PeekingIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/PeekingIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/PeekingIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/PositionTrackingIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/PositionTrackingIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/PositionTrackingIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/PositionTrackingIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/PushbackIterator.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/PushbackIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/PushbackIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/PushbackIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/StingSAMIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/StingSAMIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/StingSAMIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/StingSAMIterator.java 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReportException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/phonehome/GATKRunReportException.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReportException.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/phonehome/GATKRunReportException.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordListImpl.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/RODRecordListImpl.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordListImpl.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/RODRecordListImpl.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java new file mode 100644 index 000000000..e194a9c43 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -0,0 +1,497 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.refdata; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.util.*; + +/** + * This class represents the Reference Metadata available at a particular site in the genome. It can be + * used to conveniently lookup the RMDs at this site, as well just getting a list of all of the RMDs + * + * The standard interaction model is: + * + * Traversal system arrives at a site, which has a bunch of RMDs covering it + * Traversal passes creates a tracker and passes it to the walker + * walker calls get(rodBinding) to obtain the RMDs values at this site for the track + * associated with rodBinding. + * + * Note that this is an immutable class. 
Once created the underlying data structures + * cannot be modified + * + * User: mdepristo + * Date: Apr 3, 2009 + * Time: 3:05:23 PM + */ +public class RefMetaDataTracker { + // TODO: this should be a list, not a bindings, actually + private final static RODRecordList EMPTY_ROD_RECORD_LIST = new RODRecordListImpl("EMPTY"); + + final Map bindings; + final protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class); + public final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker(); + + // ------------------------------------------------------------------------------------------ + // + // + // Special ENGINE interaction functions + // + // + // ------------------------------------------------------------------------------------------ + + /** + * Create an tracker with no bindings + */ + public RefMetaDataTracker() { + bindings = Collections.emptyMap(); + } + + public RefMetaDataTracker(final Collection allBindings) { + // set up the bindings + if ( allBindings.isEmpty() ) + bindings = Collections.emptyMap(); + else { + final Map tmap = new HashMap(allBindings.size()); + for ( RODRecordList rod : allBindings ) { + if ( rod != null && ! rod.isEmpty() ) + tmap.put(canonicalName(rod.getName()), rod); + } + + // ensure that no one modifies the bindings itself + bindings = Collections.unmodifiableMap(tmap); + } + } + + // ------------------------------------------------------------------------------------------ + // + // + // Generic accessors + // + // + // ------------------------------------------------------------------------------------------ + + /** + * Gets all of the Tribble features spanning this locus, returning them as a list of specific + * type T extending Feature. This function looks across all tracks to find the Features, so + * if you have two tracks A and B each containing 1 Feature, then getValues will return + * a list containing both features. 
+ * + * Note that this function assumes that all of the bound features are instances of or + * subclasses of T. A ClassCastException will occur if this isn't the case. If you want + * to get all Features without any danger of such an exception use the root Tribble + * interface Feature. + * + * @param type The type of the underlying objects bound here + * @param as above + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. + */ + @Requires({"type != null"}) + @Ensures("result != null") + public List getValues(final Class type) { + return addValues(bindings.keySet(), type, new ArrayList(), null, false, false); + } + + /** + * Provides the same functionality as @link #getValues(Class) but will only include + * Features that start as the GenomeLoc provide onlyAtThisLoc. + * + * @param type The type of the underlying objects bound here + * @param onlyAtThisLoc + * @param as above + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. + */ + @Requires({"type != null", "onlyAtThisLoc != null"}) + @Ensures("result != null") + public List getValues(final Class type, final GenomeLoc onlyAtThisLoc) { + return addValues(bindings.keySet(), type, new ArrayList(), onlyAtThisLoc, true, false); + } + + /** + * Uses the same logic as @link #getValues(Class) but arbitrary select one of the resulting + * elements of the list to return. That is, if there would be two elements in the result of + * @link #getValues(Class), one of these two is selected, and which one it will be isn't + * specified. Consequently, this method is only really safe if (1) you absolutely know + * that only one binding will meet the constraints of @link #getValues(Class) or (2) + * you truly don't care which of the multiple bindings available you are going to examine. 
+ * + * If there are no bindings here, getFirstValue() return null + * + * @param type The type of the underlying objects bound here + * @param as above + * @return A random single element the RODs bound here, or null if none are bound. + */ + @Requires({"type != null"}) + public T getFirstValue(final Class type) { + return safeGetFirst(getValues(type)); + } + + /** + * Uses the same logic as @link #getValue(Class,GenomeLoc) to determine the list + * of eligible Features and @link #getFirstValue(Class) to select a single + * element from the interval list. + * + * @param type The type of the underlying objects bound here + * @param as above + * @param onlyAtThisLoc only Features starting at this site are considered + * @return A random single element the RODs bound here starting at onlyAtThisLoc, or null if none are bound. + */ + @Requires({"type != null", "onlyAtThisLoc != null"}) + public T getFirstValue(final Class type, final GenomeLoc onlyAtThisLoc) { + return safeGetFirst(getValues(type, onlyAtThisLoc)); + } + + /** + * Same logic as @link #getFirstValue(RodBinding, boolean) but prioritizes records from prioritizeThisLoc if available + * + * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched + * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features + * @param prioritizeThisLoc only Features starting at this site are considered + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. 
+ */ + @Requires({"rodBindings != null", "prioritizeThisLoc != null"}) + @Ensures("result != null") + public List getPrioritizedValue(final Collection> rodBindings, final GenomeLoc prioritizeThisLoc) { + final List results = new ArrayList<>(); + + for ( final RodBinding rodBinding : rodBindings ) { + + // if there's a value at the prioritized location, take it + T value = getFirstValue(rodBinding, prioritizeThisLoc); + + // otherwise, grab any one + if ( value == null ) + value = getFirstValue(rodBinding); + + // add if not null + if ( value != null ) + results.add(value); + } + + return results; + } + + /** + * Gets all of the Tribble features bound to RodBinding spanning this locus, returning them as + * a list of specific type T extending Feature. + * + * Note that this function assumes that all of the bound features are instances of or + * subclasses of T. A ClassCastException will occur if this isn't the case. + * + * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched + * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. + */ + @Requires({"rodBinding != null"}) + @Ensures("result != null") + public List getValues(final RodBinding rodBinding) { + return addValues(rodBinding.getName(), rodBinding.getType(), new ArrayList(1), getTrackDataByName(rodBinding), null, false, false); + } + + /** + * Gets all of the Tribble features bound to any RodBinding in rodBindings, + * spanning this locus, returning them as a list of specific type T extending Feature. + * + * Note that this function assumes that all of the bound features are instances of or + * subclasses of T. A ClassCastException will occur if this isn't the case. 
+ * + * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched + * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. + */ + @Requires({"rodBindings != null"}) + @Ensures("result != null") + public List getValues(final Collection> rodBindings) { + List results = new ArrayList(1); + for ( RodBinding rodBinding : rodBindings ) + results.addAll(getValues(rodBinding)); + return results; + } + + /** + * The same logic as @link #getValues(RodBinding) but enforces that each Feature start at onlyAtThisLoc + * + * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched + * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features + * @param onlyAtThisLoc only Features starting at this site are considered + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. + */ + @Requires({"rodBinding != null", "onlyAtThisLoc != null"}) + @Ensures("result != null") + public List getValues(final RodBinding rodBinding, final GenomeLoc onlyAtThisLoc) { + return addValues(rodBinding.getName(), rodBinding.getType(), new ArrayList(1), getTrackDataByName(rodBinding), onlyAtThisLoc, true, false); + } + + /** + * The same logic as @link #getValues(List) but enforces that each Feature start at onlyAtThisLoc + * + * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched + * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features + * @param onlyAtThisLoc only Features starting at this site are considered + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. 
+ */ + @Requires({"rodBindings != null", "onlyAtThisLoc != null"}) + @Ensures("result != null") + public List getValues(final Collection> rodBindings, final GenomeLoc onlyAtThisLoc) { + List results = new ArrayList(1); + for ( RodBinding rodBinding : rodBindings ) + results.addAll(getValues(rodBinding, onlyAtThisLoc)); + return results; + } + + /** + * Uses the same logic as @getValues(RodBinding) to determine the list + * of eligible Features and select a single element from the resulting set + * of eligible features. + * + * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched + * @param as above + * @return A random single element the eligible Features found, or null if none are bound. + */ + @Requires({"rodBinding != null"}) + public T getFirstValue(final RodBinding rodBinding) { + return safeGetFirst(addValues(rodBinding.getName(), rodBinding.getType(), null, getTrackDataByName(rodBinding), null, false, true)); + } + + /** + * Uses the same logic as @getValues(RodBinding, GenomeLoc) to determine the list + * of eligible Features and select a single element from the resulting set + * of eligible features. + * + * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched + * @param as above + * @param onlyAtThisLoc only Features starting at this site are considered + * @return A random single element the eligible Features found, or null if none are bound. + */ + @Requires({"rodBinding != null", "onlyAtThisLoc != null"}) + public T getFirstValue(final RodBinding rodBinding, final GenomeLoc onlyAtThisLoc) { + return safeGetFirst(addValues(rodBinding.getName(), rodBinding.getType(), null, getTrackDataByName(rodBinding), onlyAtThisLoc, true, true)); + } + + /** + * Uses the same logic as @getValues(List) to determine the list + * of eligible Features and select a single element from the resulting set + * of eligible features. 
+ * + * @param rodBindings Only Features coming from the tracks associated with these rodBindings are fetched + * @param as above + * @return A random single element the eligible Features found, or null if none are bound. + */ + @Requires({"rodBindings != null"}) + public T getFirstValue(final Collection> rodBindings) { + for ( RodBinding rodBinding : rodBindings ) { + T val = getFirstValue(rodBinding); + if ( val != null ) + return val; + } + return null; + } + + /** + * Uses the same logic as @getValues(RodBinding,GenomeLoc) to determine the list + * of eligible Features and select a single element from the resulting set + * of eligible features. + * + * @param rodBindings Only Features coming from the tracks associated with these rodBindings are fetched + * @param as above + * @param onlyAtThisLoc only Features starting at this site are considered + * @return A random single element the eligible Features found, or null if none are bound. + */ + @Requires({"rodBindings != null", "onlyAtThisLoc != null"}) + public T getFirstValue(final Collection> rodBindings, final GenomeLoc onlyAtThisLoc) { + for ( RodBinding rodBinding : rodBindings ) { + T val = getFirstValue(rodBinding, onlyAtThisLoc); + if ( val != null ) + return val; + } + return null; + } + + /** + * Is there a binding at this site to a ROD/track with the specified name? + * + * @param rodBinding the rod binding we want to know about + * @return true if any Features are bound in this tracker to rodBinding + */ + @Requires({"rodBinding != null"}) + public boolean hasValues(final RodBinding rodBinding) { + return bindings.containsKey(canonicalName(rodBinding.getName())); + } + + /** + * Get all of the RMD tracks at the current site. Each track is returned as a single compound + * object (RODRecordList) that may contain multiple RMD records associated with the current site. 
+ * + * @return List of all tracks + */ + public List getBoundRodTracks() { + return new ArrayList(bindings.values()); + } + + /** + * The number of tracks with at least one value bound here + * @return the number of tracks with at least one bound Feature + */ + public int getNTracksWithBoundFeatures() { + return bindings.size(); + } + + // ------------------------------------------------------------------------------------------ + // Protected accessors using strings for unit testing + // ------------------------------------------------------------------------------------------ + + protected boolean hasValues(final String name) { + return bindings.containsKey(canonicalName(name)); + } + + protected List getValues(final Class type, final String name) { + return addValues(name, type, new ArrayList(), getTrackDataByName(name), null, false, false); + } + + protected List getValues(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { + return addValues(name, type, new ArrayList(), getTrackDataByName(name), onlyAtThisLoc, true, false); + } + + protected T getFirstValue(final Class type, final String name) { + return safeGetFirst(getValues(type, name)); + } + + protected T getFirstValue(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { + return safeGetFirst(getValues(type, name, onlyAtThisLoc)); + } + + // ------------------------------------------------------------------------------------------ + // + // + // Private utility functions + // + // + // ------------------------------------------------------------------------------------------ + + /** + * Helper function for getFirst() operations that takes a list of and + * returns the first element, or null if no such element exists. + * + * @param l + * @param + * @return + */ + @Requires({"l != null"}) + private T safeGetFirst(final List l) { + return l.isEmpty() ? 
null : l.get(0); + } + + private List addValues(final Collection names, + final Class type, + List values, + final GenomeLoc curLocation, + final boolean requireStartHere, + final boolean takeFirstOnly ) { + for ( String name : names ) { + RODRecordList rodList = getTrackDataByName(name); // require that the name is an exact match + values = addValues(name, type, values, rodList, curLocation, requireStartHere, takeFirstOnly ); + if ( takeFirstOnly && ! values.isEmpty() ) + break; + } + + return values; + } + + + + private List addValues(final String name, + final Class type, + List values, + final RODRecordList rodList, + final GenomeLoc curLocation, + final boolean requireStartHere, + final boolean takeFirstOnly ) { + for ( GATKFeature rec : rodList ) { + if ( ! requireStartHere || rec.getLocation().getStart() == curLocation.getStart() ) { // ok, we are going to keep this thing + Object obj = rec.getUnderlyingObject(); + if (!(type.isAssignableFrom(obj.getClass()))) + throw new UserException.CommandLineException("Unable to cast track named " + name + " to type of " + type.toString() + + " it's of type " + obj.getClass()); + + T objT = (T)obj; + if ( takeFirstOnly ) { + if ( values == null ) + values = Arrays.asList(objT); + else + values.add(objT); + + break; + } else { + if ( values == null ) + values = new ArrayList(); + values.add(objT); + } + } + } + + return values == null ? Collections.emptyList() : values; + } + + /** + * Finds the reference metadata track named 'name' and returns all ROD records from that track associated + * with the current site as a RODRecordList List object. If no data track with specified name is available, + * returns defaultValue wrapped as RODRecordList object. 
NOTE: if defaultValue is null, it will be wrapped up + * with track name set to 'name' and location set to null; otherwise the wrapper object will have name and + * location set to defaultValue.getID() and defaultValue.getLocation(), respectively (use caution, + * defaultValue.getLocation() may be not equal to what RODRecordList's location would be expected to be otherwise: + * for instance, on locus traversal, location is usually expected to be a single base we are currently looking at, + * regardless of the presence of "extended" RODs overlapping with that location). + * @param name track name + * @return track data for the given rod + */ + private RODRecordList getTrackDataByName(final String name) { + final String luName = canonicalName(name); + RODRecordList l = bindings.get(luName); + return l == null ? EMPTY_ROD_RECORD_LIST : l; + } + + private RODRecordList getTrackDataByName(final RodBinding binding) { + return getTrackDataByName(binding.getName()); + } + + /** + * Returns the canonical name of the rod name (lowercases it) + * @param name the name of the rod + * @return canonical name of the rod + */ + private String canonicalName(final String name) { + // todo -- remove me after switch to RodBinding syntax + return name.toLowerCase(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceDependentFeatureCodec.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/ReferenceDependentFeatureCodec.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceDependentFeatureCodec.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/ReferenceDependentFeatureCodec.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/SeekableRODIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/SeekableRODIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/SeekableRODIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/SeekableRODIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java new file mode 100644 index 000000000..fbbaa6636 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java @@ -0,0 +1,107 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.refdata.tracks; + +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import org.apache.log4j.Logger; +import org.broad.tribble.index.Index; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.utils.SequenceDictionaryUtils; + +import java.util.LinkedHashSet; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +/** + * Utilities for working with Sequence Dictionaries embedded in tribble indices + * + * @author Your Name + * @since Date created + */ +public class IndexDictionaryUtils { + private final static Logger logger = Logger.getLogger(IndexDictionaryUtils.class); + + // a constant we use for marking sequence dictionary entries in the Tribble index property list + public static final String SequenceDictionaryPropertyPredicate = "DICT:"; + + /** + * get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index + * @param index the index file to use + * @return a SAMSequenceDictionary if available, null if unavailable + */ + public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) { + SAMSequenceDictionary dict = new SAMSequenceDictionary(); + for (Map.Entry entry : index.getProperties().entrySet()) { + if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) + dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()), + Integer.valueOf(entry.getValue()))); + } + return dict; + } + + /** + * create the sequence dictionary with the contig list; a backup approach + * @param index the index file to use + * @param dict the sequence dictionary to add contigs to + * @return the filled-in sequence dictionary + */ + static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) { + LinkedHashSet 
seqNames = index.getSequenceNames(); + if (seqNames == null) { + return dict; + } + for (String name : seqNames) { + SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); + dict.addSequence(seq); + } + return dict; + } + + public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) { + for ( SAMSequenceRecord seq : dict.getSequences() ) { + final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName(); + final String length = String.valueOf(seq.getSequenceLength()); + index.addProperty(contig,length); + } + } + + public static void validateTrackSequenceDictionary(final String trackName, + final SAMSequenceDictionary trackDict, + final SAMSequenceDictionary referenceDict, + final ValidationExclusion.TYPE validationExclusionType ) { + // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation + if (trackDict == null || trackDict.size() == 0) + logger.warn("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); + else { + Set trackSequences = new TreeSet(); + for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) + trackSequences.add(dictionaryEntry.getSequenceName()); + SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict, false, null); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java new file mode 100644 index 000000000..df5cf91ca --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java @@ -0,0 +1,418 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.refdata.tracks; + +import net.sf.samtools.SAMSequenceDictionary; +import org.apache.log4j.Logger; +import org.broad.tribble.AbstractFeatureReader; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.Tribble; +import org.broad.tribble.TribbleException; +import org.broad.tribble.index.Index; +import org.broad.tribble.index.IndexFactory; +import org.broad.tribble.util.LittleEndianOutputStream; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; +import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.file.FSLockWithShared; +import org.broadinstitute.sting.utils.instrumentation.Sizeof; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; + + +/** + * + * @author aaron + * ` + * Class RMDTrackBuilder + * + * This class keeps track of the available codecs, and knows how to put together a track of + * that gets iterators from the FeatureReader using Tribble. + * + */ +public class RMDTrackBuilder { // extends PluginManager { + /** + * our log, which we use to capture anything from this class + */ + private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); + + // private sequence dictionary we use to set our tracks with + private final SAMSequenceDictionary dict; + + /** + * Private genome loc parser to use when building out new locs. + */ + private final GenomeLocParser genomeLocParser; + + /** + * Validation exclusions, for validating the sequence dictionary. 
+ */ + private ValidationExclusion.TYPE validationExclusionType; + + private final FeatureManager featureManager; + + // If true, do not attempt to create index files if they don't exist or are outdated, and don't + // make any file lock acquisition calls on the index files. + private final boolean disableAutoIndexCreation; + + /** + * Construct an RMDTrackerBuilder, allowing the user to define tracks to build after-the-fact. This is generally + * used when walkers want to directly manage the ROD system for whatever reason. Before using this constructor, + * please talk through your approach with the SE team. + * @param dict Sequence dictionary to use. + * @param genomeLocParser Location parser to use. + * @param validationExclusionType Types of validations to exclude, for sequence dictionary verification. + * @param disableAutoIndexCreation Do not auto-create index files, and do not use file locking when accessing index files. + *                                 UNSAFE in general (because it causes us not to lock index files before reading them) -- + *                                 suitable only for test suite use. + */ + public RMDTrackBuilder(final SAMSequenceDictionary dict, + final GenomeLocParser genomeLocParser, + final ValidationExclusion.TYPE validationExclusionType, + final boolean disableAutoIndexCreation) { + this.dict = dict; + this.validationExclusionType = validationExclusionType; + this.genomeLocParser = genomeLocParser; + this.featureManager = new FeatureManager(GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType)); + this.disableAutoIndexCreation = disableAutoIndexCreation; + } + + /** + * Return the feature manager this RMDTrackBuilder is using to create tribble tracks + * + * @return + */ + public FeatureManager getFeatureManager() { + return featureManager; + } + + /** + * create a RMDTrack of the specified type + * + * @param fileDescriptor a description of the type of track to build. 
+ * + * @return an instance of the track + */ + public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) { + String name = fileDescriptor.getName(); + File inputFile = new File(fileDescriptor.getFile()); + + FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByTriplet(fileDescriptor); + if (descriptor == null) + throw new UserException.BadArgumentValue("-B",fileDescriptor.getType()); + + // return a feature reader track + Pair pair; + if (inputFile.getAbsolutePath().endsWith(".gz")) + pair = createTabixIndexedFeatureSource(descriptor, name, inputFile); + else + pair = getFeatureSource(descriptor, name, inputFile, fileDescriptor.getStorageType()); + if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file"); + return new RMDTrack(descriptor.getCodecClass(), name, inputFile, pair.first, pair.second, genomeLocParser, createCodec(descriptor, name)); + } + + /** + * Convenience method simplifying track creation. Assume unnamed track based on a file rather than a stream. + * @param codecClass Type of Tribble codec class to build. + * @param inputFile Input file type to use. + * @return An RMDTrack, suitable for accessing reference metadata. + */ + public RMDTrack createInstanceOfTrack(Class codecClass, File inputFile) { + final FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByCodec(codecClass); + + if (descriptor == null) + throw new ReviewedStingException("Unable to find type name for codec class " + codecClass.getName()); + + return createInstanceOfTrack(new RMDTriplet("anonymous",descriptor.getName(),inputFile.getAbsolutePath(),RMDStorageType.FILE,new Tags())); + } + + /** + * create a feature reader, without assuming there exists an index. This code assumes the feature + * reader of the appropriate type will figure out what the right index type is, and determine if it + * exists. 
+ * + * @param descriptor the FeatureDescriptor describing the FeatureCodec we want to create + * @param name the name of the track + * @param inputFile the file to load + * @return a feature reader implementation + */ + private Pair createTabixIndexedFeatureSource(FeatureManager.FeatureDescriptor descriptor, String name, File inputFile) { + // we might not know the index type, try loading with the default reader constructor + logger.debug("Attempting to load " + inputFile + " as a tabix indexed file without validating it"); + try { + final File indexFile = null;//new File(inputFile.getAbsoluteFile() + TabixUtils.STANDARD_INDEX_EXTENSION); + final SAMSequenceDictionary dict = null; //TabixUtils.getSequenceDictionary(indexFile); + return new Pair<>(AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name)), dict); + } catch (TribbleException e) { + throw new UserException(e.getMessage(), e); + } + } + + /** + * add a name to the codec, if it takes one + * @param descriptor the class to create a codec for + * @param name the name to assign this codec + * @return the feature codec itself + */ + private FeatureCodec createCodec(FeatureManager.FeatureDescriptor descriptor, String name) { + return featureManager.createCodec(descriptor, name, genomeLocParser); + } + + /** + * create a feature source object given: + * @param descriptor the FeatureDescriptor describing the FeatureCodec we want to create + * @param name the name of the codec + * @param inputFile the tribble file to parse + * @param storageType How the RMD is streamed into the input file. 
+ * @return the input file as a FeatureReader + */ + private Pair getFeatureSource(FeatureManager.FeatureDescriptor descriptor, + String name, + File inputFile, + RMDStorageType storageType) { + // Feature source and sequence dictionary to use as the ultimate reference + AbstractFeatureReader featureSource = null; + SAMSequenceDictionary sequenceDictionary = null; + + // Detect whether or not this source should be indexed. + boolean canBeIndexed = (storageType == RMDStorageType.FILE); + + if(canBeIndexed) { + try { + Index index = loadIndex(inputFile, createCodec(descriptor, name)); + try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); } + catch (ReviewedStingException e) { } + + sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); + + // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match + if (sequenceDictionary.size() == 0 && dict != null) { + validateAndUpdateIndexSequenceDictionary(inputFile, index, dict); + + if ( ! 
disableAutoIndexCreation ) { + File indexFile = Tribble.indexFile(inputFile); + try { // re-write the index + writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); + } catch (IOException e) { + logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not affect your run of the GATK"); + } + } + + sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); + } + + featureSource = AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name), index); + } + catch (TribbleException e) { + throw new UserException(e.getMessage()); + } + catch (IOException e) { + throw new UserException("I/O error loading or writing tribble index file for " + inputFile.getAbsolutePath(), e); + } + } + else { + featureSource = AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name), false); + } + + return new Pair(featureSource,sequenceDictionary); + } + + /** + * create an index for the input file + * @param inputFile the input file + * @param codec the codec to use + * @return a linear index for the specified type + * @throws IOException if we cannot write the index file + */ + public synchronized Index loadIndex( final File inputFile, final FeatureCodec codec) throws IOException { + final File indexFile = Tribble.indexFile(inputFile); + final FSLockWithShared lock = new FSLockWithShared(indexFile); + Index idx = null; + + // If the index file exists and is readable, attempt to load it from disk. We'll get null back + // if a problem was discovered with the index file when it was inspected, and we'll get an + // in-memory index back in the case where the index file could not be locked. + if (indexFile.canRead()) { + idx = disableAutoIndexCreation ? 
loadFromDisk(inputFile, indexFile) // load without locking if we're in disableAutoIndexCreation mode + : attemptToLockAndLoadIndexFromDisk(inputFile, codec, indexFile, lock); + } + + // If we have an index, it means we either loaded it from disk without issue or we created an in-memory + // index due to not being able to acquire a lock. + if (idx != null) return idx; + + // We couldn't read the file, or we discovered a problem with the index file, so continue on to making a new index + idx = createIndexInMemory(inputFile, codec); + if ( ! disableAutoIndexCreation ) { + writeIndexToDisk(idx, indexFile, lock); + } + return idx; + } + + /** + * Attempt to acquire a shared lock and then load the index from disk. Returns an in-memory index if + * a lock could not be obtained. Returns null if a problem was discovered with the index file when it + * was examined (eg., it was out-of-date). + * + * @param inputFile the input file + * @param codec the codec to read from + * @param indexFile the index file itself + * @param lock the lock file + * @return an index, or null if we couldn't load one + * @throws IOException if we fail for FS issues + */ + protected Index attemptToLockAndLoadIndexFromDisk( final File inputFile, final FeatureCodec codec, final File indexFile, final FSLockWithShared lock ) throws IOException { + boolean locked = false; + Index idx = null; + + try { + locked = lock.sharedLock(); + + if ( ! 
locked ) { // can't lock file + logger.info(String.format("Could not acquire a shared lock on index file %s, falling back to using an in-memory index for this GATK run.", + indexFile.getAbsolutePath())); + idx = createIndexInMemory(inputFile, codec); + } + else { + idx = loadFromDisk(inputFile, indexFile); + } + } finally { + if (locked) lock.unlock(); + } + return idx; + } + + /** + * load the index from disk, checking for out of date indexes and old versions (both of which are deleted) + * @param inputFile the input file + * @param indexFile the input file, plus the index extension + * @return an Index, or null if we're unable to load + */ + protected Index loadFromDisk( final File inputFile, final File indexFile ) { + logger.debug("Loading Tribble index from disk for file " + inputFile); + Index index = IndexFactory.loadIndex(indexFile.getAbsolutePath()); + + // check if the file is up-to date (filestamp and version check) + if (index.isCurrentVersion() && indexFile.lastModified() >= inputFile.lastModified()) + return index; + else if (indexFile.lastModified() < inputFile.lastModified()) + logger.warn("Index file " + indexFile + " is out of date (index older than input file), " + + (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file")); + else // we've loaded an old version of the index, we want to remove it <-- currently not used, but may re-enable + logger.warn("Index file " + indexFile + " is out of date (old version), " + + (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file")); + + if ( ! 
disableAutoIndexCreation ) { + boolean deleted = indexFile.delete(); + if (!deleted) logger.warn("Index file " + indexFile + " is out of date, but could not be removed; it will not be trusted (we'll try to rebuild an in-memory copy)"); + } + + return null; + } + + + /** + * attempt to write the index to disk + * @param index the index to write to disk + * @param indexFile the index file location + * @param lock the locking object + * @throws IOException when unable to create the new index + */ + private void writeIndexToDisk( final Index index, final File indexFile, final FSLockWithShared lock ) throws IOException { + if ( disableAutoIndexCreation ) { + return; + } + + boolean locked = false; + + try { + locked = lock.exclusiveLock(); + + if (locked) { + logger.info("Writing Tribble index to disk for file " + indexFile); + LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); + index.write(stream); + stream.close(); + } + else // we can't write it to disk, just store it in memory, tell them this + logger.warn("Unable to write to " + indexFile + " for the index file, creating index in memory only"); + + try { logger.info(String.format(" Index for %s has size in bytes %d", indexFile, Sizeof.getObjectGraphSize(index))); } + catch ( ReviewedStingException e) { } + } + finally { + if (locked) lock.unlock(); + } + + } + + /** + * create the index in memory, given the input file and feature codec + * @param inputFile the input file + * @param codec the codec + * @return a LinearIndex, given the file location + * @throws IOException when unable to create the index in memory + */ + protected Index createIndexInMemory(File inputFile, FeatureCodec codec) { + // this can take a while, let them know what we're doing + logger.debug("Creating Tribble index in memory for file " + inputFile); + Index idx = IndexFactory.createDynamicIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); + 
validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict); + return idx; + } + + /** + * set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible. + * (that each contig in the index is in the sequence dictionary). + * @param inputFile for proper error message formatting. + * @param dict the sequence dictionary + * @param index the index file + */ + public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) { + if (dict == null) throw new ReviewedStingException("BUG: dict cannot be null"); + + // check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set + final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); + validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict); + + // actually update the dictionary in the index + IndexDictionaryUtils.setIndexSequenceDictionary(index, dict); + } + + public void validateTrackSequenceDictionary(final String trackName, + final SAMSequenceDictionary trackDict, + final SAMSequenceDictionary referenceDict ) { + IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/LocationAwareSeekableRODIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/LocationAwareSeekableRODIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/utils/LocationAwareSeekableRODIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/LocationAwareSeekableRODIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDTriplet.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/RMDTriplet.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDTriplet.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/RMDTriplet.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RODRecordList.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/RODRecordList.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RODRecordList.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/RODRecordList.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReport.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReport.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportColumn.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportColumn.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumnFormat.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportColumnFormat.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumnFormat.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportColumnFormat.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportDataType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportDataType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/report/GATKReportDataType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportDataType.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportTable.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportTable.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportVersion.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportVersion.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/Affection.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/Affection.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/Gender.java similarity index 100% rename 
from public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/Gender.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/PedReader.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/PedReader.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/Sample.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/Sample.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/SampleDB.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/SampleDB.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java 
rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/Trio.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/Trio.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Allows.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Allows.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Allows.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Allows.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Attribution.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Attribution.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Attribution.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Attribution.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/BAQMode.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/BAQMode.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/BAQMode.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/BAQMode.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/By.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/By.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/By.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/By.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/DataSource.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/DataSource.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Downsample.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Downsample.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/LocusWalker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/LocusWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Multiplex.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Multiplex.java similarity 
index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Multiplex.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Multiplex.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Multiplexer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Multiplexer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Multiplexer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Multiplexer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionBy.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/PartitionBy.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionBy.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/PartitionBy.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/PartitionType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/PartitionType.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/RMD.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/RMD.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/RMD.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/RMD.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadFilters.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ReadFilters.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/ReadFilters.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ReadFilters.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadPairWalker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ReadPairWalker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/ReadPairWalker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ReadPairWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ReadWalker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ReadWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/RefWalker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/RefWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Reference.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Reference.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Reference.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Reference.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/RemoveProgramRecords.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/RemoveProgramRecords.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/RemoveProgramRecords.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/RemoveProgramRecords.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Requires.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Requires.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Requires.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Requires.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/RodWalker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/RodWalker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/RodWalker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/RodWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/TreeReducible.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/TreeReducible.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Walker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Walker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/WalkerName.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/WalkerName.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/WalkerName.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/WalkerName.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Window.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Window.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Window.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Window.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCountConstants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCountConstants.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCountConstants.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCountConstants.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtil.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtil.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtil.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtil.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotator.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationType.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatible.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatible.java similarity 
index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatible.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatible.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ExperimentalAnnotation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ExperimentalAnnotation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ExperimentalAnnotation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ExperimentalAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/RodRequiringAnnotation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/RodRequiringAnnotation.java 
similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/RodRequiringAnnotation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/RodRequiringAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/StandardAnnotation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/StandardAnnotation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/StandardAnnotation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/StandardAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/WorkInProgressAnnotation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/WorkInProgressAnnotation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/WorkInProgressAnnotation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/WorkInProgressAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphased.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphased.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphased.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphased.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java new file mode 100644 index 000000000..2e38f5daa --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java @@ -0,0 +1,396 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, 
sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.coverage; + +import org.broadinstitute.sting.commandline.Advanced; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.By; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.pileup.PileupElement; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; + + +/** + * Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome + *

+ *

+ * A very common question about a NGS set of reads is what areas of the genome are considered callable. The system + * considers the coverage at each locus and emits either a per base state or a summary interval BED file that + * partitions the genomic intervals into the following callable states: + *

+ *
REF_N
+ *
the reference base was an N, which is not considered callable the GATK
+ *
PASS
+ *
the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
+ *
NO_COVERAGE
+ *
absolutely no reads were seen at this locus, regardless of the filtering parameters
+ *
LOW_COVERAGE
+ *
there were less than min. depth bases at the locus, after applying filters
+ *
EXCESSIVE_COVERAGE
+ *
more than -maxDepth read at the locus, indicating some sort of mapping problem
+ *
POOR_MAPPING_QUALITY
+ *
more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
+ *
+ *

+ *

+ *

Input

+ *

+ * A BAM file containing exactly one sample. + *

+ *

+ *

Output

+ *

+ *

    + *
  • -o: a OutputFormatted (recommended BED) file with the callable status covering each base
  • + *
  • -summary: a table of callable status x count of all examined bases
  • + *
+ *

+ *

+ *

Examples

+ *
+ *  java -jar GenomeAnalysisTK.jar \
+ *     -T CallableLoci \
+ *     -I my.bam \
+ *     -summary my.summary \
+ *     -o my.bed
+ * 
+ *

+ * would produce a BED file (my.bed) that looks like: + *

+ *

+ *     20 10000000 10000864 PASS
+ *     20 10000865 10000985 POOR_MAPPING_QUALITY
+ *     20 10000986 10001138 PASS
+ *     20 10001139 10001254 POOR_MAPPING_QUALITY
+ *     20 10001255 10012255 PASS
+ *     20 10012256 10012259 POOR_MAPPING_QUALITY
+ *     20 10012260 10012263 PASS
+ *     20 10012264 10012328 POOR_MAPPING_QUALITY
+ *     20 10012329 10012550 PASS
+ *     20 10012551 10012551 LOW_COVERAGE
+ *     20 10012552 10012554 PASS
+ *     20 10012555 10012557 LOW_COVERAGE
+ *     20 10012558 10012558 PASS
+ *     et cetera...
+ * 
+ * as well as a summary table that looks like: + *

+ *

+ *                        state nBases
+ *                        REF_N 0
+ *                     PASS 996046
+ *                  NO_COVERAGE 121
+ *                 LOW_COVERAGE 928
+ *           EXCESSIVE_COVERAGE 0
+ *         POOR_MAPPING_QUALITY 2906
+ * 
+ * + * @author Mark DePristo + * @since May 7, 2010 + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) +@By(DataSource.REFERENCE) +public class CallableLoci extends LocusWalker { + @Output + PrintStream out; + + /** + * Callable loci summary counts (see outputs) will be written to this file. + */ + @Output(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true) + File summaryFile; + + /** + * The gap between this value and mmq are reads that are not sufficiently well mapped for calling but + * aren't indicative of mapping problems. For example, if maxLowMAPQ = 1 and mmq = 20, then reads with + * MAPQ == 0 are poorly mapped, MAPQ >= 20 are considered as contributing to calling, where + * reads with MAPQ >= 1 and < 20 are not bad in and of themselves but aren't sufficiently good to contribute to + * calling. In effect this reads are invisible, driving the base to the NO_ or LOW_COVERAGE states + */ + @Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read.", required = false) + byte maxLowMAPQ = 1; + + /** + * Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the PASS + * state. + */ + @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false) + byte minMappingQuality = 10; + + /** + * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the PASS state + */ + @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false) + byte minBaseQuality = 20; + + /** + * If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this + * value and is less than maxDepth the site is considered PASS. 
+ */ + @Advanced + @Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false) + int minDepth = 4; + + /** + * If the QC+ depth exceeds this value the site is considered to have EXCESSIVE_DEPTH + */ + @Argument(fullName = "maxDepth", shortName = "maxDepth", doc = "Maximum read depth before a locus is considered poorly mapped", required = false) + int maxDepth = -1; + + /** + * We don't want to consider a site as POOR_MAPPING_QUALITY just because it has two reads, and one is MAPQ. We + * won't assign a site to the POOR_MAPPING_QUALITY state unless there are at least minDepthForLowMAPQ reads + * covering the site. + */ + @Advanced + @Argument(fullName = "minDepthForLowMAPQ", shortName = "mdflmq", doc = "Minimum read depth before a locus is considered a potential candidate for poorly mapped", required = false) + int minDepthLowMAPQ = 10; + + /** + * If the number of reads at this site is greater than minDepthForLowMAPQ and the fraction of reads with low mapping quality + * exceeds this fraction then the site has POOR_MAPPING_QUALITY. + */ + @Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "If the fraction of reads at a base with low mapping quality exceeds this value, the site may be poorly mapped", required = false) + double maxLowMAPQFraction = 0.1; + + /** + * The output of this walker will be written in this format. The recommended option is BED. + */ + @Advanced + @Argument(fullName = "format", shortName = "format", doc = "Output format", required = false) + OutputFormat outputFormat = OutputFormat.BED; + + public enum OutputFormat { + /** + * The output will be written as a BED file. There's a BED element for each + * continuous run of callable states (i.e., PASS, REF_N, etc). This is the recommended + * format + */ + BED, + + /** + * Emit chr start stop state quads for each base. Produces a potentially disasterously + * large amount of output. 
+ */ + STATE_PER_BASE + } + + public enum CalledState { + /** + * the reference base was an N, which is not considered callable the GATK + */ + REF_N, + /** + * the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE + */ + CALLABLE, + /** + * absolutely no reads were seen at this locus, regardless of the filtering parameters + */ + NO_COVERAGE, + /** + * there were less than min. depth bases at the locus, after applying filters + */ + LOW_COVERAGE, + /** + * more than -maxDepth read at the locus, indicating some sort of mapping problem + */ + EXCESSIVE_COVERAGE, + /** + * more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads + */ + POOR_MAPPING_QUALITY + } + + //////////////////////////////////////////////////////////////////////////////////// + // STANDARD WALKER METHODS + //////////////////////////////////////////////////////////////////////////////////// + + @Override + public boolean includeReadsWithDeletionAtLoci() { + return true; + } + + @Override + public void initialize() { + if (getSampleDB().getSamples().size() != 1) { + throw new UserException.BadArgumentValue("-I", "CallableLoci only works for a single sample, but multiple samples were found in the provided BAM files: " + getSampleDB().getSamples()); + } + + try { + PrintStream summaryOut = new PrintStream(summaryFile); + summaryOut.close(); + } catch (FileNotFoundException e) { + throw new UserException.CouldNotCreateOutputFile(summaryFile, e); + } + } + + protected static class Integrator { + final long counts[] = new long[CalledState.values().length]; + CallableBaseState state = null; + } + + protected static class CallableBaseState implements HasGenomeLocation { + final public GenomeLocParser genomeLocParser; + public GenomeLoc loc; + final public CalledState state; + + public CallableBaseState(GenomeLocParser genomeLocParser, GenomeLoc loc, CalledState state) { + this.genomeLocParser = 
genomeLocParser; + this.loc = loc; + this.state = state; + } + + public GenomeLoc getLocation() { + return loc; + } + + public CalledState getState() { + return state; + } + + // update routines + public boolean changingState(CalledState newState) { + return state != newState; + } + + /** + * Updating the location of this CalledBaseState by the new stop location + * + * @param newStop + */ + public void update(GenomeLoc newStop) { + loc = genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart(), newStop.getStop()); + } + + public String toString() { + return String.format("%s\t%d\t%d\t%s", loc.getContig(), loc.getStart()-1, loc.getStop(), state); + } + } + + @Override + public CallableBaseState map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + CalledState state; + + if ( BaseUtils.isNBase(ref.getBase())) { + state = CalledState.REF_N; + } else { + // count up the depths of all and QC+ bases + int rawDepth = 0, QCDepth = 0, lowMAPQDepth = 0; + for (PileupElement e : context.getBasePileup()) { + rawDepth++; + + if (e.getMappingQual() <= maxLowMAPQ) + lowMAPQDepth++; + + if (e.getMappingQual() >= minMappingQuality && (e.getQual() >= minBaseQuality || e.isDeletion())) { + QCDepth++; + } + } + + //System.out.printf("%s rawdepth = %d QCDepth = %d lowMAPQ = %d%n", context.getLocation(), rawDepth, QCDepth, lowMAPQDepth); + if (rawDepth == 0) { + state = CalledState.NO_COVERAGE; + } else if (rawDepth >= minDepthLowMAPQ && MathUtils.ratio(lowMAPQDepth, rawDepth) >= maxLowMAPQFraction) { + state = CalledState.POOR_MAPPING_QUALITY; + } else if (QCDepth < minDepth) { + state = CalledState.LOW_COVERAGE; + } else if (rawDepth >= maxDepth && maxDepth != -1) { + state = CalledState.EXCESSIVE_COVERAGE; + } else { + state = CalledState.CALLABLE; + } + } + + return new CallableBaseState(getToolkit().getGenomeLocParser(), context.getLocation(), state); + } + + @Override + public Integrator reduceInit() { + return new Integrator(); + } + + 
@Override + public Integrator reduce(CallableBaseState state, Integrator integrator) { + // update counts + integrator.counts[state.getState().ordinal()]++; + + if (outputFormat == OutputFormat.STATE_PER_BASE) { + out.println(state.toString()); + } + + // format is integrating + if (integrator.state == null) + integrator.state = state; + else if (state.getLocation().getStart() != integrator.state.getLocation().getStop() + 1 || + integrator.state.changingState(state.getState())) { + out.println(integrator.state.toString()); + integrator.state = state; + } else { + integrator.state.update(state.getLocation()); + } + + return integrator; + } + + + //////////////////////////////////////////////////////////////////////////////////// + // INTERVAL ON TRAVERSAL DONE + //////////////////////////////////////////////////////////////////////////////////// + + @Override + public void onTraversalDone(Integrator result) { + // print out the last state + if (result != null) { + if (outputFormat == OutputFormat.BED) // get the last interval + out.println(result.state.toString()); + + try { + PrintStream summaryOut = new PrintStream(summaryFile); + summaryOut.printf("%30s %s%n", "state", "nBases"); + for (CalledState state : CalledState.values()) { + summaryOut.printf("%30s %d%n", state, result.counts[state.ordinal()]); + } + summaryOut.close(); + } catch (FileNotFoundException e) { + throw new UserException.CouldNotCreateOutputFile(summaryFile, e); + } + } + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLoci.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLoci.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLoci.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLoci.java diff --git 
a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java new file mode 100644 index 000000000..0d61af305 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java @@ -0,0 +1,241 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.coverage; + +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fragments.FragmentCollection; +import org.broadinstitute.sting.utils.pileup.PileupElement; + +import java.util.*; + +/** + * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * + * @Author chartl + * @Date Mar 3, 2010 + */ +public class CoverageUtils { + + public enum CountPileupType { + /** + * Count all reads independently (even if from the same fragment). + */ + COUNT_READS, + /** + * Count all fragments (even if the reads that compose the fragment are not consistent at that base). + */ + COUNT_FRAGMENTS, + /** + * Count all fragments (but only if the reads that compose the fragment are consistent at that base). 
+ */ + COUNT_FRAGMENTS_REQUIRE_SAME_BASE + } + + /** + * Returns the counts of bases from reads with MAPQ > minMapQ and base quality > minBaseQ in the context + * as an array of ints, indexed by the index fields of BaseUtils + * + * @param context + * @param minMapQ + * @param minBaseQ + * @return + */ + public static int[] getBaseCounts(AlignmentContext context, int minMapQ, int minBaseQ) { + int[] counts = new int[6]; + + for (PileupElement e : context.getBasePileup()) { + if ( e.getMappingQual() >= minMapQ && ( e.getQual() >= minBaseQ || e.isDeletion() ) ) { + updateCounts(counts,e); + } + } + + return counts; + } + + public static String getTypeID( SAMReadGroupRecord r, DoCOutputType.Partition type ) { + if ( type == DoCOutputType.Partition.sample ) { + return r.getSample(); + } else if ( type == DoCOutputType.Partition.readgroup ) { + return String.format("%s_rg_%s",r.getSample(),r.getReadGroupId()); + } else if ( type == DoCOutputType.Partition.library ) { + return r.getLibrary(); + } else if ( type == DoCOutputType.Partition.center ) { + return r.getSequencingCenter(); + } else if ( type == DoCOutputType.Partition.platform ) { + return r.getPlatform(); + } else if ( type == DoCOutputType.Partition.sample_by_center ) { + return String.format("%s_cn_%s",r.getSample(),r.getSequencingCenter()); + } else if ( type == DoCOutputType.Partition.sample_by_platform) { + return String.format("%s_pl_%s",r.getSample(),r.getPlatform()); + } else if ( type == DoCOutputType.Partition.sample_by_platform_by_center ) { + return String.format("%s_pl_%s_cn_%s",r.getSample(),r.getPlatform(),r.getSequencingCenter()); + } else { + throw new ReviewedStingException("Invalid type ID sent to getTypeID. 
This is a BUG!"); + } + } + + public static Map> + getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType, Collection types) { + + Map> countsByIDByType = new HashMap>(); + Map countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ,countType); + for (DoCOutputType.Partition t : types ) { + // iterate through the read group counts and build the type associations + for ( Map.Entry readGroupCountEntry : countsByRG.entrySet() ) { + String typeID = getTypeID(readGroupCountEntry.getKey(),t); + + if ( ! countsByIDByType.keySet().contains(t) ) { + countsByIDByType.put(t,new HashMap()); + } + + if ( ! countsByIDByType.get(t).keySet().contains(typeID) ) { + countsByIDByType.get(t).put(typeID,readGroupCountEntry.getValue().clone()); + } else { + addCounts(countsByIDByType.get(t).get(typeID),readGroupCountEntry.getValue()); + } + } + } + + + return countsByIDByType; + } + + public static void addCounts(int[] updateMe, int[] leaveMeAlone ) { + for ( int index = 0; index < leaveMeAlone.length; index++ ) { + updateMe[index] += leaveMeAlone[index]; + } + } + + public static Map getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType) { + Map countsByRG = new HashMap(); + + List countPileup = new LinkedList(); + FragmentCollection fpile; + + switch (countType) { + + case COUNT_READS: + for (PileupElement e : context.getBasePileup()) + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + countPileup.add(e); + break; + + case COUNT_FRAGMENTS: // ignore base identities and put in FIRST base that passes filters: + fpile = context.getBasePileup().getStartSortedPileup().toFragments(); + + for (PileupElement e : fpile.getSingletonReads()) + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + countPileup.add(e); + + for (List overlappingPair : fpile.getOverlappingPairs()) { + // iterate over all 
elements in fragment: + for (PileupElement e : overlappingPair) { + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) { + countPileup.add(e); // add the first passing element per fragment + break; + } + } + } + break; + + case COUNT_FRAGMENTS_REQUIRE_SAME_BASE: + fpile = context.getBasePileup().getStartSortedPileup().toFragments(); + + for (PileupElement e : fpile.getSingletonReads()) + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + countPileup.add(e); + + for (List overlappingPair : fpile.getOverlappingPairs()) { + PileupElement firstElem = null; + PileupElement addElem = null; + + // iterate over all elements in fragment: + for (PileupElement e : overlappingPair) { + if (firstElem == null) + firstElem = e; + else if (e.getBase() != firstElem.getBase()) { + addElem = null; + break; + } + + // will add the first passing element per base-consistent fragment: + if (addElem == null && countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + addElem = e; + } + + if (addElem != null) + countPileup.add(addElem); + } + break; + + default: + throw new UserException("Must use valid CountPileupType"); + } + + for (PileupElement e : countPileup) { + SAMReadGroupRecord readGroup = getReadGroup(e.getRead()); + if (!countsByRG.keySet().contains(readGroup)) + countsByRG.put(readGroup, new int[6]); + + updateCounts(countsByRG.get(readGroup), e); + } + + return countsByRG; + } + + private static boolean countElement(PileupElement e, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) { + return (e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() )); + } + + private static void updateCounts(int[] counts, PileupElement e) { + if ( e.isDeletion() ) { + counts[BaseUtils.Base.D.ordinal()]++; + } else if ( BaseUtils.basesAreEqual(BaseUtils.Base.N.base, e.getBase()) ) { + counts[BaseUtils.Base.N.ordinal()]++; + } else { + try { + 
counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())]++; + } catch (ArrayIndexOutOfBoundsException exc) { + throw new ReviewedStingException("Expected a simple base, but actually received"+(char)e.getBase()); + } + } + } + + private static SAMReadGroupRecord getReadGroup(SAMRecord r) { + SAMReadGroupRecord rg = r.getReadGroup(); + if ( rg == null ) { + String msg = "Read "+r.getReadName()+" lacks read group information; Please associate all reads with read groups"; + throw new UserException.MalformedBAM(r, msg); + } + + return rg; + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java new file mode 100644 index 000000000..3a51a9a6a --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -0,0 +1,1109 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.coverage; + +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.commandline.Advanced; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec; +import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; + +import java.io.File; +import java.io.PrintStream; +import 
java.util.*; + +/** + * Assess sequence coverage by a wide array of metrics, partitioned by sample, read group, or library + * + *

+ * This tool processes a set of bam files to determine coverage at different levels of partitioning and + * aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by + * sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles, + * and/or percentage of bases covered to or beyond a threshold. + * Additionally, reads and bases can be filtered by mapping or base quality score. + * + *

Input

+ *

+ * One or more bam files (with proper headers) to be analyzed for coverage statistics + *

+ *

+ * (Optional) A REFSEQ Rod to aggregate coverage to the gene level + *

+ * (for information about creating the REFSEQ Rod, please consult the online documentation) + *

+ *

Output

+ *

+ * Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: + *

+ * - no suffix: per locus coverage + *

+ * - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases + *

+ * - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases + *

+ * - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval + *

+ * - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >= Y samples + *

+ * - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene + *

+ * - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples + *

+ * - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases + *

+ * - _cumulative_coverage_proportions: proportions of loci with >= X coverage, aggregated over all bases + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T DepthOfCoverage \
+ *   -o file_name_base \
+ *   -I input_bams.list
+ *   [-geneList refSeq.sorted.txt] \
+ *   [-pt readgroup] \
+ *   [-ct 4 -ct 6 -ct 10] \
+ *   [-L my_capture_genes.interval_list]
+ * 
+ * + */ +// todo -- cache the map from sample names to means in the print functions, rather than regenerating each time +// todo -- support for granular histograms for total depth; maybe n*[start,stop], bins*sqrt(n) +// todo -- alter logarithmic scaling to spread out bins more +// todo -- allow for user to set linear binning (default is logarithmic) +// todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class}, gotoDev = HelpConstants.MC) +@By(DataSource.REFERENCE) +@PartitionBy(PartitionType.NONE) +@Downsample(by= DownsampleType.NONE, toCoverage=Integer.MAX_VALUE) +public class DepthOfCoverage extends LocusWalker>, CoveragePartitioner> implements TreeReducible { + @Output + @Multiplex(value=DoCOutputMultiplexer.class,arguments={"partitionTypes","refSeqGeneList","omitDepthOutput","omitIntervals","omitSampleSummary","omitLocusTable"}) + Map out; + /** + * Reads with mapping quality values lower than this threshold will be skipped. This is set to -1 by default to disable the evaluation and ignore this threshold. + */ + @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth", required = false, minValue = 0, maxValue = Integer.MAX_VALUE) + int minMappingQuality = -1; + /** + * Reads with mapping quality values higher than this threshold will be skipped. The default value is the largest number that can be represented as an integer by the program. + */ + @Argument(fullName = "maxMappingQuality", doc = "Maximum mapping quality of reads to count towards depth", required = false, minValue = 0, maxValue = Integer.MAX_VALUE) + int maxMappingQuality = Integer.MAX_VALUE; + /** + * Bases with quality scores lower than this threshold will be skipped. This is set to -1 by default to disable the evaluation and ignore this threshold. 
+ */ + @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth", required = false, minValue = 0, maxValue = Byte.MAX_VALUE) + byte minBaseQuality = -1; + /** + * Bases with quality scores higher than this threshold will be skipped. The default value is the largest number that can be represented as a byte. + */ + @Argument(fullName = "maxBaseQuality", doc = "Maximum quality of bases to count towards depth", required = false, minValue = 0, maxValue = Byte.MAX_VALUE) + byte maxBaseQuality = Byte.MAX_VALUE; + + @Argument(fullName = "countType", doc = "How should overlapping reads from the same fragment be handled?", required = false) + CoverageUtils.CountPileupType countType = CoverageUtils.CountPileupType.COUNT_READS; + + /** + * Instead of reporting depth, the program will report the base pileup at each locus + */ + @Argument(fullName = "printBaseCounts", shortName = "baseCounts", doc = "Add base counts to per-locus output", required = false) + boolean printBaseCounts = false; + + /** + * Disabling the tabulation of locus statistics (# loci covered by sample by coverage) should speed up processing. + */ + @Argument(fullName = "omitLocusTable", shortName = "omitLocusTable", doc = "Do not calculate per-sample per-depth counts of loci", required = false) + boolean omitLocusTable = false; + + /** + * Disabling the tabulation of interval statistics (mean, median, quartiles AND # intervals by sample by coverage) should speed up processing. This option is required in order to use -nt parallelism. + */ + @Argument(fullName = "omitIntervalStatistics", shortName = "omitIntervals", doc = "Do not calculate per-interval statistics", required = false) + boolean omitIntervals = false; + /** + * Disabling the tabulation of total coverage at every base should speed up processing. 
+ */ + @Argument(fullName = "omitDepthOutputAtEachBase", shortName = "omitBaseOutput", doc = "Do not output depth of coverage at each base", required = false) + boolean omitDepthOutput = false; + + /** + * Specify a RefSeq file for use in aggregating coverage statistics over genes. + */ + @Argument(fullName = "calculateCoverageOverGenes", shortName = "geneList", doc = "Calculate coverage statistics over this list of genes", required = false) + File refSeqGeneList = null; + + /** + * Output file format (e.g. csv, table, rtable); defaults to r-readable table. + */ + @Argument(fullName = "outputFormat", doc = "The format of the output file", required = false) + String outputFormat = "rtable"; + + + // --------------------------------------------------------------------------- + // + // Advanced arguments + // + // --------------------------------------------------------------------------- + + /** + * Normally, sites where the reference is N (or another non-canonical base) are skipped. If this option is enabled, these sites will be included in DoC calculations if there is coverage from neighboring reads. + */ + @Advanced + @Argument(fullName = "includeRefNSites", doc = "Include sites where the reference is N", required = false) + boolean includeRefNBases = false; + /** + * Use this option to calibrate what bins you want before performing full calculations on your data. + */ + @Advanced + @Argument(fullName = "printBinEndpointsAndExit", doc = "Print the bin values and exit immediately", required = false) + boolean printBinEndpointsAndExit = false; + /** + * Sets the low-coverage cutoff for granular binning. All loci with depth < START are counted in the first bin. + */ + @Advanced + @Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false, minValue = 0) + int start = 1; + /** + * Sets the high-coverage cutoff for granular binning. All loci with depth > STOP are counted in the last bin. 
+ */ + @Advanced + @Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false, minValue = 1) + int stop = 500; + /** + * Sets the number of bins for granular binning + */ + @Advanced + @Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false, minValue = 0, minRecommendedValue = 1) + int nBins = 499; + + /** + * This option simply disables writing separate files for per-sample summary statistics (total, mean, median, quartile coverage per sample). These statistics are still calculated internally, so enabling this option will not improve runtime. + */ + @Argument(fullName = "omitPerSampleStats", shortName = "omitSampleSummary", doc = "Do not output the summary files per-sample", required = false) + boolean omitSampleSummary = false; + /** + * By default, coverage is partitioning by sample, but it can be any combination of sample, readgroup and/or library. + */ + @Argument(fullName = "partitionType", shortName = "pt", doc = "Partition type for depth of coverage", required = false) + Set partitionTypes = EnumSet.of(DoCOutputType.Partition.sample); + + /** + * Consider a spanning deletion as contributing to coverage. Also enables deletion counts in per-base output. + */ + @Advanced + @Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false) + boolean includeDeletions = false; + + @Advanced + @Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false) + boolean ignoreDeletionSites = false; + + /** + * For summary file outputs, report the percentage of bases covered to an amount equal to or greater than this number (e.g. % bases >= CT for each sample). Defaults to 15; can take multiple arguments. 
+ */ + @Advanced + @Argument(fullName = "summaryCoverageThreshold", shortName = "ct", doc = "Coverage threshold (in percent) for summarizing statistics", required = false) + int[] coverageThresholds = {15}; + + String[] OUTPUT_FORMATS = {"table","rtable","csv"}; + String separator = "\t"; + Map> orderCheck = new HashMap>(); + + //////////////////////////////////////////////////////////////////////////////////// + // STANDARD WALKER METHODS + //////////////////////////////////////////////////////////////////////////////////// + + public boolean includeReadsWithDeletionAtLoci() { return includeDeletions && ! ignoreDeletionSites; } + + public void initialize() { + + if ( printBinEndpointsAndExit ) { + int[] endpoints = DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins); + System.out.print("[ "); + for ( int e : endpoints ) { + System.out.print(e+" "); + } + System.out.println("]"); + System.exit(0); + } + + // Check the output format + boolean goodOutputFormat = false; + for ( String f : OUTPUT_FORMATS ) { + goodOutputFormat = goodOutputFormat || f.equals(outputFormat); + } + + if ( ! goodOutputFormat ) { + throw new IllegalArgumentException("Improper output format. Can be one of table,rtable,csv. Was "+outputFormat); + } + + if ( outputFormat.equals("csv") ) { + separator = ","; + } + + if ( ! 
omitDepthOutput ) { // print header + PrintStream out = getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary); + out.printf("%s\t%s","Locus","Total_Depth"); + for (DoCOutputType.Partition type : partitionTypes ) { + out.printf("\t%s_%s","Average_Depth",type.toString()); + } + + // get all the samples + HashSet allSamples = getSamplesFromToolKit(partitionTypes); + ArrayList allSampleList = new ArrayList(allSamples.size()); + for ( String s : allSamples ) { + allSampleList.add(s); + } + Collections.sort(allSampleList); + + for ( String s : allSampleList) { + out.printf("\t%s_%s","Depth_for",s); + if ( printBaseCounts ) { + out.printf("\t%s_%s",s,"base_counts"); + } + } + + out.printf("%n"); + + } else { + logger.info("Per-Locus Depth of Coverage output was omitted"); + } + + for (DoCOutputType.Partition type : partitionTypes ) { + orderCheck.put(type,new ArrayList()); + for ( String id : getSamplesFromToolKit(type) ) { + orderCheck.get(type).add(id); + } + Collections.sort(orderCheck.get(type)); + } + } + + private HashSet getSamplesFromToolKit( Collection types ) { + HashSet partitions = new HashSet(); // since the DOCS object uses a HashMap, this will be in the same order + for (DoCOutputType.Partition t : types ) { + partitions.addAll(getSamplesFromToolKit(t)); + } + + return partitions; + } + + private HashSet getSamplesFromToolKit(DoCOutputType.Partition type) { + HashSet partition = new HashSet(); + if ( type == DoCOutputType.Partition.sample ) { + partition.addAll(SampleUtils.getSAMFileSamples(getToolkit())); + } else if ( type == DoCOutputType.Partition.readgroup ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(rg.getSample()+"_rg_"+rg.getReadGroupId()); + } + } else if ( type == DoCOutputType.Partition.library ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(rg.getLibrary()); + } + } else if ( type == 
DoCOutputType.Partition.center ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(rg.getSequencingCenter()); + } + } else if ( type == DoCOutputType.Partition.platform ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(rg.getPlatform()); + } + } else if ( type == DoCOutputType.Partition.sample_by_center ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(String.format("%s_cn_%s",rg.getSample(),rg.getSequencingCenter())); + } + } else if ( type == DoCOutputType.Partition.sample_by_platform ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(String.format("%s_pl_%s",rg.getSample(),rg.getPlatform())); + } + } else if ( type == DoCOutputType.Partition.sample_by_platform_by_center ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(String.format("%s_pl_%s_cn_%s",rg.getSample(),rg.getPlatform(),rg.getSequencingCenter())); + } + } else { + throw new ReviewedStingException("Invalid aggregation type sent to getSamplesFromToolKit"); + } + + return partition; + } + + public boolean isReduceByInterval() { + return ( ! omitIntervals ); + } + + public CoveragePartitioner reduceInit() { + CoveragePartitioner aggro = new CoveragePartitioner(partitionTypes,start,stop,nBins); + for (DoCOutputType.Partition t : partitionTypes ) { + aggro.addIdentifiers(t,getSamplesFromToolKit(t)); + } + aggro.initialize(includeDeletions,omitLocusTable); + checkOrder(aggro); + return aggro; + } + + public Map> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if (includeRefNBases || BaseUtils.isRegularBase(ref.getBase())) { + if ( ! 
omitDepthOutput ) { + getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary).printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives) + //System.out.printf("\t[log]\t%s",ref.getLocus()); + } + + return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,maxMappingQuality,minBaseQuality,maxBaseQuality,countType,partitionTypes); + } else { + return null; + } + } + + public CoveragePartitioner reduce(Map> thisMap, CoveragePartitioner prevReduce) { + if ( thisMap != null ) { // skip sites we didn't want to include in the calculation (ref Ns) + if ( ! omitDepthOutput ) { + //checkOrder(prevReduce); // tests prevReduce.getIdentifiersByType().get(t) against the initialized header order + printDepths(getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary),thisMap,prevReduce.getIdentifiersByType()); + // this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without + // turning on omit + } + + prevReduce.update(thisMap); // note that in "useBoth" cases, this method alters the thisMap object + } + + return prevReduce; + } + + public CoveragePartitioner treeReduce(CoveragePartitioner left, CoveragePartitioner right) { + left.merge(right); + return left; + } + + //////////////////////////////////////////////////////////////////////////////////// + // INTERVAL ON TRAVERSAL DONE + //////////////////////////////////////////////////////////////////////////////////// + + public void onTraversalDone( List> statsByInterval ) { + if ( refSeqGeneList != null && partitionTypes.contains(DoCOutputType.Partition.sample) ) { + printGeneStats(statsByInterval); + } + + if ( statsByInterval.size() > 0 ) { + for(DoCOutputType.Partition partition: partitionTypes) { + if ( checkType(statsByInterval.get(0).getSecond().getCoverageByAggregationType(partition) ,partition) ) { + printIntervalStats(statsByInterval, + 
getCorrectStream(partition, DoCOutputType.Aggregation.interval, DoCOutputType.FileType.summary), + getCorrectStream(partition, DoCOutputType.Aggregation.interval, DoCOutputType.FileType.statistics), + partition); + } else { + throw new ReviewedStingException("Partition type "+partition.toString()+" had no entries. Please check that your .bam header has all appropriate partition types."); + } + } + } else { + throw new UserException.CommandLineException("Cannot reduce by interval without a list of intervals. Please provide an interval list using the -L argument."); + } + + onTraversalDone(mergeAll(statsByInterval)); + + } + + public CoveragePartitioner mergeAll(List> stats) { + CoveragePartitioner first = stats.remove(0).second; + for ( Pair iStat : stats ) { + treeReduce(first,iStat.second); + } + + return first; + } + + private DepthOfCoverageStats printIntervalStats(List> statsByInterval, PrintStream summaryOut, PrintStream statsOut, DoCOutputType.Partition type) { + Pair firstPair = statsByInterval.get(0); + CoveragePartitioner firstAggregator = firstPair.second; + DepthOfCoverageStats firstStats = firstAggregator.getCoverageByAggregationType(type); + + StringBuilder summaryHeader = new StringBuilder(); + summaryHeader.append("Target"); + summaryHeader.append(separator); + summaryHeader.append("total_coverage"); + summaryHeader.append(separator); + summaryHeader.append("average_coverage"); + + for ( String s : firstStats.getAllSamples() ) { + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_total_cvg"); + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_mean_cvg"); + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_granular_Q1"); + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_granular_median"); + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_granular_Q3"); + for ( int thresh : 
coverageThresholds ) { + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_%_above_"); + summaryHeader.append(thresh); + } + } + + summaryOut.printf("%s%n",summaryHeader); + + int[][] nTargetsByAvgCvgBySample = new int[firstStats.getHistograms().size()][firstStats.getEndpoints().length+1]; + + for ( Pair targetAggregator : statsByInterval ) { + + Pair targetStats = new Pair( + targetAggregator.first, targetAggregator.second.getCoverageByAggregationType(type)); + printTargetSummary(summaryOut,targetStats); + updateTargetTable(nTargetsByAvgCvgBySample,targetStats.second); + } + + printIntervalTable(statsOut,nTargetsByAvgCvgBySample,firstStats.getEndpoints()); + + return firstStats; + } + + private void printGeneStats(List> statsByTarget) { + logger.debug("statsByTarget size is "+Integer.toString(statsByTarget.size())); + logger.debug("Initializing refseq..."); + LocationAwareSeekableRODIterator refseqIterator = initializeRefSeq(); + logger.debug("Refseq init done."); + List> statsByGene = new ArrayList>();// maintains order + Map geneNamesToStats = new HashMap(); // allows indirect updating of objects in list + + for ( Pair targetStats : statsByTarget ) { + String gene = getGeneName(targetStats.first,refseqIterator); + if ( geneNamesToStats.keySet().contains(gene) ) { + logger.debug("Merging "+geneNamesToStats.get(gene).toString()+" and "+targetStats.second.getCoverageByAggregationType(DoCOutputType.Partition.sample).toString()); + geneNamesToStats.get(gene).merge(targetStats.second.getCoverageByAggregationType(DoCOutputType.Partition.sample)); + } else { + DepthOfCoverageStats merger = new DepthOfCoverageStats(targetStats.second.getCoverageByAggregationType(DoCOutputType.Partition.sample)); + geneNamesToStats.put(gene,merger); + statsByGene.add(new Pair(gene,merger)); + } + } + + PrintStream geneSummaryOut = getCorrectStream(DoCOutputType.Partition.sample, DoCOutputType.Aggregation.gene, DoCOutputType.FileType.summary); + 
StringBuilder summaryHeader = new StringBuilder(); + summaryHeader.append("Gene"); + summaryHeader.append(separator); + summaryHeader.append("total_coverage"); + summaryHeader.append(separator); + summaryHeader.append("average_coverage"); + + for ( String s : statsByTarget.get(0).second.getCoverageByAggregationType(DoCOutputType.Partition.sample).getAllSamples() ) { + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_total_cvg"); + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_mean_cvg"); + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_granular_Q1"); + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_granular_median"); + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_granular_Q3"); + for ( int thresh : coverageThresholds ) { + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_%_above_"); + summaryHeader.append(thresh); + } + } + + geneSummaryOut.printf("%s%n",summaryHeader); + + for ( Pair geneStats : statsByGene ) { + printTargetSummary(geneSummaryOut,geneStats); + } + } + + //blatantly stolen from Andrew Kernytsky + private String getGeneName(GenomeLoc target, LocationAwareSeekableRODIterator refseqIterator) { + logger.debug("Examining "+target.toString()); + if (refseqIterator == null) { return "UNKNOWN"; } + + RODRecordList annotationList = refseqIterator.seekForward(target); + logger.debug("Annotation list is " + (annotationList == null ? 
"null" : annotationList.getName())); + if (annotationList == null) { return "UNKNOWN"; } + + for(GATKFeature rec : annotationList) { + if ( ((RefSeqFeature)rec.getUnderlyingObject()).overlapsExonP(target) ) { + logger.debug("We do overlap "+ rec.getUnderlyingObject().toString()); + return ((RefSeqFeature)rec.getUnderlyingObject()).getGeneName(); + } + logger.debug("No overlap"); + } + + return "UNKNOWN"; + + } + + private LocationAwareSeekableRODIterator initializeRefSeq() { + RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), + getToolkit().getGenomeLocParser(), + getToolkit().getArguments().unsafe, + getToolkit().getArguments().disableAutoIndexCreationAndLockingWhenReadingRods); + RMDTrack refseq = builder.createInstanceOfTrack(RefSeqCodec.class,refSeqGeneList); + return new SeekableRODIterator(refseq.getHeader(),refseq.getSequenceDictionary(),getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), + getToolkit().getGenomeLocParser(),refseq.getIterator()); + } + + private void printTargetSummary(PrintStream output, Pair intervalStats) { + DepthOfCoverageStats stats = intervalStats.second; + int[] bins = stats.getEndpoints(); + + StringBuilder targetSummary = new StringBuilder(); + targetSummary.append(intervalStats.first.toString()); + targetSummary.append(separator); + targetSummary.append(stats.getTotalCoverage()); + targetSummary.append(separator); + targetSummary.append(String.format("%.2f",stats.getTotalMeanCoverage())); + + for ( String s : stats.getAllSamples() ) { + targetSummary.append(separator); + targetSummary.append(stats.getTotals().get(s)); + targetSummary.append(separator); + targetSummary.append(String.format("%.2f", stats.getMeans().get(s))); + targetSummary.append(separator); + int median = getQuantile(stats.getHistograms().get(s),0.5); + int q1 = getQuantile(stats.getHistograms().get(s),0.25); + int q3 = 
getQuantile(stats.getHistograms().get(s),0.75); + targetSummary.append(formatBin(bins,q1)); + targetSummary.append(separator); + targetSummary.append(formatBin(bins,median)); + targetSummary.append(separator); + targetSummary.append(formatBin(bins,q3)); + for ( int thresh : coverageThresholds ) { + targetSummary.append(String.format("%s%.1f",separator,getPctBasesAbove(stats.getHistograms().get(s),stats.value2bin(thresh)))); + } + + } + + output.printf("%s%n", targetSummary); + } + + private String formatBin(int[] bins, int quartile) { + if ( quartile >= bins.length ) { + return String.format(">%d",bins[bins.length-1]); + } else if ( quartile < 0 ) { + return String.format("<%d",bins[0]); + } else { + return String.format("%d",bins[quartile]); + } + } + + private void printIntervalTable(PrintStream output, int[][] intervalTable, int[] cutoffs) { + String colHeader = outputFormat.equals("rtable") ? "" : "Number_of_sources"; + output.printf(colHeader + separator+"depth>=%d",0); + for ( int col = 0; col < intervalTable[0].length-1; col ++ ) { + output.printf(separator+"depth>=%d",cutoffs[col]); + } + + output.printf(String.format("%n")); + for ( int row = 0; row < intervalTable.length; row ++ ) { + output.printf("At_least_%d_samples",row+1); + for ( int col = 0; col < intervalTable[0].length; col++ ) { + output.printf(separator+"%d",intervalTable[row][col]); + } + output.printf(String.format("%n")); + } + } + + /* + * @updateTargetTable + * The idea is to have counts for how many *targets* have at least K samples with + * median coverage of at least X. + * To that end: + * Iterate over the samples the DOCS object, determine how many there are with + * median coverage > leftEnds[0]; how many with median coverage > leftEnds[1] + * and so on. Then this target has at least N, N-1, N-2, ... 1, 0 samples covered + * to leftEnds[0] and at least M,M-1,M-2,...1,0 samples covered to leftEnds[1] + * and so on. 
+ */ + private void updateTargetTable(int[][] table, DepthOfCoverageStats stats) { + int[] cutoffs = stats.getEndpoints(); + int[] countsOfMediansAboveCutoffs = new int[cutoffs.length+1]; // 0 bin to catch everything + for ( int i = 0; i < countsOfMediansAboveCutoffs.length; i ++) { + countsOfMediansAboveCutoffs[i]=0; + } + + for ( String s : stats.getAllSamples() ) { + int medianBin = getQuantile(stats.getHistograms().get(s),0.5); + for ( int i = 0; i <= medianBin; i ++) { + countsOfMediansAboveCutoffs[i]++; + } + } + + for ( int medianBin = 0; medianBin < countsOfMediansAboveCutoffs.length; medianBin++) { + for ( ; countsOfMediansAboveCutoffs[medianBin] > 0; countsOfMediansAboveCutoffs[medianBin]-- ) { + table[countsOfMediansAboveCutoffs[medianBin]-1][medianBin]++; + // the -1 is due to counts being 1-based and offsets being 0-based + } + } + } + + //////////////////////////////////////////////////////////////////////////////////// + // FINAL ON TRAVERSAL DONE + //////////////////////////////////////////////////////////////////////////////////// + + public void onTraversalDone(CoveragePartitioner coverageProfiles) { + /////////////////// + // OPTIONAL OUTPUTS + ////////////////// + + if ( ! omitSampleSummary ) { + logger.info("Printing summary info"); + for (DoCOutputType.Partition type : partitionTypes ) { + outputSummaryFiles(coverageProfiles,type); + } + } + + if ( ! 
omitLocusTable ) { + logger.info("Printing locus summary"); + for (DoCOutputType.Partition type : partitionTypes ) { + outputLocusFiles(coverageProfiles,type); + } + } + } + + private void outputLocusFiles(CoveragePartitioner coverageProfiles, DoCOutputType.Partition type ) { + printPerLocus(getCorrectStream(type, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.coverage_counts), + getCorrectStream(type, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.coverage_proportions), + coverageProfiles.getCoverageByAggregationType(type),type); + } + + private void outputSummaryFiles(CoveragePartitioner coverageProfiles, DoCOutputType.Partition type ) { + printPerSample(getCorrectStream(type, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.statistics),coverageProfiles.getCoverageByAggregationType(type)); + printSummary(getCorrectStream(type, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.summary),coverageProfiles.getCoverageByAggregationType(type)); + } + + //////////////////////////////////////////////////////////////////////////////////// + // HELPER OUTPUT METHODS + //////////////////////////////////////////////////////////////////////////////////// + + private void printPerSample(PrintStream output,DepthOfCoverageStats stats) { + int[] leftEnds = stats.getEndpoints(); + + StringBuilder hBuilder = new StringBuilder(); + if ( ! 
outputFormat.equals("rTable")) { + hBuilder.append("Source_of_reads"); + } + hBuilder.append(separator); + hBuilder.append(String.format("from_0_to_%d)%s",leftEnds[0],separator)); + for ( int i = 1; i < leftEnds.length; i++ ) + hBuilder.append(String.format("from_%d_to_%d)%s",leftEnds[i-1],leftEnds[i],separator)); + hBuilder.append(String.format("from_%d_to_inf%n",leftEnds[leftEnds.length-1])); + output.print(hBuilder.toString()); + Map histograms = stats.getHistograms(); + + for ( Map.Entry p : histograms.entrySet() ) { + StringBuilder sBuilder = new StringBuilder(); + sBuilder.append(String.format("sample_%s",p.getKey())); + for ( long count : p.getValue() ) { + sBuilder.append(String.format("%s%d",separator,count)); + } + sBuilder.append(String.format("%n")); + output.print(sBuilder.toString()); + } + } + + private void printPerLocus(PrintStream output, PrintStream coverageOut, DepthOfCoverageStats stats, DoCOutputType.Partition partitionType) { + int[] endpoints = stats.getEndpoints(); + int samples = stats.getHistograms().size(); + + long[][] baseCoverageCumDist = stats.getLocusCounts(); + + // rows - # of samples + // columns - depth of coverage + + boolean printSampleColumnHeader = outputFormat.equals("csv") || outputFormat.equals("table"); + + StringBuilder header = new StringBuilder(); + if ( printSampleColumnHeader ) { + // mhanna 22 Aug 2010 - Deliberately force this header replacement to make sure integration tests pass. + // TODO: Update integration tests and get rid of this. + header.append(partitionType == DoCOutputType.Partition.readgroup ? 
"read_group" : partitionType.toString()); + } + header.append(String.format("%sgte_0",separator)); + for ( int d : endpoints ) { + header.append(String.format("%sgte_%d",separator,d)); + } + header.append(String.format("%n")); + + output.print(header); + coverageOut.print(header); + + for ( int row = 0; row < samples; row ++ ) { + output.printf("%s_%d","NSamples",row+1); + for ( int depthBin = 0; depthBin < baseCoverageCumDist[0].length; depthBin ++ ) { + output.printf("%s%d",separator,baseCoverageCumDist[row][depthBin]); + } + output.printf("%n"); + } + + for ( String sample : stats.getAllSamples() ) { + coverageOut.printf("%s",sample); + double[] coverageDistribution = stats.getCoverageProportions(sample); + for ( int bin = 0; bin < coverageDistribution.length; bin ++ ) { + coverageOut.printf("%s%.2f",separator,coverageDistribution[bin]); + } + coverageOut.printf("%n"); + } + } + + private PrintStream getCorrectStream(DoCOutputType.Partition partition, DoCOutputType.Aggregation aggregation, DoCOutputType.FileType fileType) { + DoCOutputType outputType = new DoCOutputType(partition,aggregation,fileType); + if(!out.containsKey(outputType)) + throw new UserException.CommandLineException(String.format("Unable to find appropriate stream for partition = %s, aggregation = %s, file type = %s",partition,aggregation,fileType)); + return out.get(outputType); + } + + private void printSummary(PrintStream output, DepthOfCoverageStats stats) { + if ( ! 
outputFormat.equals("csv") ) { + output.printf("%s\t%s\t%s\t%s\t%s\t%s","sample_id","total","mean","granular_third_quartile","granular_median","granular_first_quartile"); + } else { + output.printf("%s,%s,%s,%s,%s,%s","sample_id","total","mean","granular_third_quartile","granular_median","granular_first_quartile"); + } + + for ( int thresh : coverageThresholds ) { + output.printf("%s%s%d",separator,"%_bases_above_",thresh); + } + + output.printf("%n"); + + Map histograms = stats.getHistograms(); + Map means = stats.getMeans(); + Map totals = stats.getTotals(); + int[] leftEnds = stats.getEndpoints(); + + for ( Map.Entry p : histograms.entrySet() ) { + String s = p.getKey(); + long[] histogram = p.getValue(); + int median = getQuantile(histogram,0.5); + int q1 = getQuantile(histogram,0.25); + int q3 = getQuantile(histogram,0.75); + // if any of these are larger than the higest bin, put the median as in the largest bin + median = median == histogram.length-1 ? histogram.length-2 : median; + q1 = q1 == histogram.length-1 ? histogram.length-2 : q1; + q3 = q3 == histogram.length-1 ? histogram.length-2 : q3; + if ( ! outputFormat.equals("csv") ) { + output.printf("%s\t%d\t%.2f\t%d\t%d\t%d",s,totals.get(s),means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]); + } else { + output.printf("%s,%d,%.2f,%d,%d,%d",s,totals.get(s),means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]); + } + + for ( int thresh : coverageThresholds ) { + output.printf("%s%.1f",separator,getPctBasesAbove(histogram,stats.value2bin(thresh))); + } + + output.printf("%n"); + } + + if ( ! 
outputFormat.equals("csv") ) { + output.printf("%s\t%d\t%.2f\t%s\t%s\t%s%n","Total",stats.getTotalCoverage(),stats.getTotalMeanCoverage(),"N/A","N/A","N/A"); + } else { + output.printf("%s,%d,%.2f,%s,%s,%s%n","Total",stats.getTotalCoverage(),stats.getTotalMeanCoverage(),"N/A","N/A","N/A"); + } + } + + private int getQuantile(long[] histogram, double prop) { + int total = 0; + + for ( int i = 0; i < histogram.length; i ++ ) { + total += histogram[i]; + } + + int counts = 0; + int bin = -1; + while ( counts < prop*total ) { + counts += histogram[bin+1]; + bin++; + } + + return bin == -1 ? 0 : bin; + } + + private double getPctBasesAbove(long[] histogram, int bin) { + long below = 0l; + long above = 0l; + for ( int index = 0; index < histogram.length; index++) { + if ( index < bin ) { + below+=histogram[index]; + } else { + above+=histogram[index]; + } + } + + return 100*( (double) above )/( above + below ); + } + + private void printDepths(PrintStream stream, Map> countsBySampleByType, Map> identifiersByType) { + // get the depths per sample and build up the output string while tabulating total and average coverage + StringBuilder perSampleOutput = new StringBuilder(); + int tDepth = 0; + boolean depthCounted = false; + for (DoCOutputType.Partition type : partitionTypes ) { + Map countsByID = countsBySampleByType.get(type); + for ( String s : identifiersByType.get(type) ) { + perSampleOutput.append(separator); + long dp = (countsByID != null && countsByID.keySet().contains(s)) ? sumArray(countsByID.get(s)) : 0 ; + perSampleOutput.append(dp); + if ( printBaseCounts ) { + perSampleOutput.append(separator); + perSampleOutput.append(baseCounts(countsByID != null ? countsByID.get(s) : null )); + } + if ( ! 
depthCounted ) { + tDepth += dp; + } + } + depthCounted = true; // only sum the total depth once + } + + // remember -- genome locus was printed in map() + stream.printf("%s%d",separator,tDepth); + for (DoCOutputType.Partition type : partitionTypes ) { + stream.printf("%s%.2f",separator, ( (double) tDepth / identifiersByType.get(type).size() ) ); + } + stream.printf("%s%n",perSampleOutput); + } + + private long sumArray(int[] array) { + long i = 0; + for ( int j : array ) { + i += j; + } + return i; + } + + private String baseCounts(int[] counts) { + if ( counts == null ) { + counts = new int[6]; + } + StringBuilder s = new StringBuilder(); + int nbases = 0; + for ( byte b : BaseUtils.EXTENDED_BASES ) { + nbases++; + if ( includeDeletions || b != BaseUtils.Base.D.base ) { + s.append((char)b); + s.append(":"); + s.append(counts[BaseUtils.extendedBaseToBaseIndex(b)]); + if ( nbases < 6 ) { + s.append(" "); + } + } + } + + return s.toString(); + } + + private void checkOrder(CoveragePartitioner ag) { + // make sure the ordering stored at initialize() is propagated along reduce + for (DoCOutputType.Partition t : partitionTypes ) { + List order = orderCheck.get(t); + List namesInAg = ag.getIdentifiersByType().get(t); + + // todo -- chris check me + Set namesInDOCS = ag.getCoverageByAggregationType(t).getAllSamples(); + int index = 0; + for ( String s : namesInAg ) { + if ( ! s.equalsIgnoreCase(order.get(index)) ) { + throw new ReviewedStingException("IDs are out of order for type "+t+"! Aggregator has different ordering"); + } + index++; + } + } + } + + public boolean checkType(DepthOfCoverageStats stats, DoCOutputType.Partition type ) { + if ( stats.getHistograms().size() < 1 ) { + logger.warn("The histogram per partition type "+type.toString()+" was empty\n"+ + "Do your read groups have this type? 
(Check your .bam header)."); + return false; + } else { + return true; + } + } + +} + +class DoCOutputMultiplexer implements Multiplexer { + private final Set partitions; + private final File refSeqGeneList; + private final boolean omitDepthOutput; + private final boolean omitIntervals; + private final boolean omitSampleSummary; + private final boolean omitLocusTable; + + /** + * Create a new multiplexer type using the values of all variable fields. + * @param partitions + * @param refSeqGeneList + * @param omitDepthOutput + * @param omitIntervals + * @param omitSampleSummary + * @param omitLocusTable + */ + public DoCOutputMultiplexer(final Set partitions, + final File refSeqGeneList, + final boolean omitDepthOutput, + final boolean omitIntervals, + final boolean omitSampleSummary, + final boolean omitLocusTable) { + this.partitions = partitions; + this.refSeqGeneList = refSeqGeneList; + this.omitDepthOutput = omitDepthOutput; + this.omitIntervals = omitIntervals; + this.omitSampleSummary = omitSampleSummary; + this.omitLocusTable = omitLocusTable; + } + + public Collection multiplex() { + List outputs = new ArrayList(); + if(!omitDepthOutput) outputs.add(new DoCOutputType(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary)); + + if(!omitIntervals) { + for(DoCOutputType.Partition partition: partitions) { + outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.interval, DoCOutputType.FileType.summary)); + outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.interval, DoCOutputType.FileType.statistics)); + } + } + + if(refSeqGeneList != null && partitions.contains(DoCOutputType.Partition.sample)) { + DoCOutputType geneSummaryOut = new DoCOutputType(DoCOutputType.Partition.sample, DoCOutputType.Aggregation.gene, DoCOutputType.FileType.summary); + outputs.add(geneSummaryOut); + } + + if(!omitSampleSummary) { + for(DoCOutputType.Partition partition: partitions) { + outputs.add(new DoCOutputType(partition, 
DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.summary)); + outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.statistics)); + } + } + + if(!omitLocusTable) { + for(DoCOutputType.Partition partition: partitions) { + outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.coverage_counts)); + outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.coverage_proportions)); + } + } + + return outputs; + } + + public String transformArgument(final DoCOutputType outputType, final String argument) { + return outputType.getFileName(argument); + } + +} + +class CoveragePartitioner { + private Collection types; + private Map coverageProfiles; + private Map> identifiersByType; + private Set allIdentifiers; + public CoveragePartitioner(Collection typesToUse, int start, int stop, int nBins) { + coverageProfiles = new HashMap(); + identifiersByType = new HashMap>(); + types = typesToUse; + for ( DoCOutputType.Partition type : types ) { + coverageProfiles.put(type,new DepthOfCoverageStats(DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins))); + identifiersByType.put(type,new ArrayList()); + } + allIdentifiers = new HashSet(); + } + + public void merge(CoveragePartitioner otherAggregator) { + for ( DoCOutputType.Partition type : types ) { + this.coverageProfiles.get(type).merge(otherAggregator.coverageProfiles.get(type)); + } + } + + public DepthOfCoverageStats getCoverageByAggregationType(DoCOutputType.Partition t) { + return coverageProfiles.get(t); + } + + public void addIdentifiers(DoCOutputType.Partition t, Set ids) { + for ( String s : ids ) { + coverageProfiles.get(t).addSample(s); + identifiersByType.get(t).add(s); + allIdentifiers.add(s); + } + Collections.sort(identifiersByType.get(t)); + } + + public void initialize(boolean useDels, boolean omitLocusTable) { + for ( DoCOutputType.Partition t : types ) { + 
if ( useDels ) { + coverageProfiles.get(t).initializeDeletions(); + } + if ( ! omitLocusTable ) { + coverageProfiles.get(t).initializeLocusCounts(); + } + } + } + + public void update(Map> countsByIdentifierByType) { + for ( DoCOutputType.Partition t : types ) { + coverageProfiles.get(t).update(countsByIdentifierByType.get(t)); + } + } + + public Set getAllIdentifiers() { + return allIdentifiers; + } + + public Map> getIdentifiersByType() { + return identifiersByType; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DoCOutputType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/DoCOutputType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DoCOutputType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/DoCOutputType.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java similarity index 100% 
rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaSequence.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaSequence.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaSequence.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaSequence.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/ClusteredSnps.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/filters/ClusteredSnps.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/filters/ClusteredSnps.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/filters/ClusteredSnps.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContext.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContext.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContext.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContext.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContextWindow.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContextWindow.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContextWindow.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContextWindow.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java similarity 
index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/package-info.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java new file mode 100644 index 000000000..b6a3853f8 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java @@ -0,0 +1,258 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.io.PrintStream; +import java.util.Arrays; + +/** + * Compare GATK's internal pileup to a reference Samtools pileup + * + *

At every locus in the input set, compares the pileup data (reference base, aligned base from + * each overlapping read, and quality score) generated internally by GATK to a reference pileup data generated + * by Samtools. Note that the pileup program has been replaced in Samtools by mpileup, which produces a slightly + * different output format by default. + *

+ * + *

Format

+ *

There are two versions of the original pileup format: the current 6-column format produced by Samtools, and the old + * 10-column "consensus" format which could be obtained by using the -c argument, now deprecated.

+ *

Simple pileup: 6-column format

+ *

+ * Each line consists of chromosome, 1-based coordinate, reference base, the + * number of reads covering the site, read bases and base qualities. At the + * read base column, a dot stands for a match to the reference base on the + * forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch + * on the forward strand and `acgtn' for a mismatch on the reverse strand. + * A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between + * this reference position and the next reference position. The length of the + * insertion is given by the integer in the pattern, followed by the inserted sequence. + *

+ *
+ *     seq1 272 T 24  ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
+ *     seq1 273 T 23  ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
+ *     seq1 274 T 23  ,.$....,,.,.,...,,,.,...    7<7;<;<<<<<<<<<=<;<;<<6
+ *     seq1 275 A 23  ,$....,,.,.,...,,,.,...^l.  <+;9*<<<<<<<<<=<<:;<<<<
+ *     seq1 276 G 22  ...T,,.,.,...,,,.,....  33;+<<7=7<<7<&<<1;<<6<
+ *     seq1 277 T 22  ....,,.,.,.C.,,,.,..G.  +7<;<<<<<<<&<=<<:;<<&<
+ *     seq1 278 G 23  ....,,.,.,...,,,.,....^k.   %38*<<;<7<<7<=<<<;<<<<<
+ *     seq1 279 C 23  A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
+ * 
+ *

+ * See the Pileup format documentation for more details. + *

+ * + *

Consensus pileup: 10/13-column format

+ *

The "consensus" or extended pileup consists of the following: + *

    + *
  • original 6 columns as described above
  • + *
  • 4 extra columns representing consensus values (consensus base, consensus quality, variant quality and maximum mapping quality of the + * reads covering the sites) for all sites, inserted before the bases and quality strings
  • + *
  • 3 extra columns indicating counts of reads supporting indels (just for indel sites)
  • + *
+ *

+ *

Example of consensus pileup for SNP or non-variant sites

+ *
+ *     seq1  60  T  T  66  0  99  13  ...........^~.^~.   9<<55<;<<<<<<
+ *     seq1  61  G  G  72  0  99  15  .............^~.^y. (;975&;<<<<<<<<
+ *     seq1  62  T  T  72  0  99  15  .$..............    <;;,55;<<<<<<<<
+ *     seq1  63  G  G  72  0  99  15  .$.............^~.  4;2;<7:+<<<<<<<
+ *     seq1  64  G  G  69  0  99  14  ..............  9+5<;;;<<<<<<<
+ *     seq1  65  A  A  69  0  99  14  .$............. <5-2<;;<<<<<<;
+ *     seq1  66  C  C  66  0  99  13  .............   &*<;;<<<<<<8<
+ *     seq1  67  C  C  69  0  99  14  .............^~.    ,75<.4<<<<<-<<
+ *     seq1  68  C  C  69  0  99  14  ..............  576<;7<<<<<8<< *
+ * 
+ * + *

Example of consensus pileup for indels

+ *
+ *     Escherichia_coli_K12	3995037	*	*\/*	430	0	37	144	*	+A	143	1	0
+ *     Escherichia_coli_K12	3995279	*	*\/*	202	0	36	68	*	+A	67	1	0
+ *     Escherichia_coli_K12	3995281	*	*\/*	239	0	36	67	*	-CG	66	1	0
+ * 
+ *

+ * See Consensus pileup format (deprecated) for more details. + *

+ * + *

Input

+ *

A BAM file conatining your aligned sequence data and a pileup file generated by Samtools covering the region you + * want to examine.

+ * + *

Output

+ *

A text file listing mismatches between the input pileup and the GATK's internal pileup. If there are no mismatches, the output file is empty.

+ * + *

Example

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -T CheckPileup \
+ *   -R ref.fasta \
+ *   -I your_data.bam \
+ *   --pileup:SAMPileup pileup_file.txt \
+ *   -L chr1:257-275 \
+ *   -o output_file_name
+ * 
+ */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) +@Requires(value={DataSource.READS,DataSource.REFERENCE}) +public class CheckPileup extends LocusWalker implements TreeReducible { + /** + * This is the existing pileup against which we'll compare GATK's internal pileup at each genome position in the desired interval. + */ + @Input(fullName = "pileup", shortName = "pileup", doc="Pileup generated by Samtools", required = true) + RodBinding pileup; + + @Output + private PrintStream out; + /** + * By default the program will quit if it encounters an error (such as missing truth data for a given position). + * Use this flag to override the default behavior; the program will then simply print an error message and move on + * to the next position. + */ + @Argument(fullName="continue_after_error",doc="Continue after encountering an error",required=false) + public boolean CONTINUE_AFTER_AN_ERROR = false; + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + ReadBackedPileup pileup = context.getBasePileup(); + SAMPileupFeature truePileup = getTruePileup( tracker ); + + if ( truePileup == null ) { + out.printf("No truth pileup data available at %s%n", pileup.getPileupString(ref.getBaseAsChar())); + if ( ! CONTINUE_AFTER_AN_ERROR ) { + throw new UserException.BadInput(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases", + context.getLocation(), new String(pileup.getBases()))); + } + } else { + String pileupDiff = pileupDiff(pileup, truePileup, true); + if ( pileupDiff != null ) { + out.printf("%s vs. %s%n", pileup.getPileupString(ref.getBaseAsChar()), truePileup.getPileupString()); + if ( ! 
CONTINUE_AFTER_AN_ERROR ) { + throw new UserException.BadInput(String.format("The input pileup doesn't match the GATK's internal pileup: %s", pileupDiff)); + } + } + } + + return pileup.getNumberOfElements(); + } + + private static String maybeSorted( final String x, boolean sortMe ) + { + if ( sortMe ) { + byte[] bytes = x.getBytes(); + Arrays.sort(bytes); + return new String(bytes); + } + else + return x; + } + + public String pileupDiff(final ReadBackedPileup a, final SAMPileupFeature b, boolean orderDependent) + { + if ( a.getNumberOfElements() != b.size() ) + return "Sizes not equal"; + GenomeLoc featureLocation = getToolkit().getGenomeLocParser().createGenomeLoc(b.getChr(),b.getStart(),b.getEnd()); + if ( a.getLocation().compareTo(featureLocation) != 0 ) + return "Locations not equal"; + + String aBases = maybeSorted(new String(a.getBases()), ! orderDependent ); + String bBases = maybeSorted(b.getBasesAsString(), ! orderDependent ); + if ( ! aBases.toUpperCase().equals(bBases.toUpperCase()) ) + return "Bases not equal"; + + String aQuals = maybeSorted(new String(a.getQuals()), ! orderDependent ); + String bQuals = maybeSorted(new String(b.getQuals()), ! orderDependent ); + if ( ! aQuals.equals(bQuals) ) + return "Quals not equal"; + + return null; + } + + // Given result of map function + public CheckPileupStats reduceInit() { return new CheckPileupStats(); } + public CheckPileupStats reduce(Integer value, CheckPileupStats sum) { + sum.nLoci++; + sum.nBases += value; + return sum; + } + + public CheckPileupStats treeReduce( CheckPileupStats lhs, CheckPileupStats rhs ) { + CheckPileupStats combined = new CheckPileupStats(); + combined.nLoci = lhs.nLoci + rhs.nLoci; + combined.nBases = lhs.nBases + rhs.nBases; + return combined; + } + + /** + * Extracts the true pileup data from the given rodSAMPileup. Note that this implementation + * assumes that the genotype will only be point or indel. + * @param tracker ROD tracker from which to extract pileup data. 
+ * @return True pileup data. + */ + private SAMPileupFeature getTruePileup( RefMetaDataTracker tracker ) { + SAMPileupFeature pileupArg = tracker.getFirstValue(pileup); + + if( pileupArg == null) + return null; + + if( pileupArg.hasPointGenotype() ) + return pileupArg.getPointGenotype(); + else if( pileupArg.hasIndelGenotype() ) + return pileupArg.getIndelGenotype(); + else + throw new ReviewedStingException("Unsupported pileup type: " + pileupArg); + } +} + +class CheckPileupStats { + public long nLoci = 0; + public long nBases = 0; + + public CheckPileupStats() { + } + + public String toString() { + return String.format("Validated %d sites covered by %d bases%n", nLoci, nBases); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java new file mode 100644 index 000000000..8e99c1828 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java @@ -0,0 +1,111 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; +import org.broadinstitute.sting.gatk.walkers.RefWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; + +/** + * A walker that simply throws errors. 
Allows us to test that the engine is behaving as expected with error handling + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_TOY, extraDocs = {CommandLineGATK.class} ) +public class ErrorThrowing extends RefWalker implements TreeReducible, NanoSchedulable { + @Input(fullName="exception", shortName = "E", doc="Java class of exception to throw", required=true) + public String exceptionToThrow; + + @Argument(fullName = "failMethod", shortName = "fail", doc = "Determines which method to fail in", required = false) + public FailMethod failMethod = FailMethod.MAP; + + public enum FailMethod { + MAP, + REDUCE, + TREE_REDUCE + } + + // + // Template code to allow us to build the walker, doesn't actually do anything + // + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( ref == null ) // only throw exception when we are in proper map, not special map(null) call + return null; + + if ( failMethod == FailMethod.MAP ) + fail(); + + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + if ( value != null && failMethod == FailMethod.REDUCE ) + fail(); + return sum; + } + + public Integer treeReduce(final Integer lhs, final Integer rhs) { + if ( failMethod == FailMethod.TREE_REDUCE ) + fail(); + return rhs; + } + + private void fail() { + if ( exceptionToThrow.equals("UserException") ) { + throw new UserException("UserException"); + } else if ( exceptionToThrow.equals("NullPointerException") ) { + throw new NullPointerException(); + } else if ( exceptionToThrow.equals("ReviewedStingException") ) { + throw new ReviewedStingException("ReviewedStingException"); + } else if ( exceptionToThrow.equals("SamError1") ) { + throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_1); + } else if ( exceptionToThrow.equals("SamError2") ) { + throw new 
RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_2); + } else if ( exceptionToThrow.equals("NoSpace1") ) { + throw new net.sf.samtools.util.RuntimeIOException(new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); + } else if ( exceptionToThrow.equals("NoSpace2") ) { + throw new net.sf.samtools.SAMException("Exception writing BAM index file", new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); + } else { + throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/FlagStat.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/FlagStat.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/FlagStat.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/FlagStat.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java new file mode 100644 index 000000000..48e21fdd0 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java @@ -0,0 +1,217 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies 
or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Emulates the samtools pileup command to print aligned reads + * + *

Prints the alignment in something similar to the Samtools pileup format (see the + * Pileup format documentation for more details about + * the original format). There is one line per genomic position, listing the chromosome name, coordinate, reference + * base, read bases, and read qualities. In addition to these default fields, additional information can be added to + * the output as extra columns; see options detailed below.

+ * + *

Emulated command:

+ *
+ *  samtools pileup -f in.ref.fasta -l in.site_list input.bam
+ * 
+ + * + *

Input

+ *

+ * A BAM file and the interval to print. + *

+ * + *

Output

+ *

+ * Alignment of reads formatted in the Pileup style. + *

+ * + *

Example

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -T Pileup \
+ *   -R exampleFASTA.fasta \
+ *   -I exampleBAM.bam \
+ *   -L chr1:257-267 \
+ *   -o output.txt
+ * 
+ *

Expected output

+ *
+ *     chr1 257 A CAA '&=
+ *     chr1 258 C TCC A:=
+ *     chr1 259 C CCC )A=
+ *     chr1 260 C ACC (=<
+ *     chr1 261 T TCT '44
+ *     chr1 262 A AAA '?:
+ *     chr1 263 A AGA 1'6
+ *     chr1 264 C TCC 987
+ *     chr1 265 C CCC (@(
+ *     chr1 266 C GCC ''=
+ *     chr1 267 T AAT 7%>
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) +public class Pileup extends LocusWalker implements TreeReducible, NanoSchedulable { + + private static final String verboseDelimiter = "@"; // it's ugly to use "@" but it's literally the only usable character not allowed in read names + + @Output + PrintStream out; + + /** + * In addition to the standard pileup output, adds 'verbose' output too. The verbose output contains the number of spanning deletions, + * and for each read in the pileup it has the read name, offset in the base string, read length, and read mapping quality. These per + * read items are delimited with an '@' character. + */ + @Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output", required=false) + public boolean SHOW_VERBOSE = false; + /** + * This enables annotating the pileup to show overlaps with metadata from a ROD file. + * For example, if you provide a VCF and there is a SNP at a given location covered by the pileup, the pileup + * output at that position will be annotated with the corresponding source ROD identifier. + */ + @Input(fullName="metadata",shortName="metadata",doc="ROD file containing metadata", required=false) + public List> rods = Collections.emptyList(); + /** + * Adds the length of the insert each base comes from to the output pileup. Here, "insert" refers to the DNA insert + * produced during library generation before sequencing. 
+ */ + @Hidden + @Argument(fullName="outputInsertLength",shortName = "outputInsertLength",doc="Output insert length",required=false) + public boolean outputInsertLength=false; + + @Override + public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + final String rods = getReferenceOrderedData( tracker ); + + ReadBackedPileup basePileup = context.getBasePileup(); + + final StringBuilder s = new StringBuilder(); + s.append(String.format("%s %s", basePileup.getPileupString((char)ref.getBase()), rods)); + if ( outputInsertLength ) + s.append(" ").append(insertLengthOutput(basePileup)); + if ( SHOW_VERBOSE ) + s.append(" ").append(createVerboseOutput(basePileup)); + s.append("\n"); + + return s.toString(); + } + + // Given result of map function + @Override + public Integer reduceInit() { return 0; } + + @Override + public Integer reduce(String value, Integer sum) { + out.print(value); + return sum + 1; + } + + @Override + public Integer treeReduce(Integer lhs, Integer rhs) { + return lhs + rhs; + } + + /** + * Get a string representation the reference-ordered data. + * @param tracker Container for the reference-ordered data. + * @return String representation of the reference-ordered data. 
+ */ + private String getReferenceOrderedData( RefMetaDataTracker tracker ) { + ArrayList rodStrings = new ArrayList(); + for ( Feature datum : tracker.getValues(rods) ) { + rodStrings.add(datum.toString()); + } + String rodString = Utils.join(", ", rodStrings); + + if ( !rodString.equals("") ) + rodString = "[ROD: " + rodString + "]"; + + return rodString; + } + private static String insertLengthOutput(final ReadBackedPileup pileup) { + + Integer[] insertSizes=new Integer[pileup.depthOfCoverage()]; + + int i=0; + for ( PileupElement p : pileup ) { + insertSizes[i]=p.getRead().getInferredInsertSize(); + ++i; + } + return Utils.join(",",insertSizes); + } + + + private static String createVerboseOutput(final ReadBackedPileup pileup) { + final StringBuilder sb = new StringBuilder(); + boolean isFirst = true; + + sb.append(pileup.getNumberOfDeletions()); + sb.append(" "); + + for ( PileupElement p : pileup ) { + if ( isFirst ) + isFirst = false; + else + sb.append(","); + sb.append(p.getRead().getReadName()); + sb.append(verboseDelimiter); + sb.append(p.getOffset()); + sb.append(verboseDelimiter); + sb.append(p.getRead().getReadLength()); + sb.append(verboseDelimiter); + sb.append(p.getRead().getMappingQuality()); + } + return sb.toString(); + } + + @Override + public void onTraversalDone(Integer result) { + out.println("[REDUCE RESULT] Traversal result is: " + result); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/PrintRODs.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/PrintRODs.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/PrintRODs.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/PrintRODs.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRef.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/QCRef.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRef.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/QCRef.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java new file mode 100644 index 000000000..59b95f2ba --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java @@ -0,0 +1,157 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import net.sf.samtools.CigarElement; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.commandline.Advanced; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.io.PrintStream; +import java.util.Arrays; + +/** + * Read clipping statistics for all reads. + * + * Walks over the input reads, printing out statistics about the read length, number of clipping events, and length + * of the clipping to the output stream. + * + * Note: Ignores N's in the Cigar string. + * + *

Input

+ * One or more BAM files + * + *

Output

A simple tabulated text file with read length and clipping statistics for every read (or every N reads if the "skip" + * option is used) + * + * User: depristo + * Date: May 5, 2010 + * Time: 12:16:41 PM + */ + +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) +@Requires({DataSource.READS}) +public class ReadClippingStats extends ReadWalker { + @Output + protected PrintStream out; + + /** + * When this flag is set, statistics will be collected on unmapped reads as well. The default behavior + * is to ignore unmapped reads. + */ + @Argument(fullName="include_unmapped", shortName="u", doc="Include unmapped reads in the analysis", required=false) + protected boolean INCLUDE_UNMAPPED = false; + + /** + * print every read whose read number is divisible by SKIP. READ_NUMBER % SKIP == 0. First read in the file has read number = 1, + * second is 2, third is 3, ... A value of 1 means print every read. A value of 1000 means print every 1000th read. + */ + @Advanced + @Argument(fullName="skip", shortName="skip", doc="Do not print all reads, skip some.", required=false) + protected int SKIP = 1; + + public class ReadClippingInfo { + SAMReadGroupRecord rg; + int readLength, nClippingEvents, nClippedBases; + } + + public ReadClippingInfo map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { + if ( AlignmentUtils.isReadUnmapped(read) && !INCLUDE_UNMAPPED) + return null; + + ReadClippingInfo info = new ReadClippingInfo(); + info.rg = read.getReadGroup(); + + if ( info.rg == null ) throw new UserException.ReadMissingReadGroup(read); + + for ( CigarElement elt : read.getCigar().getCigarElements() ) { + switch ( elt.getOperator()) { + case H : // ignore hard clips + case S : // soft clip + info.nClippingEvents++; + info.nClippedBases += elt.getLength(); + break; + case M : + case D : // deletion w.r.t. the reference + case P : // ignore pads + case I : // insertion w.r.t. 
the reference + case N : // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + break; + default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + elt.getOperator()); + } + info.readLength = read.getReadLength(); + } + + return info; //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * Provide an initial value for reduce computations. + * + * @return Initial value of reduce. + */ + public Integer reduceInit() { + out.println(Utils.join(" \t", Arrays.asList("ReadGroup", "ReadLength", "NClippingEvents", "NClippedBases", "PercentClipped"))); + return 0; + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * + * @param info result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + public Integer reduce(ReadClippingInfo info, Integer sum) { + if ( info != null ) { + if ( sum % SKIP == 0 ) { + String id = info.rg.getReadGroupId(); + out.printf("%s\t %d\t %d\t %d\t %.2f%n", + id, info.readLength, info.nClippingEvents, info.nClippedBases, + 100.0 * MathUtils.ratio(info.nClippedBases, info.readLength)); + } + return sum + 1; + } else { + return sum; + } + } + +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/SplitSamFile.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/readutils/SplitSamFile.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/SplitSamFile.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/readutils/SplitSamFile.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PrintMissingComp.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PrintMissingComp.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PrintMissingComp.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PrintMissingComp.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/StandardEval.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/StandardEval.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/StandardEval.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/StandardEval.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IndelSize.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IndelSize.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IndelSize.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IndelSize.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/RequiredStratification.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/RequiredStratification.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/RequiredStratification.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/RequiredStratification.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/SnpEffPositionModifier.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/SnpEffPositionModifier.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/SnpEffPositionModifier.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/SnpEffPositionModifier.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/StandardStratification.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/StandardStratification.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/StandardStratification.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/StandardStratification.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantType.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNode.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNode.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNode.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNode.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNodeIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNodeIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNodeIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNodeIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/Stratifier.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/Stratifier.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/Stratifier.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/Stratifier.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/Analysis.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/Analysis.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/Analysis.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/Analysis.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/Molten.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/Molten.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/Molten.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/Molten.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/SortableJexlVCMatchExp.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/SortableJexlVCMatchExp.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/SortableJexlVCMatchExp.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/SortableJexlVCMatchExp.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java new file mode 100644 index 000000000..152128022 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -0,0 +1,357 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.gatk.walkers.annotator.ChromosomeCountConstants; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.vcf.*; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.variant.variantcontext.VariantContextUtils; +import org.broadinstitute.variant.variantcontext.writer.Options; +import 
org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; + +import java.util.*; + +/** + * Combines VCF records from different sources. + * + *

+ * CombineVariants combines VCF records from different sources. Any (unique) name can be used to bind your rod data + * and any number of sources can be input. This tool currently supports two different combination types for each of + * variants (the first 8 fields of the VCF) and genotypes (the rest). + * Merge: combines multiple records into a single one; if sample names overlap then they are uniquified. + * Union: assumes each rod represents the same set of samples (although this is not enforced); using the + * priority list (if provided), it emits a single record instance at every position represented in the rods. + * + * CombineVariants will include a record at every site in all of your input VCF files, and annotate which input ROD + * bindings the record is present, pass, or filtered in in the set attribute in the INFO field. In effect, + * CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the N merged VCFs + * can be exacted using JEXL expressions on the set attribute using SelectVariants. If you want to extract just + * the records in common between two VCFs, you would first run CombineVariants on the two files to generate a single + * VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out + * in the detailed example in the documentation guide. + * + * Note that CombineVariants supports multi-threaded parallelism (8/15/12). This is particularly useful + * when converting from VCF to BCF2, which can be expensive. In this case each thread spends CPU time + * doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together + * efficiency. However, since this merge runs in only one thread, you can quickly reach diminishing + * returns with the number of parallel threads. -nt 4 works well but -nt 8 may be too much. + * + * Some fine details about the merging algorithm: + *

    + *
  • As of GATK 2.1, when merging multiple VCF records at a site, the combined VCF record has the QUAL of + * the first VCF record with a non-MISSING QUAL value. The previous behavior was to take the + * max QUAL, which resulted in sometime strange downstream confusion
  • + *
+ * + *

Input

+ *

+ * One or more variant sets to combine. + *

+ * + *

Output

+ *

+ * A combined VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CombineVariants \
+ *   --variant input1.vcf \
+ *   --variant input2.vcf \
+ *   -o output.vcf \
+ *   -genotypeMergeOptions UNIQUIFY
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CombineVariants \
+ *   --variant:foo input1.vcf \
+ *   --variant:bar input2.vcf \
+ *   -o output.vcf \
+ *   -genotypeMergeOptions PRIORITIZE
+ *   -priority foo,bar
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=-50,stop=50)) +public class CombineVariants extends RodWalker implements TreeReducible { + /** + * The VCF files to merge together + * + * variants can take any number of arguments on the command line. Each -V argument + * will be included in the final merged output VCF. If no explicit name is provided, + * the -V arguments will be named using the default algorithm: variants, variants2, variants3, etc. + * The user can override this by providing an explicit name -V:name,vcf for each -V argument, + * and each named argument will be labeled as such in the output (i.e., set=name rather than + * set=variants2). The order of arguments does not matter unless except for the naming, so + * if you provide an rod priority list and no explicit names than variants, variants2, etc + * are technically order dependent. It is strongly recommended to provide explicit names when + * a rod priority list is provided. 
+ */ + @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) + public List> variantCollections; + final private List> variants = new ArrayList<>(); + + @Output(doc="File to which variants should be written") + protected VariantContextWriter vcfWriter = null; + + @Argument(shortName="genotypeMergeOptions", doc="Determines how we should merge genotype records for samples shared across the ROD files", required=false) + public GATKVariantContextUtils.GenotypeMergeType genotypeMergeOption = null; + + @Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false) + public GATKVariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED; + + @Hidden + @Argument(shortName="multipleAllelesMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel)", required=false) + public GATKVariantContextUtils.MultipleAllelesMergeType multipleAllelesMergeType = GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE; + + /** + * Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided. 
+ */ + @Argument(fullName="rod_priority_list", shortName="priority", doc="A comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted", required=false) + public String PRIORITY_STRING = null; + + @Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false) + public boolean printComplexMerges = false; + + @Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF", required=false) + public boolean filteredAreUncalled = false; + + /** + * Used to generate a sites-only file. + */ + @Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype FORMAT fields", required=false) + public boolean minimalVCF = false; + + @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the combining procedure", required=false) + public boolean EXCLUDE_NON_VARIANTS = false; + + /** + * Set to 'null' if you don't want the set field emitted. + */ + @Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false) + public String SET_KEY = "set"; + + /** + * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime. 
+ */ + @Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls", required=false) + public boolean ASSUME_IDENTICAL_SAMPLES = false; + + @Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if the variant is present in at least N input files.", required=false) + public int minimumN = 1; + + /** + * This option allows the suppression of the command line in the VCF header. This is most often usefully when combining variants for dozens or hundreds of smaller VCFs. + */ + @Argument(fullName="suppressCommandLineHeader", shortName="suppressCommandLineHeader", doc="If true, do not output the header containing the command line used", required=false) + public boolean SUPPRESS_COMMAND_LINE_HEADER = false; + + @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false) + public boolean MERGE_INFO_WITH_MAX_AC = false; + + private List priority = null; + + /** Optimization to strip out genotypes before merging if we are doing a sites_only output */ + private boolean sitesOnlyVCF = false; + private Set samples; + + public void initialize() { + Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); + + if ( vcfWriter instanceof VariantContextWriterStub) { + sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES); + if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance"); + } else + logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option"); + + validateAnnotateUnionArguments(); + if ( PRIORITY_STRING == null && genotypeMergeOption == null) { + genotypeMergeOption = 
GATKVariantContextUtils.GenotypeMergeType.UNSORTED; + //PRIORITY_STRING = Utils.join(",", vcfRods.keySet()); Deleted by Ami (7/10/12) + logger.info("Priority string is not provided, using arbitrary genotyping order: "+priority); + } + + if (genotypeMergeOption == GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE && + !SampleUtils.verifyUniqueSamplesNames(vcfRods)) + throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered."); + + samples = sitesOnlyVCF ? Collections.emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption); + + if ( SET_KEY.toLowerCase().equals("null") ) + SET_KEY = null; + + Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); + if ( SET_KEY != null ) + headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants")); + if ( !ASSUME_IDENTICAL_SAMPLES ) + headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions)); + VCFHeader vcfHeader = new VCFHeader(headerLines, samples); + vcfHeader.setWriteCommandLine(!SUPPRESS_COMMAND_LINE_HEADER); + vcfWriter.writeHeader(vcfHeader); + + // collect the actual rod bindings into a list for use later + for ( final RodBindingCollection variantCollection : variantCollections ) + variants.addAll(variantCollection.getRodBindings()); + } + + private void validateAnnotateUnionArguments() { + Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); + + if ( genotypeMergeOption == GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null ) + throw new UserException.MissingArgument("rod_priority_list", "Priority string must be provided if you want to prioritize genotypes"); + + if ( PRIORITY_STRING != null){ + priority = new ArrayList<>(Arrays.asList(PRIORITY_STRING.split(","))); + if ( rodNames.size() != priority.size() ) + throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain 
exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority); + + if ( ! rodNames.containsAll(priority) ) + throw new UserException.BadArgumentValue("rod_priority_list", "Not all priority elements provided as input RODs: " + PRIORITY_STRING); + } + + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) // RodWalkers can make funky map calls + return 0; + + final Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); + // get all of the vcf rods at this locus + // Need to provide reference bases to simpleMerge starting at current locus + Collection vcs = tracker.getValues(variants, context.getLocation()); + + if ( sitesOnlyVCF ) { + vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs); + } + + if ( ASSUME_IDENTICAL_SAMPLES ) { + for ( final VariantContext vc : vcs ) { + vcfWriter.add(vc); + } + + return vcs.isEmpty() ? 0 : 1; + } + + int numFilteredRecords = 0; + for (final VariantContext vc : vcs) { + if (vc.filtersWereApplied() && vc.isFiltered()) + numFilteredRecords++; + } + + if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN)) + return 0; + + final List mergedVCs = new ArrayList<>(); + + if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE) { + final Map> VCsByType = GATKVariantContextUtils.separateVariantContextsByType(vcs); + + // TODO -- clean this up in a refactoring + // merge NO_VARIATION into another type of variant (based on the ordering in VariantContext.Type) + if ( VCsByType.containsKey(VariantContext.Type.NO_VARIATION) && VCsByType.size() > 1 ) { + final List refs = VCsByType.remove(VariantContext.Type.NO_VARIATION); + for ( final VariantContext.Type type : VariantContext.Type.values() ) { + if ( VCsByType.containsKey(type) ) { + VCsByType.get(type).addAll(refs); + break; + } + } + } + + // iterate over the types so that it's deterministic + for (final 
VariantContext.Type type : VariantContext.Type.values()) { + // make sure that it is a variant or in case it is not, that we want to include the sites with no variants + if (!EXCLUDE_NON_VARIANTS || !type.equals(VariantContext.Type.NO_VARIATION)) { + if (VCsByType.containsKey(type)) { + mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), priority, rodNames.size(), + filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + } + } + } + } + else if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) { + mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, priority, rodNames.size(), filteredRecordsMergeType, + genotypeMergeOption, true, printComplexMerges, SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + } + else { + logger.warn("Ignoring all records at site " + ref.getLocus()); + } + + for ( final VariantContext mergedVC : mergedVCs ) { + // only operate at the start of events + if ( mergedVC == null ) + continue; + + final VariantContextBuilder builder = new VariantContextBuilder(mergedVC); + // re-compute chromosome counts + VariantContextUtils.calculateChromosomeCounts(builder, false); + + if ( minimalVCF ) + GATKVariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY)); + final VariantContext vc = builder.make(); + if( !EXCLUDE_NON_VARIANTS || vc.isPolymorphicInSamples() ) + vcfWriter.add(builder.make()); + } + + return vcs.isEmpty() ? 
0 : 1; + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer counter, Integer sum) { + return counter + sum; + } + + @Override + public Integer treeReduce(Integer lhs, Integer rhs) { + return reduce(lhs, rhs); + } + + public void onTraversalDone(Integer sum) {} +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java new file mode 100755 index 000000000..395e24604 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java @@ -0,0 +1,366 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFHeader; + +import java.util.*; + +/** + * A class for tabulating and evaluating a callset-by-callset genotype concordance table + * */ +public class ConcordanceMetrics { + + private Map perSampleGenotypeConcordance; + private GenotypeConcordanceTable overallGenotypeConcordance; + private SiteConcordanceTable overallSiteConcordance; + private boolean printInterestingSites; + + public ConcordanceMetrics(VCFHeader evaluate, VCFHeader truth, boolean printSitesEnabled) { + HashSet overlappingSamples = new HashSet(evaluate.getGenotypeSamples()); + overlappingSamples.retainAll(truth.getGenotypeSamples()); + perSampleGenotypeConcordance = new HashMap(overlappingSamples.size()); + for ( String sample : overlappingSamples ) { + perSampleGenotypeConcordance.put(sample,new GenotypeConcordanceTable()); + } + overallGenotypeConcordance = new GenotypeConcordanceTable(); + overallSiteConcordance = new SiteConcordanceTable(); + printInterestingSites = printSitesEnabled; + } + + public GenotypeConcordanceTable getOverallGenotypeConcordance() { + return overallGenotypeConcordance; + } + + public SiteConcordanceTable getOverallSiteConcordance() { + return overallSiteConcordance; + } + + public GenotypeConcordanceTable getGenotypeConcordance(String sample) { + GenotypeConcordanceTable table = perSampleGenotypeConcordance.get(sample); + if ( table == null ) + throw new ReviewedStingException("Attempted to request the concordance table for sample "+sample+" on which it was not calculated"); + return table; + } + + public Map getPerSampleGenotypeConcordance() { + return 
Collections.unmodifiableMap(perSampleGenotypeConcordance); + } + + public Map getPerSampleNRD() { + Map nrd = new HashMap(perSampleGenotypeConcordance.size()); + for ( Map.Entry sampleTable : perSampleGenotypeConcordance.entrySet() ) { + nrd.put(sampleTable.getKey(),calculateNRD(sampleTable.getValue())); + } + + return Collections.unmodifiableMap(nrd); + } + + public Map getPerSampleOGC() { + Map ogc = new HashMap(perSampleGenotypeConcordance.size()); + for ( Map.Entry sampleTable : perSampleGenotypeConcordance.entrySet() ) { + ogc.put(sampleTable.getKey(),calculateOGC(sampleTable.getValue())); + } + + return Collections.unmodifiableMap(ogc); + } + + public Double getOverallNRD() { + return calculateNRD(overallGenotypeConcordance); + } + + public Double getOverallOGC() { + return calculateOGC(overallGenotypeConcordance); + } + + public Map getPerSampleNRS() { + Map nrs = new HashMap(perSampleGenotypeConcordance.size()); + for ( Map.Entry sampleTable : perSampleGenotypeConcordance.entrySet() ) { + nrs.put(sampleTable.getKey(),calculateNRS(sampleTable.getValue())); + } + + return Collections.unmodifiableMap(nrs); + } + + public Double getOverallNRS() { + return calculateNRS(overallGenotypeConcordance); + } + + @Requires({"eval != null","truth != null"}) + public void update(VariantContext eval, VariantContext truth) { + boolean doPrint = false; + overallSiteConcordance.update(eval,truth); + Set alleleTruth = new HashSet(8); + String truthRef = truth.getReference().getBaseString(); + alleleTruth.add(truthRef); + for ( Allele a : truth.getAlternateAlleles() ) { + alleleTruth.add(a.getBaseString()); + } + for ( String sample : perSampleGenotypeConcordance.keySet() ) { + Genotype evalGenotype = eval.getGenotype(sample); + Genotype truthGenotype = truth.getGenotype(sample); + // ensure genotypes are either no-call ("."), missing (empty alleles), or diploid + if ( ( ! evalGenotype.isNoCall() && evalGenotype.getPloidy() != 2 && evalGenotype.getPloidy() > 0) || + ( ! 
truthGenotype.isNoCall() && truthGenotype.getPloidy() != 2 && truthGenotype.getPloidy() > 0) ) { + throw new UserException(String.format("Concordance Metrics is currently only implemented for DIPLOID genotypes, found eval ploidy: %d, comp ploidy: %d",evalGenotype.getPloidy(),truthGenotype.getPloidy())); + } + perSampleGenotypeConcordance.get(sample).update(evalGenotype,truthGenotype,alleleTruth,truthRef); + doPrint = overallGenotypeConcordance.update(evalGenotype,truthGenotype,alleleTruth,truthRef); + if(printInterestingSites && doPrint) + System.out.println(eval.getChr() + ":" + eval.getStart() + "\t truth is:" + truthGenotype.getType() + "\t eval is:" + evalGenotype.getType()); + + //Below is code to print out mismatched alternate alleles + //System.out.println(eval.getChr() + ":" + eval.getStart() + "\t truth is:" + truthGenotype.getAlleles() + "\t eval is:" + evalGenotype.getAlleles()); + } + } + + private static double calculateNRD(GenotypeConcordanceTable table) { + return calculateNRD(table.getTable()); + } + + private static double calculateNRD(int[][] concordanceCounts) { + int correct = 0; + int total = 0; + correct += concordanceCounts[GenotypeType.HET.ordinal()][GenotypeType.HET.ordinal()]; + correct += concordanceCounts[GenotypeType.HOM_VAR.ordinal()][GenotypeType.HOM_VAR.ordinal()]; + total += correct; + total += concordanceCounts[GenotypeType.HOM_REF.ordinal()][GenotypeType.HET.ordinal()]; + total += concordanceCounts[GenotypeType.HOM_REF.ordinal()][GenotypeType.HOM_VAR.ordinal()]; + total += concordanceCounts[GenotypeType.HET.ordinal()][GenotypeType.HOM_REF.ordinal()]; + total += concordanceCounts[GenotypeType.HET.ordinal()][GenotypeType.HOM_VAR.ordinal()]; + total += concordanceCounts[GenotypeType.HOM_VAR.ordinal()][GenotypeType.HOM_REF.ordinal()]; + total += concordanceCounts[GenotypeType.HOM_VAR.ordinal()][GenotypeType.HET.ordinal()]; + // NRD is by definition incorrect/total = 1.0-correct/total + // note: if there are no observations (so the
ratio is NaN), set this to 100% + return total == 0 ? 1.0 : 1.0 - ( (double) correct)/( (double) total); + } + + private static double calculateOGC(int[][] concordanceCounts) { + int correct = 0; + int total = 0; + correct += concordanceCounts[GenotypeType.HOM_REF.ordinal()][GenotypeType.HOM_REF.ordinal()]; + correct += concordanceCounts[GenotypeType.HET.ordinal()][GenotypeType.HET.ordinal()]; + correct += concordanceCounts[GenotypeType.HOM_VAR.ordinal()][GenotypeType.HOM_VAR.ordinal()]; + total += correct; + total += concordanceCounts[GenotypeType.HOM_REF.ordinal()][GenotypeType.HET.ordinal()]; + total += concordanceCounts[GenotypeType.HOM_REF.ordinal()][GenotypeType.HOM_VAR.ordinal()]; + total += concordanceCounts[GenotypeType.HET.ordinal()][GenotypeType.HOM_REF.ordinal()]; + total += concordanceCounts[GenotypeType.HET.ordinal()][GenotypeType.HOM_VAR.ordinal()]; + total += concordanceCounts[GenotypeType.HOM_VAR.ordinal()][GenotypeType.HOM_REF.ordinal()]; + total += concordanceCounts[GenotypeType.HOM_VAR.ordinal()][GenotypeType.HET.ordinal()]; + // OGC is by definition correct/total + // note: if there are no observations (so the ratio is NaN), set this to 100% + return total == 0 ? 
1.0 : ( (double) correct)/( (double) total); + } + + private static double calculateNRS(GenotypeConcordanceTable table) { + return calculateNRS(table.getTable()); + } + + private static double calculateOGC(GenotypeConcordanceTable table) { + return calculateOGC(table.getTable()); + } + + private static double calculateNRS(int[][] concordanceCounts) { + long confirmedVariant = 0; + long unconfirmedVariant = 0; + for ( GenotypeType truthState : Arrays.asList(GenotypeType.HET,GenotypeType.HOM_VAR) ) { + for ( GenotypeType evalState : GenotypeType.values() ) { + if ( evalState == GenotypeType.MIXED ) + continue; + if ( evalState.equals(GenotypeType.HET) || evalState.equals(GenotypeType.HOM_VAR) ) + confirmedVariant += concordanceCounts[evalState.ordinal()][truthState.ordinal()]; + else + unconfirmedVariant += concordanceCounts[evalState.ordinal()][truthState.ordinal()]; + } + } + + long total = confirmedVariant + unconfirmedVariant; + // note: if there are no observations (so the ratio is NaN) set this to 0% + return total == 0l ? 0.0 : ( (double) confirmedVariant ) / ( (double) ( total ) ); + } + + + class GenotypeConcordanceTable { + + private int[][] genotypeCounts; + private int nMismatchingAlt; + + public GenotypeConcordanceTable() { + genotypeCounts = new int[GenotypeType.values().length][GenotypeType.values().length]; + nMismatchingAlt = 0; + } + + @Requires({"eval!=null","truth != null","truthAlleles != null"}) + public Boolean update(Genotype eval, Genotype truth, Set truthAlleles, String truthRef) { + // this is slow but correct. + + // NOTE: a reference call in "truth" is a special case, the eval can match *any* of the truth alleles + // that is, if the reference base is C, and a sample is C/C in truth, A/C, A/A, T/C, T/T will + // all match, so long as A and T are alleles in the truth callset. 
+ boolean matchingAlt = true; + int evalGT, truthGT; + if ( eval.isCalled() && truth.isCalled() && truth.isHomRef() ) { + // by default, no-calls "match" between alleles, so if + // one or both sites are no-call or unavailable, the alt alleles match + // otherwise, check explicitly: if the eval has an allele that's not ref, no-call, or present in truth + // the alt allele is mismatching - regardless of whether the genotype is correct. + for ( Allele evalAllele : eval.getAlleles() ) { + matchingAlt &= truthAlleles.contains(evalAllele.getBaseString()); + } + } else if ( eval.isCalled() && truth.isCalled() ) { + // otherwise, the eval genotype has to match either the alleles in the truth genotype, or the truth reference allele + // todo -- this can be sped up by caching the truth allele sets + Set genoAlleles = new HashSet(3); + genoAlleles.add(truthRef); + for ( Allele truthGenoAl : truth.getAlleles() ) { + genoAlleles.add(truthGenoAl.getBaseString()); + } + for ( Allele evalAllele : eval.getAlleles() ) { + matchingAlt &= genoAlleles.contains(evalAllele.getBaseString()); + } + } + + if ( matchingAlt ) { + evalGT = eval.getType().ordinal(); + truthGT = truth.getType().ordinal(); + genotypeCounts[evalGT][truthGT]++; + if(evalGT != truthGT) //report variants where genotypes don't match + return true; + } else { + nMismatchingAlt++; + return false; + //return true; //alternatively, report variants where alt alleles don't match + } + return false; + } + + public int[][] getTable() { + return genotypeCounts; + } + + public int getnMismatchingAlt() { + return nMismatchingAlt; + } + + public int getnEvalGenotypes(GenotypeType type) { + int nGeno = 0; + for ( GenotypeType comptype : GenotypeType.values() ) + nGeno += genotypeCounts[type.ordinal()][comptype.ordinal()]; + return nGeno; + } + + public int getnCalledEvalGenotypes() { + int nGeno = 0; + for ( GenotypeType evalType : Arrays.asList(GenotypeType.HOM_REF,GenotypeType.HOM_VAR,GenotypeType.HET) ) { + nGeno += 
getnEvalGenotypes(evalType); + } + + return nGeno + nMismatchingAlt; + } + + public int getnCompGenotypes(GenotypeType type) { + int nGeno = 0; + for ( GenotypeType evaltype : GenotypeType.values() ) + nGeno += genotypeCounts[evaltype.ordinal()][type.ordinal()]; + return nGeno; + } + + public int getnCalledCompGenotypes() { + int nGeno = 0; + for ( GenotypeType compType : Arrays.asList(GenotypeType.HOM_REF,GenotypeType.HOM_VAR,GenotypeType.HET) ) { + nGeno += getnCompGenotypes(compType); + } + return nGeno; + } + + public int get(GenotypeType evalType, GenotypeType compType) { + return genotypeCounts[evalType.ordinal()][compType.ordinal()]; + } + } + + class SiteConcordanceTable { + + private int[] siteConcordance; + + public SiteConcordanceTable() { + siteConcordance = new int[SiteConcordanceType.values().length]; + } + + public void update(VariantContext evalVC, VariantContext truthVC) { + SiteConcordanceType matchType = getMatchType(evalVC,truthVC); + siteConcordance[matchType.ordinal()]++; + } + + @Requires({"evalVC != null","truthVC != null"}) + private SiteConcordanceType getMatchType(VariantContext evalVC, VariantContext truthVC) { + return SiteConcordanceType.getConcordanceType(evalVC,truthVC); + } + + public int[] getSiteConcordance() { + return siteConcordance; + } + + public int get(SiteConcordanceType type) { + return getSiteConcordance()[type.ordinal()]; + } + } + + enum SiteConcordanceType { + ALLELES_MATCH, + EVAL_SUPERSET_TRUTH, + EVAL_SUBSET_TRUTH, + ALLELES_DO_NOT_MATCH, + EVAL_ONLY, + TRUTH_ONLY; + + public static SiteConcordanceType getConcordanceType(VariantContext eval, VariantContext truth) { + if ( eval.isMonomorphicInSamples() ) + return TRUTH_ONLY; + if ( truth.isMonomorphicInSamples() ) + return EVAL_ONLY; + + boolean evalSubsetTruth = GATKVariantContextUtils.allelesAreSubset(eval, truth); + boolean truthSubsetEval = GATKVariantContextUtils.allelesAreSubset(truth, eval); + + if ( evalSubsetTruth && truthSubsetEval ) + return 
ALLELES_MATCH; + + if ( evalSubsetTruth ) + return EVAL_SUBSET_TRUTH; + + if ( truthSubsetEval ) + return EVAL_SUPERSET_TRUTH; + + return ALLELES_DO_NOT_MATCH; + } + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java new file mode 100644 index 000000000..2b18eda20 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java @@ -0,0 +1,134 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.variant.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.*; + +/** + * Filters a lifted-over VCF file for ref bases that have been changed. + * + * "Lifting over" variants means adjusting variant calls from one reference to another. Specifically, the process adjusts the position of the call to match the corresponding position on the target reference. + * For example, if you have variants called from reads aligned to the hg19 reference, and you want to compare them to calls made based on the b37 reference, you need to liftover one of the callsets to the other reference. + * + * FilterLiftedVariants is intended to be the second of two processing steps for the liftover process. The first step is to run LiftoverVariants on your VCF file. + * The second step is to run FilterLiftedVariants on the output of LiftoverVariants. This will produce valid well-behaved VCF files, where you'll see that the contig names in the header have all been correctly replaced.
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=0,stop=100)) +public class FilterLiftedVariants extends RodWalker { + + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + private static final int MAX_VARIANT_SIZE = 100; + + @Output(doc="File to which variants should be written") + protected VariantContextWriter writer = null; + + private long failedLocs = 0, totalLocs = 0; + + public void initialize() { + String trackName = variantCollection.variants.getName(); + Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); + Map vcfHeaders = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); + + final VCFHeader vcfHeader = new VCFHeader(vcfHeaders.containsKey(trackName) ? vcfHeaders.get(trackName).getMetaDataInSortedOrder() : Collections.emptySet(), samples); + writer.writeHeader(vcfHeader); + } + + /** + * Determines whether records should be filtered; if not, writes them to the output + * + * @param ref the reference context + * @param vc the VariantContext to process + * @return true if the record is not filtered, false otherwise + */ + protected boolean filterOrWrite(final byte[] ref, final VariantContext vc) { + if ( ref == null ) throw new IllegalArgumentException("Cannot filter based on a null reference array"); + if ( vc == null ) throw new IllegalArgumentException("Cannot filter a null Variant Context"); + + totalLocs++; + + boolean filter = false; + final byte[] recordRef = vc.getReference().getBases(); + + // this can happen for records that get placed at the ends of chromosomes + if ( recordRef.length > ref.length ) { + filter = true; + } else { + for (int i = 0; i < recordRef.length && i < MAX_VARIANT_SIZE; i++) { + if ( recordRef[i] != ref[i] ) { + filter = true; + break; + } + } + } + + if ( 
filter ) + failedLocs++; + else + writer.add(vc); + + return !filter; + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + final Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); + for ( final VariantContext vc : VCs ) + filterOrWrite(ref.getBases(), vc); + + return 0; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { return 0; } + + public void onTraversalDone(Integer result) { + System.out.println("Filtered " + failedLocs + " records out of " + totalLocs + " total records."); + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java new file mode 100755 index 000000000..08c938583 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java @@ -0,0 +1,642 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFHeader; + +import java.io.PrintStream; +import java.util.*; + +/** + * Genotype concordance (per-sample and aggregate counts and frequencies, NRD/NRS and site allele overlaps) between two callsets + * + *

+ * GenotypeConcordance takes in two callsets (vcfs) and tabulates the number of sites which overlap and share alleles, + * and for each sample, the genotype-by-genotype counts (e.g. the number of sites at which a sample was + * called homozygous-reference in the EVAL callset, but homozygous-variant in the COMP callset). It outputs these + * counts as well as convenient proportions (such as the proportion of het calls in the EVAL which were called REF in + * the COMP) and metrics (such as NRD and NRS). + *

+ * + *

Input

+ *

+ * Genotype concordance requires two callsets (as it does a comparison): an EVAL and a COMP callset, specified via + * the -eval and -comp arguments. Typically, the EVAL callset is an experimental set you want to evaluate, while the + * COMP callset is a previously existing set used as a standard for comparison (taken to represent "truth"). + *

+ *

+ * (Optional) Jexl expressions for genotype-level filtering of EVAL or COMP genotypes, specified via the -gfe and + * -cfe arguments, respectively. + *

+ * + *

Output

+ *

+ * Genotype Concordance writes a GATK report to the specified file (via -o), consisting of multiple tables of counts + * and proportions. These tables are constructed on a per-sample basis, and include counts of EVAL vs COMP genotype states, and the + * number of times the alternate alleles between the EVAL and COMP sample did not match up. + *

+ * + *

Term and metrics definitions

+ *

+ *

    + *
  • HET: heterozygous
  • + *
  • HOM_REF: homozygous reference
  • + *
  • HOM_VAR: homozygous variant
  • + *
  • MIXED: something like ./1
  • + *
  • ALLELES_MATCH: counts of calls at the same site where the alleles match
  • + *
  • ALLELES_DO_NOT_MATCH: counts of calls at the same location with different alleles, such as the eval set calling a 'G' alternate allele, and the comp set calling a 'T' alternate allele
  • + *
  • EVAL_ONLY: counts of sites present only in the EVAL set, not in the COMP set
  • + *
  • TRUTH_ONLY: counts of sites present only in the COMP set, not in the EVAL set
  • + *
  • Non-Reference_Discrepancy (NRD): genotype concordance excluding concordant reference sites
  • + *
  • Non-Reference_Sensitivity (NRS): sensitivity of the EVAL calls to polymorphic calls in the COMP set, calculated by (# true positive)/(# true polymorphic)
  • + *
  • Overall_Genotype_Concordance: overall concordance calculated by (# concordant genotypes)/(# genotypes)
  • + *
+ *

+ * + *

Moltenized tables

+ * + *

These tables may be optionally moltenized via the -moltenize argument. That is, the standard table + * + *

+ *  Sample   NO_CALL_HOM_REF  NO_CALL_HET  NO_CALL_HOM_VAR   (...)
+ *  NA12878       0.003        0.001            0.000        (...)
+ *  NA12891       0.005        0.000            0.000        (...)
+ *  
+ * + * would instead be displayed + * + *
+ *  NA12878  NO_CALL_HOM_REF   0.003
+ *  NA12878  NO_CALL_HET       0.001
+ *  NA12878  NO_CALL_HOM_VAR   0.000
+ *  NA12891  NO_CALL_HOM_REF   0.005
+ *  NA12891  NO_CALL_HET       0.000
+ *  NA12891  NO_CALL_HOM_VAR   0.000
+ *  (...)
+ *  
+ * + *

Site-level allelic concordance

+ * + *

+ * For strictly bi-allelic VCFs, only the ALLELES_MATCH, EVAL_ONLY, TRUTH_ONLY fields will be populated, + * but where multi-allelic sites are involved counts for EVAL_SUBSET_TRUTH and EVAL_SUPERSET_TRUTH will be generated. + *

+ *

+ * For example, in the following situation + *

+ *    eval:  ref - A   alt - C
+ *    comp:  ref - A   alt - C,T
+ *  
+ * then the site is tabulated as EVAL_SUBSET_TRUTH. Were the situation reversed, it would be EVAL_SUPERSET_TRUTH. + * However, in the case where EVAL has both C and T alternate alleles, both must be observed in the genotypes + * (that is, there must be at least one of (0/1,1/1) and at least one of (0/2,1/2,2/2) in the genotype field). If + * one of the alleles has no observations in the genotype fields of the EVAL, the site-level concordance is + * tabulated as though that allele were not present in the record. + *

+ * + *

Monomorphic Records

+ *

+ * A site which has an alternate allele, but which is monomorphic in samples, is treated as not having been + * discovered, and will be recorded in the TRUTH_ONLY column (if a record exists in the COMP set), or not at all + * (if no record exists in the COMP set). + *

+ *

+ * That is, in the situation + *

+ *   eval:  ref - A   alt - C   genotypes - 0/0  0/0  0/0 ... 0/0
+ *   comp:  ref - A   alt - C   ...         0/0  0/0  ...
+ *  
+ * is equivalent to + *
+ *   eval:  ref - A   alt - .   genotypes - 0/0  0/0  0/0 ... 0/0
+ *   comp:  ref - A   alt - C   ...         0/0  0/0  ...
+ *  
+ *

+ *

+ * When a record is present in the COMP set the *genotypes* for the monomorphic site will still be used to evaluate + * per-sample genotype concordance counts. + *

+ * + *

Filtered Records

+ * Filtered records are treated as though they were not present in the VCF, unless -ignoreSiteFilters is provided, + * in which case all records are used. There is currently no way to assess concordance metrics on filtered sites + * exclusively. SelectVariants can be used to extract filtered sites, and VariantFiltration used to un-filter them. + * + + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +public class GenotypeConcordance extends RodWalker>,ConcordanceMetrics> { + + /** + * The callset you want to evaluate, typically this is where you'd put 'unassessed' callsets. + */ + @Input(fullName="eval",shortName="eval",doc="The variants and genotypes to evaluate",required=true) + RodBinding evalBinding; + + /** + * The callset you want to treat as 'truth'. Can also be of unknown quality for the sake of callset comparisons. + */ + @Input(fullName="comp",shortName="comp",doc="The variants and genotypes to compare against",required=true) + RodBinding compBinding; + + /** + * The FILTER field of the eval and comp VCFs will be ignored. If this flag is not included, all FILTER sites will + * be treated as not being present in the VCF. (That is, the genotypes will be assigned UNAVAILABLE, as distinct + * from NO_CALL). + */ + @Argument(fullName="ignoreFilters",doc="Filters will be ignored",required=false) + boolean ignoreFilters = false; + + /** + * A genotype level JEXL expression to apply to eval genotypes. Genotypes filtered in this way will be replaced by NO_CALL. + * For instance: -gfe 'GQ<20' will set to no-call any genotype with genotype quality less than 20. + */ + @Argument(shortName="gfe", fullName="genotypeFilterExpressionEval", doc="One or more criteria to use to set EVAL genotypes to no-call. 
"+ + "These genotype-level filters are only applied to the EVAL rod.", required=false) + public ArrayList genotypeFilterExpressionsEval = new ArrayList(); + + /** + * Identical to -gfe except the filter is applied to genotypes in the comp rod. + */ + @Argument(shortName="gfc", fullName="genotypeFilterExpressionComp", doc="One or more criteria to use to set COMP genotypes to no-call. "+ + "These genotype-level filters are only applied to the COMP rod.", required=false) + public ArrayList genotypeFilterExpressionsComp = new ArrayList(); + + /** + * Moltenize the count and proportion tables. Rather than moltenizing per-sample data into a 2x2 table, it is fully + * moltenized into elements. That is, WITHOUT this argument, each row of the table begins with the sample name and + * proceeds directly with counts/proportions of eval/comp counts (for instance HOM_REF/HOM_REF, HOM_REF/NO_CALL). + * + * If the Moltenize argument is given, the output will begin with a sample name, followed by the contrastive genotype + * type (such as HOM_REF/HOM_REF), followed by the count or proportion. This will significantly increase the number of + * rows. + */ + @Argument(shortName="moltenize",fullName="moltenize",doc="Molten rather than tabular output") + public boolean moltenize = false; + + /** + * Print sites where genotypes are mismatched between callsets along with annotations giving the genotype of each callset + * Outputs directly to System.out. Super classy. + * + * NOTE: doesn't currently differentiate between samples, so there may be repeats + */ + @Hidden + @Argument(shortName="sites", fullName = "printInterestingSites", required=false) + protected boolean printSites = false; + + @Output + PrintStream out; + + private List evalSamples; + private List compSamples; + private List evalJexls = null; + private List compJexls = null; + + // todo -- table with "proportion of overlapping sites" (not just eval/comp margins) [e.g. 
drop no-calls] + // (this will break all the integration tests of course, due to new formatting) + + public void initialize() { + evalJexls = initializeJexl(genotypeFilterExpressionsEval); + compJexls = initializeJexl(genotypeFilterExpressionsComp); + } + + private List initializeJexl(ArrayList genotypeFilterExpressions) { + ArrayList dummyNames = new ArrayList(genotypeFilterExpressions.size()); + int expCount = 1; + for ( String exp : genotypeFilterExpressions ) { + dummyNames.add(String.format("gfe%d",expCount++)); + } + return VariantContextUtils.initializeMatchExps(dummyNames, genotypeFilterExpressions); + } + + public ConcordanceMetrics reduceInit() { + Map headerMap = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(evalBinding,compBinding)); + VCFHeader evalHeader = headerMap.get(evalBinding.getName()); + evalSamples = evalHeader.getGenotypeSamples(); + VCFHeader compHeader = headerMap.get(compBinding.getName()); + compSamples = compHeader.getGenotypeSamples(); + return new ConcordanceMetrics(evalHeader,compHeader, printSites); + } + + + public List> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + List> evalCompPair = new ArrayList>(3); + if ( tracker != null && ( + tracker.getValues(evalBinding,ref.getLocus()).size() > 0 || + tracker.getValues(compBinding,ref.getLocus()).size() > 0 ) ) { + + List eval = tracker.getValues(evalBinding,ref.getLocus()); + List comp = tracker.getValues(compBinding,ref.getLocus()); + if ( eval.size() > 1 || comp.size() > 1 ) { + if ( noDuplicateTypes(eval) && noDuplicateTypes(comp) ) { + logger.info("Eval or Comp Rod at position " + ref.getLocus().toString() + " has multiple records. Resolving."); + evalCompPair = resolveMultipleRecords(eval,comp); + } else { + logger.warn("Eval or Comp Rod at position "+ref.getLocus().toString()+" has multiple records of the same type. 
This locus will be skipped."); + } + } else { + // if a rod is missing, explicitly create a variant context with 'missing' genotypes. Slow, but correct. + // note that if there is no eval rod there must be a comp rod, and also the reverse + VariantContext evalContext = eval.size() == 1 ? eval.get(0) : createEmptyContext(comp.get(0),evalSamples); + VariantContext compContext = comp.size() == 1 ? comp.get(0) : createEmptyContext(eval.get(0),compSamples); + evalContext = filterGenotypes(evalContext,ignoreFilters,evalJexls); + compContext = filterGenotypes(compContext,ignoreFilters,compJexls); + evalCompPair.add(new Pair(evalContext,compContext)); + } + } + + return evalCompPair; + } + + private boolean noDuplicateTypes(List vcList) { + HashSet types = new HashSet(vcList.size()); + for ( VariantContext vc : vcList ) { + VariantContext.Type type = vc.getType(); + if ( types.contains(type) ) + return false; + types.add(type); + } + + return true; + } + + /** + * The point of this method is to match up pairs of evals and comps by their type (or alternate alleles for mixed). + * Basically multiple records could exist for a site such as: + * Eval: 20 4000 A C + * Eval: 20 4000 A AC + * Comp: 20 4000 A C + * So for each eval, loop through the comps. If the types match, or for mixed types if eval alleles (non-emptily) + * intersect the comp alleles, pair them up and remove that comp record. + * Continue until we're out of evals or comps. This is n^2, but should rarely actually happen. + * + * The remaining unpaired records get paired with empty contexts. 
So in the example above we'd get a list of: + * 1 - (20,4000,A/C | 20,4000,A/C) + * 2 - (20,4000,A/AC | Empty ) + * @param evalList - list of eval variant contexts + * @param compList - list of comp variant contexts + * @return resolved pairs of the input lists + */ + private List> resolveMultipleRecords(List evalList, List compList) { + List> resolvedPairs = new ArrayList>(evalList.size()+compList.size()); // oversized but w/e + List pairedEval = new ArrayList(evalList.size()); + for ( VariantContext eval : evalList ) { + VariantContext.Type evalType = eval.getType(); + Set evalAlleles = new HashSet(eval.getAlternateAlleles()); + VariantContext pairedComp = null; + for ( VariantContext comp : compList ) { + if ( evalType.equals(comp.getType()) ) { + pairedComp = comp; + break; + } else if ( eval.isMixed() || comp.isMixed() ) { + for ( Allele compAllele : comp.getAlternateAlleles() ) { + if ( evalAlleles.contains(compAllele) ) { + pairedComp = comp; + break; + } + } + } + } + if ( pairedComp != null ) { + compList.remove(pairedComp); + resolvedPairs.add(new Pair(filterGenotypes(eval,ignoreFilters,evalJexls),filterGenotypes(pairedComp,ignoreFilters,compJexls))); + pairedEval.add(eval); + if ( compList.size() < 1 ) + break; + } + } + evalList.removeAll(pairedEval); + for ( VariantContext unpairedEval : evalList ) { + resolvedPairs.add(new Pair(filterGenotypes(unpairedEval,ignoreFilters,evalJexls),createEmptyContext(unpairedEval,compSamples))); + } + + for ( VariantContext unpairedComp : compList ) { + resolvedPairs.add(new Pair(createEmptyContext(unpairedComp,evalSamples),filterGenotypes(unpairedComp,ignoreFilters,compJexls))); + } + + return resolvedPairs; + } + + public ConcordanceMetrics reduce(List> evalCompList, ConcordanceMetrics metrics) { + for ( Pair evalComp : evalCompList){ + metrics.update(evalComp.getFirst(),evalComp.getSecond()); + + } + return metrics; + } + + private static double repairNaN(double d) { + if ( Double.isNaN(d) ) { + return 0.0; + } + 
return d; + } + + public void onTraversalDone(ConcordanceMetrics metrics) { + // todo -- this is over 200 lines of code just to format the output and could use some serious cleanup + GATKReport report = new GATKReport(); + GATKReportTable concordanceCounts = new GATKReportTable("GenotypeConcordance_Counts","Per-sample concordance tables: comparison counts",2+GenotypeType.values().length*GenotypeType.values().length); + GATKReportTable concordanceEvalProportions = new GATKReportTable("GenotypeConcordance_EvalProportions", "Per-sample concordance tables: proportions of genotypes called in eval",2+GenotypeType.values().length*GenotypeType.values().length); + GATKReportTable concordanceCompProportions = new GATKReportTable("GenotypeConcordance_CompProportions", "Per-sample concordance tables: proportions of genotypes called in comp",2+GenotypeType.values().length*GenotypeType.values().length); + GATKReportTable concordanceSummary = new GATKReportTable("GenotypeConcordance_Summary","Per-sample summary statistics: NRS, NRD, and OGC",2); + GATKReportTable siteConcordance = new GATKReportTable("SiteConcordance_Summary","Site-level summary statistics",ConcordanceMetrics.SiteConcordanceType.values().length); + if ( moltenize ) { + concordanceCompProportions.addColumn("Sample","%s"); + concordanceCounts.addColumn("Sample","%s"); + concordanceEvalProportions.addColumn("Sample","%s"); + concordanceSummary.addColumn("Sample","%s"); + + concordanceCompProportions.addColumn("Eval_Genotype","%s"); + concordanceCounts.addColumn("Eval_Genotype","%s"); + concordanceEvalProportions.addColumn("Eval_Genotype","%s"); + concordanceSummary.addColumn("Non-Reference_Discrepancy","%.3f"); + + concordanceCompProportions.addColumn("Comp_Genotype","%s"); + concordanceCounts.addColumn("Comp_Genotype","%s"); + concordanceEvalProportions.addColumn("Comp_Genotype","%s"); + concordanceSummary.addColumn("Non-Reference_Sensitivity","%.3f"); + + concordanceCompProportions.addColumn("Proportion","%.3f"); 
+ concordanceCounts.addColumn("Count","%d"); + concordanceEvalProportions.addColumn("Proportion","%.3f"); + concordanceSummary.addColumn("Overall_Genotype_Concordance","%.3f"); + + for ( Map.Entry entry : metrics.getPerSampleGenotypeConcordance().entrySet() ) { + ConcordanceMetrics.GenotypeConcordanceTable table = entry.getValue(); + for ( GenotypeType evalType : GenotypeType.values() ) { + for ( GenotypeType compType : GenotypeType.values() ) { + String rowKey = String.format("%s_%s_%s",entry.getKey(),evalType.toString(),compType.toString()); + concordanceCounts.set(rowKey,"Sample",entry.getKey()); + concordanceCounts.set(rowKey,"Eval_Genotype",evalType.toString()); + concordanceCounts.set(rowKey,"Comp_Genotype",compType.toString()); + int count = table.get(evalType, compType); + concordanceCounts.set(rowKey,"Count",count); + if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) { + concordanceEvalProportions.set(rowKey,"Sample",entry.getKey()); + concordanceEvalProportions.set(rowKey,"Eval_Genotype",evalType.toString()); + concordanceEvalProportions.set(rowKey,"Comp_Genotype",compType.toString()); + concordanceEvalProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnEvalGenotypes(evalType))); + } + if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) { + concordanceCompProportions.set(rowKey,"Sample",entry.getKey()); + concordanceCompProportions.set(rowKey,"Eval_Genotype",evalType.toString()); + concordanceCompProportions.set(rowKey,"Comp_Genotype",compType.toString()); + concordanceCompProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnCompGenotypes(compType))); + } + } + } + String mismatchKey = String.format("%s_%s",entry.getKey(),"Mismatching"); + concordanceCounts.set(mismatchKey,"Sample",entry.getKey()); + concordanceCounts.set(mismatchKey,"Eval_Genotype","Mismatching_Alleles"); + 
concordanceCounts.set(mismatchKey,"Comp_Genotype","Mismatching_Alleles"); + concordanceEvalProportions.set(mismatchKey,"Sample",entry.getKey()); + concordanceEvalProportions.set(mismatchKey,"Eval_Genotype","Mismatching_Alleles"); + concordanceEvalProportions.set(mismatchKey,"Comp_Genotype","Mismatching_Alleles"); + concordanceCompProportions.set(mismatchKey,"Sample",entry.getKey()); + concordanceCompProportions.set(mismatchKey,"Eval_Genotype","Mismatching_Alleles"); + concordanceCompProportions.set(mismatchKey,"Comp_Genotype","Mismatching_Alleles"); + concordanceEvalProportions.set(mismatchKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes())); + concordanceCompProportions.set(mismatchKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes())); + concordanceCounts.set(mismatchKey,"Count",table.getnMismatchingAlt()); + } + + String sampleKey = "ALL"; + ConcordanceMetrics.GenotypeConcordanceTable table = metrics.getOverallGenotypeConcordance(); + for ( GenotypeType evalType : GenotypeType.values() ) { + for ( GenotypeType compType : GenotypeType.values() ) { + String rowKey = String.format("%s_%s_%s",sampleKey,evalType.toString(),compType.toString()); + concordanceCounts.set(rowKey,"Sample",sampleKey); + concordanceCounts.set(rowKey,"Eval_Genotype",evalType.toString()); + concordanceCounts.set(rowKey,"Comp_Genotype",compType.toString()); + int count = table.get(evalType, compType); + concordanceCounts.set(rowKey,"Count",count); + if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) { + concordanceEvalProportions.set(rowKey,"Sample",sampleKey); + concordanceEvalProportions.set(rowKey,"Eval_Genotype",evalType.toString()); + concordanceEvalProportions.set(rowKey,"Comp_Genotype",compType.toString()); + concordanceEvalProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnEvalGenotypes(evalType))); + } + if ( 
compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) { + concordanceCompProportions.set(rowKey,"Sample",sampleKey); + concordanceCompProportions.set(rowKey,"Eval_Genotype",evalType.toString()); + concordanceCompProportions.set(rowKey,"Comp_Genotype",compType.toString()); + concordanceCompProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnCompGenotypes(compType))); + } + } + } + String rowKey = String.format("%s_%s",sampleKey,"Mismatching"); + concordanceCounts.set(rowKey,"Sample",sampleKey); + concordanceCounts.set(rowKey,"Eval_Genotype","Mismatching_Alleles"); + concordanceCounts.set(rowKey,"Comp_Genotype","Mismatching_Alleles"); + concordanceEvalProportions.set(rowKey,"Sample",sampleKey); + concordanceEvalProportions.set(rowKey,"Eval_Genotype","Mismatching_Alleles"); + concordanceEvalProportions.set(rowKey,"Comp_Genotype","Mismatching_Alleles"); + concordanceCompProportions.set(rowKey,"Sample",sampleKey); + concordanceCompProportions.set(rowKey,"Eval_Genotype","Mismatching_Alleles"); + concordanceCompProportions.set(rowKey,"Comp_Genotype","Mismatching_Alleles"); + concordanceEvalProportions.set(rowKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes())); + concordanceCompProportions.set(rowKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes())); + concordanceCounts.set(rowKey,"Count",table.getnMismatchingAlt()); + + for ( Map.Entry nrsEntry : metrics.getPerSampleNRS().entrySet() ) { + concordanceSummary.set(nrsEntry.getKey(),"Sample",nrsEntry.getKey()); + concordanceSummary.set(nrsEntry.getKey(),"Non-Reference_Sensitivity",nrsEntry.getValue()); + } + for ( Map.Entry nrdEntry : metrics.getPerSampleNRD().entrySet() ) { + concordanceSummary.set(nrdEntry.getKey(),"Non-Reference_Discrepancy",nrdEntry.getValue()); + } + for ( Map.Entry ogcEntry : metrics.getPerSampleOGC().entrySet() ) { + 
concordanceSummary.set(ogcEntry.getKey(),"Overall_Genotype_Concordance",ogcEntry.getValue()); + } + concordanceSummary.set("ALL_NRS_NRD","Sample","ALL"); + concordanceSummary.set("ALL_NRS_NRD","Non-Reference_Sensitivity",metrics.getOverallNRS()); + concordanceSummary.set("ALL_NRS_NRD","Non-Reference_Discrepancy",metrics.getOverallNRD()); + concordanceSummary.set("ALL_NRS_NRD","Overall_Genotype_Concordance",metrics.getOverallOGC()); + + + for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { + siteConcordance.addColumn(type.toString(),"%d"); + } + + for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { + siteConcordance.set("Comparison",type.toString(),metrics.getOverallSiteConcordance().get(type)); + } + + } else { + concordanceCompProportions.addColumn("Sample","%s"); + concordanceCounts.addColumn("Sample","%s"); + concordanceEvalProportions.addColumn("Sample","%s"); + concordanceSummary.addColumn("Sample","%s"); + for ( GenotypeType evalType : GenotypeType.values() ) { + for ( GenotypeType compType : GenotypeType.values() ) { + String colKey = String.format("%s_%s", evalType.toString(), compType.toString()); + concordanceCounts.addColumn(colKey,"%d"); + if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) + concordanceEvalProportions.addColumn(colKey,"%.3f"); + if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) + concordanceCompProportions.addColumn(colKey,"%.3f"); + } + } + concordanceEvalProportions.addColumn("Mismatching_Alleles","%.3f"); + concordanceCompProportions.addColumn("Mismatching_Alleles","%.3f"); + concordanceCounts.addColumn("Mismatching_Alleles","%d"); + concordanceSummary.addColumn("Non-Reference Sensitivity","%.3f"); + concordanceSummary.addColumn("Non-Reference Discrepancy","%.3f"); + 
concordanceSummary.addColumn("Overall_Genotype_Concordance","%.3f"); + for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { + siteConcordance.addColumn(type.toString(),"%d"); + } + + for ( Map.Entry entry : metrics.getPerSampleGenotypeConcordance().entrySet() ) { + ConcordanceMetrics.GenotypeConcordanceTable table = entry.getValue(); + concordanceEvalProportions.set(entry.getKey(),"Sample",entry.getKey()); + concordanceCompProportions.set(entry.getKey(),"Sample",entry.getKey()); + concordanceCounts.set(entry.getKey(),"Sample",entry.getKey()); + for ( GenotypeType evalType : GenotypeType.values() ) { + for ( GenotypeType compType : GenotypeType.values() ) { + String colKey = String.format("%s_%s",evalType.toString(),compType.toString()); + int count = table.get(evalType, compType); + concordanceCounts.set(entry.getKey(),colKey,count); + if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) + concordanceEvalProportions.set(entry.getKey(),colKey,repairNaN(( (double) count)/table.getnEvalGenotypes(evalType))); + if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) + concordanceCompProportions.set(entry.getKey(),colKey,repairNaN(( (double) count)/table.getnCompGenotypes(compType))); + } + } + concordanceEvalProportions.set(entry.getKey(),"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes())); + concordanceCompProportions.set(entry.getKey(),"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes())); + concordanceCounts.set(entry.getKey(),"Mismatching_Alleles",table.getnMismatchingAlt()); + } + + String rowKey = "ALL"; + concordanceCompProportions.set(rowKey,"Sample",rowKey); + concordanceEvalProportions.set(rowKey,"Sample",rowKey); + concordanceCounts.set(rowKey,"Sample",rowKey); + 
ConcordanceMetrics.GenotypeConcordanceTable table = metrics.getOverallGenotypeConcordance(); + for ( GenotypeType evalType : GenotypeType.values() ) { + for ( GenotypeType compType : GenotypeType.values() ) { + String colKey = String.format("%s_%s",evalType.toString(),compType.toString()); + int count = table.get(evalType,compType); + concordanceCounts.set(rowKey,colKey,count); + if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) + concordanceEvalProportions.set(rowKey,colKey,repairNaN(( (double) count)/table.getnEvalGenotypes(evalType))); + if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) + concordanceCompProportions.set(rowKey,colKey,repairNaN(( (double) count)/table.getnCompGenotypes(compType))); + } + } + concordanceEvalProportions.set(rowKey,"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes())); + concordanceCompProportions.set(rowKey,"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes())); + concordanceCounts.set(rowKey,"Mismatching_Alleles",table.getnMismatchingAlt()); + + for ( Map.Entry nrsEntry : metrics.getPerSampleNRS().entrySet() ) { + concordanceSummary.set(nrsEntry.getKey(),"Sample",nrsEntry.getKey()); + concordanceSummary.set(nrsEntry.getKey(),"Non-Reference Sensitivity",nrsEntry.getValue()); + } + for ( Map.Entry nrdEntry : metrics.getPerSampleNRD().entrySet() ) { + concordanceSummary.set(nrdEntry.getKey(),"Non-Reference Discrepancy",nrdEntry.getValue()); + } + for ( Map.Entry ogcEntry : metrics.getPerSampleOGC().entrySet() ) { + concordanceSummary.set(ogcEntry.getKey(),"Overall_Genotype_Concordance",ogcEntry.getValue()); + } + concordanceSummary.set("ALL","Sample","ALL"); + concordanceSummary.set("ALL","Non-Reference Sensitivity",metrics.getOverallNRS()); + concordanceSummary.set("ALL","Non-Reference 
Discrepancy",metrics.getOverallNRD()); + concordanceSummary.set("ALL","Overall_Genotype_Concordance",metrics.getOverallOGC()); + + for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { + siteConcordance.set("Comparison",type.toString(),metrics.getOverallSiteConcordance().get(type)); + } + } + + report.addTable(concordanceCompProportions); + report.addTable(concordanceEvalProportions); + report.addTable(concordanceCounts); + report.addTable(concordanceSummary); + report.addTable(siteConcordance); + + report.print(out); + } + + public VariantContext createEmptyContext(VariantContext other, List samples) { + VariantContextBuilder builder = new VariantContextBuilder(); + // set the alleles to be the same + builder.alleles(other.getAlleles()); + builder.loc(other.getChr(),other.getStart(),other.getEnd()); + // set all genotypes to empty + List genotypes = new ArrayList(samples.size()); + for ( String sample : samples ) + genotypes.add(GenotypeBuilder.create(sample, new ArrayList(0))); + builder.genotypes(genotypes); + return builder.make(); + } + + public VariantContext filterGenotypes(VariantContext context, boolean ignoreSiteFilter, List exps) { + if ( ! 
context.isFiltered() || ignoreSiteFilter ) { + List filteredGenotypes = new ArrayList(context.getNSamples()); + for ( Genotype g : context.getGenotypes() ) { + Map matchMap = VariantContextUtils.match(context, g, exps); + boolean filtered = false; + for ( Boolean b : matchMap.values() ) { + if ( b ) { + filtered = true; + break; + } + } + if ( filtered ) { + filteredGenotypes.add(GenotypeBuilder.create(g.getSampleName(),Arrays.asList(Allele.NO_CALL,Allele.NO_CALL),g.getExtendedAttributes())); + } else { + filteredGenotypes.add(g); + } + } + VariantContextBuilder builder = new VariantContextBuilder(context); + builder.genotypes(filteredGenotypes); + return builder.make(); + } + + VariantContextBuilder builder = new VariantContextBuilder(); + builder.alleles(Arrays.asList(context.getReference())); + builder.loc(context.getChr(),context.getStart(),context.getEnd()); + List newGeno = new ArrayList(context.getNSamples()); + for ( Genotype g : context.getGenotypes().iterateInSampleNameOrder() ) { + newGeno.add(GenotypeBuilder.create(g.getSampleName(),new ArrayList())); + } + builder.genotypes(newGeno); + return builder.make(); + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java new file mode 100644 index 000000000..5759abc41 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java @@ -0,0 +1,304 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and 
to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import 
org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.variant.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; + +import java.util.*; + +/** + * Left-aligns indels from a variants file. + * + *

+ * LeftAlignAndTrimVariants is a tool that takes a VCF file and left-aligns the indels inside it. The same indel can often be + * placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to + * place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. + * Note that this tool cannot handle anything other than bi-allelic, simple indels. Complex events are written out unchanged. + * Optionally, the tool will also trim common bases from indels, leaving them with a minimum representation. + * + *

Input

+ *

+ * A variant set to left-align and trim. + *

+ * + *

Output

+ *

+ * A left-aligned VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T LeftAlignAndTrimVariants \
+ *   --variant input.vcf \
+ *   -o output.vcf
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=-200,stop=200)) // WARNING: if this changes,MAX_INDEL_LENGTH needs to change as well! +public class LeftAlignAndTrimVariants extends RodWalker { + + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + /** + * If this argument is set, bases common to all alleles will be removed, leaving only their minimal representation. + */ + @Argument(fullName="trimAlleles", shortName="trim", doc="Trim alleles to remove bases common to all of them", required=false) + protected boolean trimAlleles = false; + + /** + * If this argument is set, split multiallelic records and left-align individual alleles. + * If this argument is not set, multiallelic records are not attempted to left-align and will be copied as is. + */ + @Argument(fullName="splitMultiallelics", shortName="split", doc="Split multiallelic records and left-align individual alleles", required=false) + protected boolean splitMultiallelics = false; + + + @Output(doc="File to which variants should be written") + protected VariantContextWriter baseWriter = null; + + private VariantContextWriter writer; + + private static final int MAX_INDEL_LENGTH = 200; // needs to match reference window size! 
+ public void initialize() { + String trackName = variantCollection.variants.getName(); + Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); + Map vcfHeaders = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); + + Set headerLines = vcfHeaders.get(trackName).getMetaDataInSortedOrder(); + baseWriter.writeHeader(new VCFHeader(headerLines, samples)); + + writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, 200); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); + + int changedSites = 0; + for ( final VariantContext vc : VCs ) { + // split first into biallelics, and optionally trim alleles to minimal representation + Pair result = new Pair(vc,0); // default value + if (splitMultiallelics) { + final List vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc); + for (final VariantContext biallelicVC: vcList) { + final VariantContext v = (trimAlleles ? 
GATKVariantContextUtils.trimAlleles(biallelicVC,true,true) : biallelicVC); + result = alignAndWrite(v, ref); + + // strip out PLs and AD if we've subsetted the alleles + if ( vcList.size() > 1 ) + result.first = new VariantContextBuilder(result.first).genotypes(GATKVariantContextUtils.stripPLsAndAD(result.first.getGenotypes())).make(); + + writer.add(result.first); + changedSites += result.second; + } + } + else { + if (trimAlleles) + result = alignAndWrite(GATKVariantContextUtils.trimAlleles(vc,true,true), ref); + else + result = alignAndWrite(vc,ref); + writer.add(result.first); + changedSites += result.second; + + } + + } + + return changedSites; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + writer.close(); + System.out.println(result + " variants were aligned"); + } + + /** + * Main routine workhorse. By definitio, it will only take biallelic vc's. Splitting into multiple alleles has to be + * handled by calling routine. + * @param vc Input VC with variants to left align + * @param ref Reference context + * @return # of records left-aligned (0 or 1) and new VC. 
+ */ + @Requires({"vc != null","ref != null", "vc.isBiallelic() == true","ref.getBases().length>=2*MAX_INDEL_LENGTH+1"}) + @Ensures({"result != null","result.first != null", "result.second >=0"}) + protected static Pair alignAndWrite(final VariantContext vc, final ReferenceContext ref) { + + final Pair retValue = new Pair(vc,0); + if (!vc.isIndel() || vc.isComplexIndel() ) { + return retValue; + } + + // get the indel length + final int indelLength; + if ( vc.isSimpleDeletion() ) + indelLength = vc.getReference().length() - 1; + else + indelLength = vc.getAlternateAllele(0).length() - 1; + + if ( indelLength > MAX_INDEL_LENGTH ) + return retValue; + + if (vc.getReference().getBases()[0] != vc.getAlternateAllele(0).getBases()[0]) + return retValue; + + final byte[] refSeq = ref.getBases(); + + // create an indel haplotype. + // + final int originalIndex = vc.getStart() - ref.getWindow().getStart() + 1; + if (originalIndex < 0 || originalIndex >= ref.getBases().length) + return retValue; + + final byte[] originalIndel = makeHaplotype(vc, refSeq, originalIndex, indelLength); + + // create a CIGAR string to represent the event + ArrayList elements = new ArrayList(); + elements.add(new CigarElement(originalIndex, CigarOperator.M)); + elements.add(new CigarElement(indelLength, vc.isSimpleDeletion() ? 
CigarOperator.D : CigarOperator.I)); + elements.add(new CigarElement(refSeq.length - originalIndex, CigarOperator.M)); + Cigar originalCigar = new Cigar(elements); + + // left align the CIGAR + Cigar newCigar = AlignmentUtils.leftAlignIndel(originalCigar, refSeq, originalIndel, 0, 0, true); + + // update if necessary and write + if ( !newCigar.equals(originalCigar) && newCigar.numCigarElements() > 1 ) { + int difference = originalIndex - newCigar.getCigarElement(0).getLength(); + VariantContext newVC = new VariantContextBuilder(vc).start(vc.getStart()-difference).stop(vc.getEnd()-difference).make(); + //System.out.println("Moving record from " + vc.getChr()+":"+vc.getStart() + " to " + vc.getChr()+":"+(vc.getStart()-difference)); + + final int indelIndex = originalIndex-difference; + final byte[] newBases = new byte[indelLength + 1]; + newBases[0] = refSeq[indelIndex-1]; + System.arraycopy((vc.isSimpleDeletion() ? refSeq : originalIndel), indelIndex, newBases, 1, indelLength); + final Allele newAllele = Allele.create(newBases, vc.isSimpleDeletion()); + newVC = updateAllele(newVC, newAllele); + // overwrite default return value with new left-aligned VC + retValue.first = newVC; + retValue.second = 1; + + } + return retValue; + } + + /** + * Make a haplotype from a given alt allele, using bases in input reference, index of an input reference + * @param vc Input VC - will use only alt allele from it + * @param ref Ref bases + * @param indexOfRef Index in ref where to create indel + * @param indelLength Indel length + * @return + */ + @Requires({"vc != null","ref != null", "indexOfRef +indelLength < ref.length", "vc.getNAlleles() == 2"}) + @Ensures("result != null") + private static byte[] makeHaplotype(VariantContext vc, byte[] ref, int indexOfRef, int indelLength) { + byte[] hap = new byte[ref.length + (indelLength * (vc.isSimpleDeletion() ? 
-1 : 1))]; + + // add the bases before the indel + System.arraycopy(ref, 0, hap, 0, indexOfRef); + int currentPos = indexOfRef; + + // take care of the indel + if ( vc.isSimpleDeletion() ) { + indexOfRef += indelLength; + } else { + System.arraycopy(vc.getAlternateAllele(0).getBases(), 1, hap, currentPos, indelLength); + currentPos += indelLength; + } + + // add the bases after the indel + System.arraycopy(ref, indexOfRef, hap, currentPos, ref.length - indexOfRef); + + return hap; + } + + public static VariantContext updateAllele(final VariantContext vc, final Allele newAllele) { + // create a mapping from original allele to new allele + HashMap alleleMap = new HashMap(vc.getAlleles().size()); + if ( newAllele.isReference() ) { + alleleMap.put(vc.getReference(), newAllele); + alleleMap.put(vc.getAlternateAllele(0), Allele.create(newAllele.getBases()[0], false)); + } else { + alleleMap.put(vc.getReference(), Allele.create(newAllele.getBases()[0], true)); + alleleMap.put(vc.getAlternateAllele(0), newAllele); + } + + // create new Genotype objects + GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { + List newAlleles = new ArrayList(); + for ( Allele allele : genotype.getAlleles() ) { + Allele newA = alleleMap.get(allele); + if ( newA == null ) + newA = Allele.NO_CALL; + newAlleles.add(newA); + } + newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make()); + } + + return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java diff --git a/public/java/src/org/broadinstitute/sting/jna/clibrary/JNAUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/clibrary/JNAUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/clibrary/JNAUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/clibrary/JNAUtils.java diff --git a/public/java/src/org/broadinstitute/sting/jna/clibrary/LibC.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/clibrary/LibC.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/clibrary/LibC.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/clibrary/LibC.java diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobInfo.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobInfo.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobInfo.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobInfo.java diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobTemplate.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobTemplate.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobTemplate.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobTemplate.java diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionFactory.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionFactory.java diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaa.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaa.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaa.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaa.java diff --git a/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java diff --git a/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/tools/CatVariants.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/tools/CatVariants.java new file mode 100644 index 000000000..1dc5f8516 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/tools/CatVariants.java @@ -0,0 +1,290 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.tools; + +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.reference.ReferenceSequenceFileFactory; +import org.apache.log4j.BasicConfigurator; +import org.apache.log4j.Level; +import org.broad.tribble.AbstractFeatureReader; +import org.broad.tribble.FeatureReader; +import org.broad.tribble.index.IndexCreator; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.CommandLineProgram; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.bcf2.BCF2Codec; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.writer.Options; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; + +import java.io.*; +import java.util.*; + + +/** + * + * Concatenates VCF files of non-overlapped genome intervals, all with the same set of samples + * + *

+ * The main purpose of this tool is to speed up the gather function when using scatter-gather parallelization. + * This tool concatenates the scattered output VCF files. It assumes that: + * - All the input VCFs (or BCFs) contain the same samples in the same order. + * - The variants in each input file are from non-overlapping (scattered) intervals. + * + * When the input files are already sorted based on the intervals start positions, use -assumeSorted. + * + * Note: Currently the tool is more efficient when working with VCFs; we will work to make it as efficient for BCFs. + * + *

+ * + *

Input

+ *

+ * One or more variant sets to combine. They should be of non-overlapping genome intervals and with the same samples (in the same order). + * The input files should be 'name.vcf' or 'name.VCF' or 'name.bcf' or 'name.BCF'. + * If the files are ordered according to the appearance of intervals in the ref genome, then one can use the -assumeSorted flag. + *

+ * + *

Output

+ *

+ * A combined VCF. The output file should be 'name.vcf' or 'name.VCF'.
+ * </p>
+ *
+ *

Important note

+ *

This is a command-line utility that bypasses the GATK engine. As a result, the command-line you must use to + * invoke it is a little different from other GATK tools (see example below), and it does not accept any of the + * classic "CommandLineGATK" arguments.

+ * + *

Example

+ *
+ * java -cp GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants \
+ *    -R ref.fasta \
+ *    -V input1.vcf \
+ *    -V input2.vcf \
+ *    -out output.vcf \
+ *    -assumeSorted
+ * 
+ * + * @author Ami Levy Moonshine + * @since Jan 2012 + */ + +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP ) +public class CatVariants extends CommandLineProgram { + // setup the logging system, used by some codecs + private static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getRootLogger(); + + @Input(fullName = "reference", shortName = "R", doc = "genome reference file .fasta", required = true) + private File refFile = null; + + /** + * The VCF or BCF files to merge together + * + * CatVariants can take any number of -V arguments on the command line. Each -V argument + * will be included in the final merged output VCF. The order of arguments does not matter, but it runs more + * efficiently if they are sorted based on the intervals and the assumeSorted argument is used. + * + */ + @Input(fullName="variant", shortName="V", doc="Input VCF file/s named .vcf or .bcf", required = true) + private List variant = null; + + @Output(fullName = "outputFile", shortName = "out", doc = "output file name .vcf or .bcf", required = true) + private File outputFile = null; + + @Argument(fullName = "assumeSorted", shortName = "assumeSorted", doc = "assumeSorted should be true if he input files are already sorted (based on the position of the variants", required = false) + private Boolean assumeSorted = false; + + @Argument(fullName = "variant_index_type", doc = "which type of IndexCreator to use for VCF/BCF indices", required = false) + private GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE; + + @Argument(fullName = "variant_index_parameter", doc = "the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator", required = false) + private Integer variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; + + /* + * print usage information + */ + private static void printUsage() { + System.err.println("Usage: java -cp target/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants --reference 
--variant --outputFile [--assumeSorted]"); + System.err.println(" The input file(s) can be of type: VCF (must end in .vcf or .VCF) or"); + System.err.println(" BCF2 (must end in .bcf or .BCF)."); + System.err.println(" Output file must be of type vcf or bcf (must end in .vcf or .bcf)."); + System.err.println(" If the input files are already sorted, then indicate that with --assumeSorted to improve performance."); + } + + @Override + protected int execute() throws Exception { + //if(help){ + // printUsage(); + // return 1; + //} + + BasicConfigurator.configure(); + logger.setLevel(Level.INFO); + + final ReferenceSequenceFile ref; + try { + ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); + } catch ( Exception e ) { + throw new UserException("Couldn't load provided reference sequence file " + refFile, e); + } + + Comparator> positionComparator = new PositionComparator(); + + + //PriorityQueue>> queue = + // new PriorityQueue>>(2000, comparator); + Queue> priorityQueue; + if(assumeSorted) + priorityQueue = new LinkedList>(); + else + priorityQueue = new PriorityQueue>(10000, positionComparator); + + Iterator files = variant.iterator(); + File file; + while (files.hasNext()) { + file = files.next(); + if (!(file.getName().endsWith(".vcf") || file.getName().endsWith(".VCF") || file.getName().endsWith(".bcf") || file.getName().endsWith(".BCF"))){ + System.err.println("File " + file.getAbsolutePath() + " should be .vcf or .bcf"); + printUsage(); + return 1; + } + if (assumeSorted){ + priorityQueue.add(new Pair(0,file)); + } + else{ + if (!file.exists()) { + throw new UserException(String.format("File %s doesn't exist",file.getAbsolutePath())); + } + FeatureReader reader; + boolean useVCF = (file.getName().endsWith(".vcf") || file.getName().endsWith(".VCF")); + if(useVCF) + reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false); + else + reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new 
BCF2Codec(), false); + Iterator it = reader.iterator(); + if(!it.hasNext()){ + System.err.println(String.format("File %s is empty. This file will be ignored",file.getAbsolutePath())); + continue; + } + VariantContext vc = it.next(); + int firstPosition = vc.getStart(); + reader.close(); + //queue.add(new Pair>(firstPosition,reader)); + priorityQueue.add(new Pair(firstPosition,file)); + } + + } + + if (!(outputFile.getName().endsWith(".vcf") || outputFile.getName().endsWith(".VCF"))){ + throw new UserException(String.format("Output file %s should be .vcf", outputFile)); + } + + FileOutputStream outputStream = new FileOutputStream(outputFile); + EnumSet options = EnumSet.of(Options.INDEX_ON_THE_FLY); + final IndexCreator idxCreator = GATKVCFUtils.getIndexCreator(variant_index_type, variant_index_parameter, outputFile); + final VariantContextWriter outputWriter = VariantContextWriterFactory.create(outputFile, outputStream, ref.getSequenceDictionary(), idxCreator, options); + + boolean firstFile = true; + int count =0; + //while(!queue.isEmpty()){ + while(!priorityQueue.isEmpty() ){ + count++; + //FeatureReader reader = queue.remove().getSecond(); + file = priorityQueue.remove().getSecond(); + if (!file.exists()) { + throw new UserException(String.format("File %s doesn't exist",file.getAbsolutePath())); + } + FeatureReader reader; + boolean useVCF = (file.getName().endsWith(".vcf") || file.getName().endsWith(".VCF")); + if(useVCF) + reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false); + else + reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new BCF2Codec(), false); + + if(count%10 ==0) + System.out.print(count); + else + System.out.print("."); + if (firstFile){ + VCFHeader header = (VCFHeader)reader.getHeader(); + outputWriter.writeHeader(header); + firstFile = false; + } + + Iterator it = reader.iterator(); + + while (it.hasNext()){ + VariantContext vc = it.next(); + outputWriter.add(vc); + } + + 
reader.close(); + + } + System.out.println(); + + outputStream.close(); + outputWriter.close(); + + return 0; + } + + + public static void main(String[] args){ + try { + CatVariants instance = new CatVariants(); + start(instance, args); + System.exit(CommandLineProgram.result); + } catch ( UserException e ) { + printUsage(); + exitSystemWithUserError(e); + } catch ( Exception e ) { + exitSystemWithError(e); + } + } + + private static class PositionComparator implements Comparator> { + + @Override + public int compare(Pair p1, Pair p2) { + int startPositionP1 = p1.getFirst(); + int startPositionP2 = p2.getFirst(); + if (startPositionP1 == startPositionP2) + return 0; + return startPositionP1 < startPositionP2 ? -1 : 1 ; + } + } + +} diff --git a/public/java/src/org/broadinstitute/sting/tools/ListAnnotations.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/tools/ListAnnotations.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/tools/ListAnnotations.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/tools/ListAnnotations.java diff --git a/public/java/src/org/broadinstitute/sting/utils/AutoFormattingTime.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/AutoFormattingTime.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/AutoFormattingTime.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/AutoFormattingTime.java diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/BaseUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/BaseUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/BaseUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/BitSetUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/BitSetUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/ContigComparator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/ContigComparator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/ContigComparator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/ContigComparator.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/DeprecatedToolChecks.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/DeprecatedToolChecks.java new file mode 100644 index 000000000..f867f76c2 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/DeprecatedToolChecks.java @@ -0,0 +1,96 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils; + +import it.unimi.dsi.fastutil.objects.Object2ObjectMap; +import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap; + +import java.util.*; + +/** + * Utility class for handling deprecated tools gracefully + * + * @author vdauwera + * @since 3/11/13 + */ +public class DeprecatedToolChecks { + + // Mapping from walker name to major version number where the walker first disappeared and optional replacement options + private static Object2ObjectMap deprecatedGATKWalkers = new Object2ObjectOpenHashMap(); + static { + // Indicate recommended replacement in parentheses if applicable + deprecatedGATKWalkers.put("ReduceReads", "3.0 (use recommended best practices pipeline with the HaplotypeCaller)"); + deprecatedGATKWalkers.put("CountCovariates", "2.0 (use BaseRecalibrator instead; see documentation for usage)"); + deprecatedGATKWalkers.put("TableRecalibration", "2.0 (use PrintReads with -BQSR instead; see documentation for usage)"); + deprecatedGATKWalkers.put("AlignmentWalker", "2.2 (no replacement)"); + deprecatedGATKWalkers.put("CountBestAlignments", "2.2 (no replacement)"); + deprecatedGATKWalkers.put("SomaticIndelDetector", "2.0 (replaced by the standalone tool Indelocator; see Cancer Tools documentation)"); + } + + // Mapping from walker name to major version number where the walker first disappeared and optional replacement options + private static Object2ObjectMap deprecatedGATKAnnotations = new Object2ObjectOpenHashMap(); + static { + // Same comments as for walkers + deprecatedGATKAnnotations.put("DepthOfCoverage", "2.4 (renamed to Coverage)"); + } + + /** + * Utility method to check whether a given walker has been deprecated in a previous GATK 
release + * + * @param walkerName the walker class name (not the full package) to check + */ + public static boolean isDeprecatedWalker(final String walkerName) { + return deprecatedGATKWalkers.containsKey(walkerName); + } + + /** + * Utility method to check whether a given annotation has been deprecated in a previous GATK release + * + * @param annotationName the annotation class name (not the full package) to check + */ + public static boolean isDeprecatedAnnotation(final String annotationName) { + return deprecatedGATKAnnotations.containsKey(annotationName); + } + + /** + * Utility method to pull up the version number at which a walker was deprecated and the suggested replacement, if any + * + * @param walkerName the walker class name (not the full package) to check + */ + public static String getWalkerDeprecationInfo(final String walkerName) { + return deprecatedGATKWalkers.get(walkerName).toString(); + } + + /** + * Utility method to pull up the version number at which an annotation was deprecated and the suggested replacement, if any + * + * @param annotationName the annotation class name (not the full package) to check + */ + public static String getAnnotationDeprecationInfo(final String annotationName) { + return deprecatedGATKAnnotations.get(annotationName).toString(); + } + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/GenomeLoc.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/GenomeLoc.java diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/GenomeLocParser.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/GenomeLocParser.java diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/GenomeLocSortedSet.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/GenomeLocSortedSet.java diff --git a/public/java/src/org/broadinstitute/sting/utils/HasGenomeLocation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/HasGenomeLocation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/HasGenomeLocation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/HasGenomeLocation.java diff --git a/public/java/src/org/broadinstitute/sting/utils/HeapSizeMonitor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/HeapSizeMonitor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/HeapSizeMonitor.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/HeapSizeMonitor.java diff --git a/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/IndelUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/IndelUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/IndelUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/LRUCache.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/LRUCache.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/LRUCache.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/LRUCache.java diff --git 
a/public/java/src/org/broadinstitute/sting/utils/MRUCachingSAMSequenceDictionary.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MRUCachingSAMSequenceDictionary.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/MRUCachingSAMSequenceDictionary.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MRUCachingSAMSequenceDictionary.java diff --git a/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MannWhitneyU.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MannWhitneyU.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MathUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MathUtils.java new file mode 100644 index 000000000..e73797705 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MathUtils.java @@ -0,0 +1,1576 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.commons.math.distribution.ExponentialDistribution; +import org.apache.commons.math.distribution.ExponentialDistributionImpl; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.math.BigDecimal; +import java.util.*; + +/** + * MathUtils is a static class (no instantiation allowed!) with some useful math methods. + * + * @author Kiran Garimella + */ +public class MathUtils { + + /** + * Private constructor. No instantiating this class! + */ + private MathUtils() { + } + + public static final double[] log10Cache; + public static final double[] log10FactorialCache; + private static final double[] jacobianLogTable; + private static final double JACOBIAN_LOG_TABLE_STEP = 0.0001; + private static final double JACOBIAN_LOG_TABLE_INV_STEP = 1.0 / JACOBIAN_LOG_TABLE_STEP; + private static final double MAX_JACOBIAN_TOLERANCE = 8.0; + private static final int JACOBIAN_LOG_TABLE_SIZE = (int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1; + private static final int MAXN = 70_000; + private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients + + /** + * The smallest log10 value we'll emit from normalizeFromLog10 and other functions + * where the real-space value is 0.0. 
+ */ + public static final double LOG10_P_OF_ZERO = -1000000.0; + public static final double FAIR_BINOMIAL_PROB_LOG10_0_5 = Math.log10(0.5); + public static final double LOG_ONE_HALF = -Math.log10(2.0); + public static final double LOG_ONE_THIRD = -Math.log10(3.0); + private static final double NATURAL_LOG_OF_TEN = Math.log(10.0); + private static final double SQUARE_ROOT_OF_TWO_TIMES_PI = Math.sqrt(2.0 * Math.PI); + + static { + log10Cache = new double[LOG10_CACHE_SIZE]; + log10FactorialCache = new double[LOG10_CACHE_SIZE]; + jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; + + log10Cache[0] = Double.NEGATIVE_INFINITY; + log10FactorialCache[0] = 0.0; + for (int k = 1; k < LOG10_CACHE_SIZE; k++) { + log10Cache[k] = Math.log10(k); + log10FactorialCache[k] = log10FactorialCache[k-1] + log10Cache[k]; + } + + for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { + jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP)); + + } + } + + /** + * Get a random int between min and max (inclusive) using the global GATK random number generator + * + * @param min lower bound of the range + * @param max upper bound of the range + * @return a random int >= min and <= max + */ + public static int randomIntegerInRange( final int min, final int max ) { + return GenomeAnalysisEngine.getRandomGenerator().nextInt(max - min + 1) + min; + } + + // A fast implementation of the Math.round() method. This method does not perform + // under/overflow checking, so this shouldn't be used in the general case (but is fine + // if one is already make those checks before calling in to the rounding). + public static int fastRound(final double d) { + return (d > 0.0) ? 
(int) (d + 0.5d) : (int) (d - 0.5d); + } + + public static double approximateLog10SumLog10(final double[] vals) { + return approximateLog10SumLog10(vals, vals.length); + } + + public static double approximateLog10SumLog10(final double[] vals, final int endIndex) { + + final int maxElementIndex = MathUtils.maxElementIndex(vals, endIndex); + double approxSum = vals[maxElementIndex]; + + for (int i = 0; i < endIndex; i++) { + if (i == maxElementIndex || vals[i] == Double.NEGATIVE_INFINITY) + continue; + + final double diff = approxSum - vals[i]; + if (diff < MathUtils.MAX_JACOBIAN_TOLERANCE) { + // See notes from the 2-inout implementation below + final int ind = fastRound(diff * MathUtils.JACOBIAN_LOG_TABLE_INV_STEP); // hard rounding + approxSum += MathUtils.jacobianLogTable[ind]; + } + } + + return approxSum; + } + + public static double approximateLog10SumLog10(final double a, final double b, final double c) { + return approximateLog10SumLog10(a, approximateLog10SumLog10(b, c)); + } + + public static double approximateLog10SumLog10(double small, double big) { + // make sure small is really the smaller value + if (small > big) { + final double t = big; + big = small; + small = t; + } + + if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY) + return big; + + final double diff = big - small; + if (diff >= MathUtils.MAX_JACOBIAN_TOLERANCE) + return big; + + // OK, so |y-x| < tol: we use the following identity then: + // we need to compute log10(10^x + 10^y) + // By Jacobian logarithm identity, this is equal to + // max(x,y) + log10(1+10^-abs(x-y)) + // we compute the second term as a table lookup with integer quantization + // we have pre-stored correction for 0,0.1,0.2,... 
10.0 + final int ind = fastRound(diff * MathUtils.JACOBIAN_LOG_TABLE_INV_STEP); // hard rounding + return big + MathUtils.jacobianLogTable[ind]; + } + + public static double sum(final double[] values) { + double s = 0.0; + for (double v : values) + s += v; + return s; + } + + public static long sum(final int[] x) { + long total = 0; + for (int v : x) + total += v; + return total; + } + + public static int sum(final byte[] x) { + int total = 0; + for (byte v : x) + total += (int)v; + return total; + } + + public static double percentage(int x, int base) { + return (base > 0 ? ((double) x / (double) base) * 100.0 : 0); + } + + public static double ratio(final int num, final int denom) { + if ( denom > 0 ) { + return ((double) num)/denom; + } else { + if ( num == 0 && denom == 0) { + return 0.0; + } else { + throw new ReviewedStingException(String.format("The denominator of a ratio cannot be zero or less than zero: %d/%d",num,denom)); + } + } + } + + public static double ratio(final long num, final long denom) { + if ( denom > 0L ) { + return ((double) num)/denom; + } else { + if ( num == 0L && denom == 0L ) { + return 0.0; + } else { + throw new ReviewedStingException(String.format("The denominator of a ratio cannot be zero or less than zero: %d/%d",num,denom)); + } + } + } + + /** + * Converts a real space array of numbers (typically probabilities) into a log10 array + * + * @param prRealSpace + * @return + */ + public static double[] toLog10(final double[] prRealSpace) { + double[] log10s = new double[prRealSpace.length]; + for (int i = 0; i < prRealSpace.length; i++) { + log10s[i] = Math.log10(prRealSpace[i]); + } + return log10s; + } + + public static double log10sumLog10(final double[] log10p, final int start) { + return log10sumLog10(log10p, start, log10p.length); + } + + public static double log10sumLog10(final double[] log10p,final int start,final int finish) { + double sum = 0.0; + + double maxValue = arrayMax(log10p, finish); + if(maxValue == 
Double.NEGATIVE_INFINITY) + return maxValue; + + for (int i = start; i < finish; i++) { + if ( Double.isNaN(log10p[i]) || log10p[i] == Double.POSITIVE_INFINITY ) { + throw new IllegalArgumentException("log10p: Values must be non-infinite and non-NAN"); + } + sum += Math.pow(10.0, log10p[i] - maxValue); + } + + return Math.log10(sum) + maxValue; + } + + public static double sumLog10(final double[] log10values) { + return Math.pow(10.0, log10sumLog10(log10values)); + } + + public static double log10sumLog10(final double[] log10values) { + return log10sumLog10(log10values, 0); + } + + public static boolean wellFormedDouble(final double val) { + return !Double.isInfinite(val) && !Double.isNaN(val); + } + + public static double bound(final double value, final double minBoundary, final double maxBoundary) { + return Math.max(Math.min(value, maxBoundary), minBoundary); + } + + public static boolean isBounded(final double val, final double lower, final double upper) { + return val >= lower && val <= upper; + } + + public static boolean isPositive(final double val) { + return !isNegativeOrZero(val); + } + + public static boolean isPositiveOrZero(final double val) { + return isBounded(val, 0.0, Double.POSITIVE_INFINITY); + } + + public static boolean isNegativeOrZero(final double val) { + return isBounded(val, Double.NEGATIVE_INFINITY, 0.0); + } + + public static boolean isNegative(final double val) { + return !isPositiveOrZero(val); + } + + /** + * Compares double values for equality (within 1e-6), or inequality. + * + * @param a the first double value + * @param b the second double value + * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. + */ + public static byte compareDoubles(final double a, final double b) { + return compareDoubles(a, b, 1e-6); + } + + /** + * Compares double values for equality (within epsilon), or inequality. 
+ * + * @param a the first double value + * @param b the second double value + * @param epsilon the precision within which two double values will be considered equal + * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. + */ + public static byte compareDoubles(final double a, final double b, final double epsilon) { + if (Math.abs(a - b) < epsilon) { + return 0; + } + if (a > b) { + return -1; + } + return 1; + } + + /** + * Calculate f(x) = Normal(x | mu = mean, sigma = sd) + * @param mean the desired mean of the Normal distribution + * @param sd the desired standard deviation of the Normal distribution + * @param x the value to evaluate + * @return a well-formed double + */ + public static double normalDistribution(final double mean, final double sd, final double x) { + if( sd < 0 ) + throw new IllegalArgumentException("sd: Standard deviation of normal must be >0"); + if ( ! wellFormedDouble(mean) || ! wellFormedDouble(sd) || ! wellFormedDouble(x) ) + throw new IllegalArgumentException("mean, sd, or, x : Normal parameters must be well formatted (non-INF, non-NAN)"); + double a = 1.0 / (sd * Math.sqrt(2.0 * Math.PI)); + double b = Math.exp(-1.0 * (Math.pow(x - mean, 2.0) / (2.0 * sd * sd))); + return a * b; + } + + /** + * Calculate f(x) = log10 ( Normal(x | mu = mean, sigma = sd) ) + * @param mean the desired mean of the Normal distribution + * @param sd the desired standard deviation of the Normal distribution + * @param x the value to evaluate + * @return a well-formed double + */ + + public static double normalDistributionLog10(final double mean, final double sd, final double x) { + if( sd < 0 ) + throw new IllegalArgumentException("sd: Standard deviation of normal must be >0"); + if ( ! wellFormedDouble(mean) || ! wellFormedDouble(sd) || ! 
wellFormedDouble(x) ) + throw new IllegalArgumentException("mean, sd, or, x : Normal parameters must be well formatted (non-INF, non-NAN)"); + final double a = -1.0 * Math.log10(sd * SQUARE_ROOT_OF_TWO_TIMES_PI); + final double b = -1.0 * (square(x - mean) / (2.0 * square(sd))) / NATURAL_LOG_OF_TEN; + return a + b; + } + + /** + * Calculate f(x) = x^2 + * @param x the value to square + * @return x * x + */ + public static double square(final double x) { + return x * x; + } + + /** + * Calculates the log10 of the binomial coefficient. Designed to prevent + * overflows even with very large numbers. + * + * @param n total number of trials + * @param k number of successes + * @return the log10 of the binomial coefficient + */ + public static double binomialCoefficient(final int n, final int k) { + return Math.pow(10, log10BinomialCoefficient(n, k)); + } + + /** + * @see #binomialCoefficient(int, int) with log10 applied to result + */ + public static double log10BinomialCoefficient(final int n, final int k) { + if ( n < 0 ) { + throw new IllegalArgumentException("n: Must have non-negative number of trials"); + } + if ( k > n || k < 0 ) { + throw new IllegalArgumentException("k: Must have non-negative number of successes, and no more successes than number of trials"); + } + + return log10Factorial(n) - log10Factorial(k) - log10Factorial(n - k); + } + + /** + * Computes a binomial probability. This is computed using the formula + *

+ * B(k; n; p) = [ n! / ( k! (n - k)! ) ] (p^k)( (1-p)^k ) + *

+ * where n is the number of trials, k is the number of successes, and p is the probability of success + * + * @param n number of Bernoulli trials + * @param k number of successes + * @param p probability of success + * @return the binomial probability of the specified configuration. Computes values down to about 1e-237. + */ + public static double binomialProbability(final int n, final int k, final double p) { + return Math.pow(10, log10BinomialProbability(n, k, Math.log10(p))); + } + + /** + * @see #binomialProbability(int, int, double) with log10 applied to result + */ + public static double log10BinomialProbability(final int n, final int k, final double log10p) { + if ( log10p > 1e-18 ) + throw new IllegalArgumentException("log10p: Log-probability must be 0 or less"); + double log10OneMinusP = Math.log10(1 - Math.pow(10, log10p)); + return log10BinomialCoefficient(n, k) + log10p * k + log10OneMinusP * (n - k); + } + + /** + * @see #binomialProbability(int, int, double) with p=0.5 + */ + public static double binomialProbability(final int n, final int k) { + return Math.pow(10, log10BinomialProbability(n, k)); + } + + /** + * @see #binomialProbability(int, int, double) with p=0.5 and log10 applied to result + */ + public static double log10BinomialProbability(final int n, final int k) { + return log10BinomialCoefficient(n, k) + (n * FAIR_BINOMIAL_PROB_LOG10_0_5); + } + + /** A memoization container for {@link #binomialCumulativeProbability(int, int, int)}. Synchronized to accomodate multithreading. */ + private static final Map BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE = + Collections.synchronizedMap(new LRUCache(10_000)); + + /** + * Primitive integer-triplet bijection into long. Returns null when the bijection function fails (in lieu of an exception), which will + * happen when: any value is negative or larger than a short. This method is optimized for speed; it is not intended to serve as a + * utility function. 
+ */ + static Long fastGenerateUniqueHashFromThreeIntegers(final int one, final int two, final int three) { + if (one < 0 || two < 0 || three < 0 || Short.MAX_VALUE < one || Short.MAX_VALUE < two || Short.MAX_VALUE < three) { + return null; + } else { + long result = 0; + result += (short) one; + result <<= 16; + result += (short) two; + result <<= 16; + result += (short) three; + return result; + } + } + + /** + * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space. + * Assumes that the probability of a successful hit is fair (i.e. 0.5). + * + * This pure function is memoized because of its expensive BigDecimal calculations. + * + * @param n number of attempts for the number of hits + * @param k_start start (inclusive) of the cumulant sum (over hits) + * @param k_end end (inclusive) of the cumulant sum (over hits) + * @return - returns the cumulative probability + */ + public static double binomialCumulativeProbability(final int n, final int k_start, final int k_end) { + if ( k_end > n ) + throw new IllegalArgumentException(String.format("Value for k_end (%d) is greater than n (%d)", k_end, n)); + + // Fetch cached value, if applicable. 
+ final Long memoizationKey = fastGenerateUniqueHashFromThreeIntegers(n, k_start, k_end); + final Double memoizationCacheResult; + if (memoizationKey != null) { + memoizationCacheResult = BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE.get(memoizationKey); + } else { + memoizationCacheResult = null; + } + + final double result; + if (memoizationCacheResult != null) { + result = memoizationCacheResult; + } else { + double cumProb = 0.0; + double prevProb; + BigDecimal probCache = BigDecimal.ZERO; + + for (int hits = k_start; hits <= k_end; hits++) { + prevProb = cumProb; + final double probability = binomialProbability(n, hits); + cumProb += probability; + if (probability > 0 && cumProb - prevProb < probability / 2) { // loss of precision + probCache = probCache.add(new BigDecimal(prevProb)); + cumProb = 0.0; + hits--; // repeat loop + // prevProb changes at start of loop + } + } + + result = probCache.add(new BigDecimal(cumProb)).doubleValue(); + if (memoizationKey != null) { + BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE.put(memoizationKey, result); + } + } + return result; + } + + private static final double LOG1MEXP_THRESHOLD = Math.log(0.5); + + private static final double LN_10 = Math.log(10); + + /** + * Calculates {@code log(1-exp(a))} without loosing precision. + * + *

+ * This is based on the approach described in: + * + *

+ *

+ * Maechler M, Accurately Computing log(1-exp(-|a|)) Assessed by the Rmpfr package, 2012
+ * Online document. + * + *

+ * + * @param a the input exponent. + * @return {@link Double#NaN NaN} if {@code a > 0}, otherwise the corresponding value. + */ + public static double log1mexp(final double a) { + if (a > 0) return Double.NaN; + if (a == 0) return Double.NEGATIVE_INFINITY; + + return (a < LOG1MEXP_THRESHOLD) ? Math.log1p(-Math.exp(a)) : Math.log(-Math.expm1(a)); + } + + /** + * Calculates {@code log10(1-10^a)} without loosing precision. + * + *

+ * This is based on the approach described in: + * + *

+ *

+ * Maechler M, Accurately Computing log(1-exp(-|a|)) Assessed by the Rmpfr package, 2012
+ * Online document. + *

+ * + * @param a the input exponent. + * @return {@link Double#NaN NaN} if {@code a > 0}, otherwise the corresponding value. + */ + public static double log10OneMinusPow10(final double a) { + if (a > 0) return Double.NaN; + if (a == 0) return Double.NEGATIVE_INFINITY; + final double b = a * LN_10; + return log1mexp(b) / LN_10; + } + + /** + * Calculates the log10 of the multinomial coefficient. Designed to prevent + * overflows even with very large numbers. + * + * @param n total number of trials + * @param k array of any size with the number of successes for each grouping (k1, k2, k3, ..., km) + * @return {@link Double#NaN NaN} if {@code a > 0}, otherwise the corresponding value. + */ + public static double log10MultinomialCoefficient(final int n, final int[] k) { + if ( n < 0 ) + throw new IllegalArgumentException("n: Must have non-negative number of trials"); + double denominator = 0.0; + int sum = 0; + for (int x : k) { + if ( x < 0 ) + throw new IllegalArgumentException("x element of k: Must have non-negative observations of group"); + if ( x > n ) + throw new IllegalArgumentException("x element of k, n: Group observations must be bounded by k"); + denominator += log10Factorial(x); + sum += x; + } + if ( sum != n ) + throw new IllegalArgumentException("k and n: Sum of observations in multinomial must sum to total number of trials"); + return log10Factorial(n) - denominator; + } + + /** + * Computes the log10 of the multinomial distribution probability given a vector + * of log10 probabilities. Designed to prevent overflows even with very large numbers. 
+ * + * @param n number of trials + * @param k array of number of successes for each possibility + * @param log10p array of log10 probabilities + * @return + */ + public static double log10MultinomialProbability(final int n, final int[] k, final double[] log10p) { + if (log10p.length != k.length) + throw new IllegalArgumentException("p and k: Array of log10 probabilities must have the same size as the array of number of sucesses: " + log10p.length + ", " + k.length); + double log10Prod = 0.0; + for (int i = 0; i < log10p.length; i++) { + if ( log10p[i] > 1e-18 ) + throw new IllegalArgumentException("log10p: Log-probability must be <= 0"); + log10Prod += log10p[i] * k[i]; + } + return log10MultinomialCoefficient(n, k) + log10Prod; + } + + /** + * Computes a multinomial coefficient efficiently avoiding overflow even for large numbers. + * This is computed using the formula: + *

+ * M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ] + *

+ * where xi represents the number of times outcome i was observed, n is the number of total observations. + * In this implementation, the value of n is inferred as the sum over i of xi. + * + * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed + * @return the multinomial of the specified configuration. + */ + public static double multinomialCoefficient(final int[] k) { + int n = 0; + for (int xi : k) { + n += xi; + } + + return Math.pow(10, log10MultinomialCoefficient(n, k)); + } + + /** + * Computes a multinomial probability efficiently avoiding overflow even for large numbers. + * This is computed using the formula: + *

+ * M(x1,x2,...,xk; n; p1,p2,...,pk) = [ n! / (x1! x2! ... xk!) ] (p1^x1)(p2^x2)(...)(pk^xk) + *

+ * where xi represents the number of times outcome i was observed, n is the number of total observations, and + * pi represents the probability of the i-th outcome to occur. In this implementation, the value of n is + * inferred as the sum over i of xi. + * + * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed + * @param p a double[] of probabilities, where each element represents the probability a given outcome can occur + * @return the multinomial probability of the specified configuration. + */ + public static double multinomialProbability(final int[] k, final double[] p) { + if (p.length != k.length) + throw new IllegalArgumentException("p and k: Array of log10 probabilities must have the same size as the array of number of sucesses: " + p.length + ", " + k.length); + + int n = 0; + double[] log10P = new double[p.length]; + for (int i = 0; i < p.length; i++) { + log10P[i] = Math.log10(p[i]); + n += k[i]; + } + return Math.pow(10, log10MultinomialProbability(n, k, log10P)); + } + + /** + * calculate the Root Mean Square of an array of integers + * + * @param x an byte[] of numbers + * @return the RMS of the specified numbers. + */ + public static double rms(final byte[] x) { + if (x.length == 0) + return 0.0; + + double rms = 0.0; + for (int i : x) + rms += i * i; + rms /= x.length; + return Math.sqrt(rms); + } + + /** + * calculate the Root Mean Square of an array of integers + * + * @param x an int[] of numbers + * @return the RMS of the specified numbers. + */ + public static double rms(final int[] x) { + if (x.length == 0) + return 0.0; + + double rms = 0.0; + for (int i : x) + rms += i * i; + rms /= x.length; + return Math.sqrt(rms); + } + + /** + * calculate the Root Mean Square of an array of doubles + * + * @param x a double[] of numbers + * @return the RMS of the specified numbers. 
+ */ + public static double rms(final Double[] x) { + if (x.length == 0) + return 0.0; + + double rms = 0.0; + for (Double i : x) + rms += i * i; + rms /= x.length; + return Math.sqrt(rms); + } + + public static double rms(final Collection l) { + if (l.size() == 0) + return 0.0; + + double rms = 0.0; + for (int i : l) + rms += i * i; + rms /= l.size(); + return Math.sqrt(rms); + } + + public static double distanceSquared(final double[] x, final double[] y) { + double dist = 0.0; + for (int iii = 0; iii < x.length; iii++) { + dist += (x[iii] - y[iii]) * (x[iii] - y[iii]); + } + return dist; + } + + public static double round(final double num, final int digits) { + double result = num * Math.pow(10.0, (double) digits); + result = Math.round(result); + result = result / Math.pow(10.0, (double) digits); + return result; + } + + /** + * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). + * + * @param array the array to be normalized + * @param takeLog10OfOutput if true, the output will be transformed back into log10 units + * @return a newly allocated array corresponding the normalized values in array, maybe log10 transformed + */ + public static double[] normalizeFromLog10(final double[] array, final boolean takeLog10OfOutput) { + return normalizeFromLog10(array, takeLog10OfOutput, false); + } + + /** + * See #normalizeFromLog10 but with the additional option to use an approximation that keeps the calculation always in log-space + * + * @param array + * @param takeLog10OfOutput + * @param keepInLogSpace + * + * @return + */ + public static double[] normalizeFromLog10(final double[] array, final boolean takeLog10OfOutput, final boolean keepInLogSpace) { + // for precision purposes, we need to add (or really subtract, since they're + // all negative) the largest value; also, we need to convert to normal-space. 
+ double maxValue = arrayMax(array); + + // we may decide to just normalize in log space without converting to linear space + if (keepInLogSpace) { + for (int i = 0; i < array.length; i++) { + array[i] -= maxValue; + } + return array; + } + + // default case: go to linear space + double[] normalized = new double[array.length]; + + for (int i = 0; i < array.length; i++) + normalized[i] = Math.pow(10, array[i] - maxValue); + + // normalize + double sum = 0.0; + for (int i = 0; i < array.length; i++) + sum += normalized[i]; + for (int i = 0; i < array.length; i++) { + double x = normalized[i] / sum; + if (takeLog10OfOutput) { + x = Math.log10(x); + if ( x < LOG10_P_OF_ZERO || Double.isInfinite(x) ) + x = array[i] - maxValue; + } + + normalized[i] = x; + } + + return normalized; + } + + /** + * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). + * + * @param array the array to be normalized + * @return a newly allocated array corresponding the normalized values in array + */ + public static double[] normalizeFromLog10(final double[] array) { + return normalizeFromLog10(array, false); + } + + /** + * normalizes the real-space probability array. + * + * Does not assume anything about the values in the array, beyond that no elements are below 0. It's ok + * to have values in the array of > 1, or have the sum go above 0. 
+ * + * @param array the array to be normalized + * @return a newly allocated array corresponding the normalized values in array + */ + @Requires("array != null") + @Ensures({"result != null"}) + public static double[] normalizeFromRealSpace(final double[] array) { + if ( array.length == 0 ) + return array; + + final double sum = sum(array); + final double[] normalized = new double[array.length]; + if ( sum < 0.0 ) throw new IllegalArgumentException("Values in probability array sum to a negative number " + sum); + for ( int i = 0; i < array.length; i++ ) { + normalized[i] = array[i] / sum; + } + return normalized; + } + + public static int maxElementIndex(final double[] array) { + return maxElementIndex(array, array.length); + } + + public static int maxElementIndex(final double[] array, final int endIndex) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int maxI = 0; + for (int i = 1; i < endIndex; i++) { + if (array[i] > array[maxI]) + maxI = i; + } + + return maxI; + } + + public static int maxElementIndex(final int[] array) { + return maxElementIndex(array, array.length); + } + + public static int maxElementIndex(final byte[] array) { + return maxElementIndex(array, array.length); + } + + public static int maxElementIndex(final int[] array, final int endIndex) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int maxI = 0; + for (int i = 1; i < endIndex; i++) { + if (array[i] > array[maxI]) + maxI = i; + } + + return maxI; + } + + public static int maxElementIndex(final byte[] array, final int endIndex) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int maxI = 0; + for (int i = 1; i < endIndex; i++) { + if (array[i] > array[maxI]) + maxI = i; + } + + return maxI; + } + + public static int arrayMax(final int[] array) { + return array[maxElementIndex(array)]; + } + + + public 
static double arrayMax(final double[] array) { + return array[maxElementIndex(array)]; + } + + public static double arrayMax(final double[] array, final int endIndex) { + return array[maxElementIndex(array, endIndex)]; + } + + public static double arrayMin(final double[] array) { + return array[minElementIndex(array)]; + } + + public static int arrayMin(final int[] array) { + return array[minElementIndex(array)]; + } + + public static byte arrayMin(final byte[] array) { + return array[minElementIndex(array)]; + } + + /** + * Compute the min element of a List + * @param array a non-empty list of integer + * @return the min + */ + public static int arrayMin(final List array) { + if ( array == null || array.isEmpty() ) throw new IllegalArgumentException("Array must be non-null and non-empty"); + int min = array.get(0); + for ( final int i : array ) + if ( i < min ) min = i; + return min; + } + + /** + * Compute the median element of the list of integers + * @param array a list of integers + * @return the median element + */ + public static > T median(final List array) { + /* TODO -- from Valentin + the current implementation is not the usual median when the input is of even length. More concretely it returns the ith element of the list where i = floor(input.size() / 2). + + But actually that is not the "usual" definition of a median, as it is supposed to return the average of the two middle values when the sample length is an even number (i.e. median(1,2,3,4,5,6) == 3.5). [Sources: R and wikipedia] + + My suggestion for a solution is then: + + unify median and medianDoubles to public static T median(Collection) + check on null elements and throw an exception if there are any or perhaps return a null; documented in the javadoc. + relocate, rename and refactor MathUtils.median(X) to Utils.ithElement(X,X.size()/2) + In addition, the current median implementation sorts the whole input list witch is O(n log n). 
However find out the ith element (thus calculate the median) can be done in O(n) + */ + if ( array == null ) throw new IllegalArgumentException("Array must be non-null"); + final int size = array.size(); + if ( size == 0 ) throw new IllegalArgumentException("Array cannot have size 0"); + else if ( size == 1 ) return array.get(0); + else { + final ArrayList sorted = new ArrayList<>(array); + Collections.sort(sorted); + return sorted.get(size / 2); + } + } + + public static int minElementIndex(final double[] array) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int minI = 0; + for (int i = 1; i < array.length; i++) { + if (array[i] < array[minI]) + minI = i; + } + + return minI; + } + + public static int minElementIndex(final byte[] array) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int minI = 0; + for (int i = 1; i < array.length; i++) { + if (array[i] < array[minI]) + minI = i; + } + + return minI; + } + + public static int minElementIndex(final int[] array) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int minI = 0; + for (int i = 1; i < array.length; i++) { + if (array[i] < array[minI]) + minI = i; + } + + return minI; + } + + public static int arrayMaxInt(final List array) { + if (array == null) + throw new IllegalArgumentException("Array cannot be null!"); + if (array.size() == 0) + throw new IllegalArgumentException("Array size cannot be 0!"); + + int m = array.get(0); + for (int e : array) + m = Math.max(m, e); + return m; + } + + public static int sum(final List list ) { + int sum = 0; + for ( Integer i : list ) { + sum += i; + } + return sum; + } + + public static double average(final List vals, final int maxI) { + long sum = 0L; + + int i = 0; + for (long x : vals) { + if (i > maxI) + break; + sum += x; + i++; + } + + return (1.0 * sum) / i; + } + + public static 
double average(final List vals) { + return average(vals, vals.size()); + } + + public static int countOccurrences(final char c, final String s) { + int count = 0; + for (int i = 0; i < s.length(); i++) { + count += s.charAt(i) == c ? 1 : 0; + } + return count; + } + + public static int countOccurrences(T x, List l) { + int count = 0; + for (T y : l) { + if (x.equals(y)) + count++; + } + + return count; + } + + public static int countOccurrences(byte element, byte[] array) { + int count = 0; + for (byte y : array) { + if (element == y) + count++; + } + + return count; + } + + public static int countOccurrences(final boolean element, final boolean[] array) { + int count = 0; + for (final boolean b : array) { + if (element == b) + count++; + } + + return count; + } + + + /** + * Returns n random indices drawn with replacement from the range 0..(k-1) + * + * @param n the total number of indices sampled from + * @param k the number of random indices to draw (with replacement) + * @return a list of k random indices ranging from 0 to (n-1) with possible duplicates + */ + static public ArrayList sampleIndicesWithReplacement(final int n, final int k) { + + ArrayList chosen_balls = new ArrayList(k); + for (int i = 0; i < k; i++) { + //Integer chosen_ball = balls[rand.nextInt(k)]; + chosen_balls.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(n)); + //balls.remove(chosen_ball); + } + + return chosen_balls; + } + + /** + * Returns n random indices drawn without replacement from the range 0..(k-1) + * + * @param n the total number of indices sampled from + * @param k the number of random indices to draw (without replacement) + * @return a list of k random indices ranging from 0 to (n-1) without duplicates + */ + static public ArrayList sampleIndicesWithoutReplacement(final int n, final int k) { + ArrayList chosen_balls = new ArrayList(k); + + for (int i = 0; i < n; i++) { + chosen_balls.add(i); + } + + Collections.shuffle(chosen_balls, 
GenomeAnalysisEngine.getRandomGenerator()); + + //return (ArrayList) chosen_balls.subList(0, k); + return new ArrayList(chosen_balls.subList(0, k)); + } + + /** + * Given a list of indices into a list, return those elements of the list with the possibility of drawing list elements multiple times + * + * @param indices the list of indices for elements to extract + * @param list the list from which the elements should be extracted + * @param the template type of the ArrayList + * @return a new ArrayList consisting of the elements at the specified indices + */ + static public ArrayList sliceListByIndices(final List indices, final List list) { + ArrayList subset = new ArrayList(); + + for (int i : indices) { + subset.add(list.get(i)); + } + + return subset; + } + + /** + * Given two log-probability vectors, compute log of vector product of them: + * in Matlab notation, return log10(10.*x'*10.^y) + * @param x vector 1 + * @param y vector 2 + * @return a double representing log (dotProd(10.^x,10.^y) + */ + public static double logDotProduct(final double [] x, final double[] y) { + if (x.length != y.length) + throw new ReviewedStingException("BUG: Vectors of different lengths"); + + double tmpVec[] = new double[x.length]; + + for (int k=0; k < tmpVec.length; k++ ) { + tmpVec[k] = x[k]+y[k]; + } + + return log10sumLog10(tmpVec); + + + + } + + /** + * Check that the log10 prob vector vector is well formed + * + * @param vector + * @param expectedSize + * @param shouldSumToOne + * + * @return true if vector is well-formed, false otherwise + */ + public static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { + if ( vector.length != expectedSize ) return false; + + for ( final double pr : vector ) { + if ( ! 
goodLog10Probability(pr) ) + return false; + } + + if ( shouldSumToOne && compareDoubles(sumLog10(vector), 1.0, 1e-4) != 0 ) + return false; + + return true; // everything is good + } + + /** + * Checks that the result is a well-formed log10 probability + * + * @param result a supposedly well-formed log10 probability value. By default allows + * -Infinity values, as log10(0.0) == -Infinity. + * @return true if result is really well formed + */ + public static boolean goodLog10Probability(final double result) { + return goodLog10Probability(result, true); + } + + /** + * Checks that the result is a well-formed log10 probability + * + * @param result a supposedly well-formed log10 probability value + * @param allowNegativeInfinity should we consider a -Infinity value ok? + * @return true if result is really well formed + */ + public static boolean goodLog10Probability(final double result, final boolean allowNegativeInfinity) { + return result <= 0.0 && result != Double.POSITIVE_INFINITY && (allowNegativeInfinity || result != Double.NEGATIVE_INFINITY) && ! Double.isNaN(result); + } + + /** + * Checks that the result is a well-formed probability + * + * @param result a supposedly well-formed probability value + * @return true if result is really well formed + */ + public static boolean goodProbability(final double result) { + return result >= 0.0 && result <= 1.0 && ! Double.isInfinite(result) && ! Double.isNaN(result); + } + + /** + * A utility class that computes on the fly average and standard deviation for a stream of numbers. + * The number of observations does not have to be known in advance, and can be also very big (so that + * it could overflow any naive summation-based scheme or cause loss of precision). + * Instead, adding a new number observed + * to a sample with add(observed) immediately updates the instance of this object so that + * it contains correct mean and standard deviation for all the numbers seen so far. Source: Knuth, vol.2 + * (see also e.g. 
http://www.johndcook.com/standard_deviation.html for online reference). + */ + public static class RunningAverage { + private double mean = 0.0; + private double s = 0.0; + private long obs_count = 0; + + public void add(double obs) { + obs_count++; + double oldMean = mean; + mean += (obs - mean) / obs_count; // update mean + s += (obs - oldMean) * (obs - mean); + } + + public void addAll(Collection col) { + for (Number o : col) { + add(o.doubleValue()); + } + } + + public double mean() { + return mean; + } + + public double stddev() { + return Math.sqrt(s / (obs_count - 1)); + } + + public double var() { + return s / (obs_count - 1); + } + + public long observationCount() { + return obs_count; + } + + public RunningAverage clone() { + RunningAverage ra = new RunningAverage(); + ra.mean = this.mean; + ra.s = this.s; + ra.obs_count = this.obs_count; + return ra; + } + + public void merge(RunningAverage other) { + if (this.obs_count > 0 || other.obs_count > 0) { // if we have any observations at all + this.mean = (this.mean * this.obs_count + other.mean * other.obs_count) / (this.obs_count + other.obs_count); + this.s += other.s; + } + this.obs_count += other.obs_count; + } + } + + // + // useful common utility routines + // + + static public double max(double x0, double x1, double x2) { + double a = Math.max(x0, x1); + return Math.max(a, x2); + } + + /** + * Converts LN to LOG10 + * + * @param ln log(x) + * @return log10(x) + */ + public static double lnToLog10(final double ln) { + return ln * Math.log10(Math.E); + } + + /** + * Constants to simplify the log gamma function calculation. 
+ */ + private static final double zero = 0.0, one = 1.0, half = .5, a0 = 7.72156649015328655494e-02, a1 = 3.22467033424113591611e-01, a2 = 6.73523010531292681824e-02, a3 = 2.05808084325167332806e-02, a4 = 7.38555086081402883957e-03, a5 = 2.89051383673415629091e-03, a6 = 1.19270763183362067845e-03, a7 = 5.10069792153511336608e-04, a8 = 2.20862790713908385557e-04, a9 = 1.08011567247583939954e-04, a10 = 2.52144565451257326939e-05, a11 = 4.48640949618915160150e-05, tc = 1.46163214496836224576e+00, tf = -1.21486290535849611461e-01, tt = -3.63867699703950536541e-18, t0 = 4.83836122723810047042e-01, t1 = -1.47587722994593911752e-01, t2 = 6.46249402391333854778e-02, t3 = -3.27885410759859649565e-02, t4 = 1.79706750811820387126e-02, t5 = -1.03142241298341437450e-02, t6 = 6.10053870246291332635e-03, t7 = -3.68452016781138256760e-03, t8 = 2.25964780900612472250e-03, t9 = -1.40346469989232843813e-03, t10 = 8.81081882437654011382e-04, t11 = -5.38595305356740546715e-04, t12 = 3.15632070903625950361e-04, t13 = -3.12754168375120860518e-04, t14 = 3.35529192635519073543e-04, u0 = -7.72156649015328655494e-02, u1 = 6.32827064025093366517e-01, u2 = 1.45492250137234768737e+00, u3 = 9.77717527963372745603e-01, u4 = 2.28963728064692451092e-01, u5 = 1.33810918536787660377e-02, v1 = 2.45597793713041134822e+00, v2 = 2.12848976379893395361e+00, v3 = 7.69285150456672783825e-01, v4 = 1.04222645593369134254e-01, v5 = 3.21709242282423911810e-03, s0 = -7.72156649015328655494e-02, s1 = 2.14982415960608852501e-01, s2 = 3.25778796408930981787e-01, s3 = 1.46350472652464452805e-01, s4 = 2.66422703033638609560e-02, s5 = 1.84028451407337715652e-03, s6 = 3.19475326584100867617e-05, r1 = 1.39200533467621045958e+00, r2 = 7.21935547567138069525e-01, r3 = 1.71933865632803078993e-01, r4 = 1.86459191715652901344e-02, r5 = 7.77942496381893596434e-04, r6 = 7.32668430744625636189e-06, w0 = 4.18938533204672725052e-01, w1 = 8.33333333333329678849e-02, w2 = -2.77777777728775536470e-03, w3 = 
7.93650558643019558500e-04, w4 = -5.95187557450339963135e-04, w5 = 8.36339918996282139126e-04, w6 = -1.63092934096575273989e-03; + + /** + * Efficient rounding functions to simplify the log gamma function calculation + * double to long with 32 bit shift + */ + private static final int HI(final double x) { + return (int) (Double.doubleToLongBits(x) >> 32); + } + + /** + * Efficient rounding functions to simplify the log gamma function calculation + * double to long without shift + */ + private static final int LO(final double x) { + return (int) Double.doubleToLongBits(x); + } + + /** + * Most efficent implementation of the lnGamma (FDLIBM) + * Use via the log10Gamma wrapper method. + */ + private static double lnGamma(final double x) { + double t, y, z, p, p1, p2, p3, q, r, w; + int i; + + int hx = HI(x); + int lx = LO(x); + + /* purge off +-inf, NaN, +-0, and negative arguments */ + int ix = hx & 0x7fffffff; + if (ix >= 0x7ff00000) + return Double.POSITIVE_INFINITY; + if ((ix | lx) == 0 || hx < 0) + return Double.NaN; + if (ix < 0x3b900000) { /* |x|<2**-70, return -log(|x|) */ + return -Math.log(x); + } + + /* purge off 1 and 2 */ + if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0)) + r = 0; + /* for x < 2.0 */ + else if (ix < 0x40000000) { + if (ix <= 0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ + r = -Math.log(x); + if (ix >= 0x3FE76944) { + y = one - x; + i = 0; + } + else if (ix >= 0x3FCDA661) { + y = x - (tc - one); + i = 1; + } + else { + y = x; + i = 2; + } + } + else { + r = zero; + if (ix >= 0x3FFBB4C3) { + y = 2.0 - x; + i = 0; + } /* [1.7316,2] */ + else if (ix >= 0x3FF3B4C4) { + y = x - tc; + i = 1; + } /* [1.23,1.73] */ + else { + y = x - one; + i = 2; + } + } + + switch (i) { + case 0: + z = y * y; + p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); + p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); + p = y * p1 + p2; + r += (p - 0.5 * y); + break; + case 1: + z = y * y; + w = z * y; + p1 = 
t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); /* parallel comp */ + p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); + p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); + p = z * p1 - (tt - w * (p2 + y * p3)); + r += (tf + p); + break; + case 2: + p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); + p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); + r += (-0.5 * y + p1 / p2); + } + } + else if (ix < 0x40200000) { /* x < 8.0 */ + i = (int) x; + t = zero; + y = x - (double) i; + p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); + q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); + r = half * y + p / q; + z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ + switch (i) { + case 7: + z *= (y + 6.0); /* FALLTHRU */ + case 6: + z *= (y + 5.0); /* FALLTHRU */ + case 5: + z *= (y + 4.0); /* FALLTHRU */ + case 4: + z *= (y + 3.0); /* FALLTHRU */ + case 3: + z *= (y + 2.0); /* FALLTHRU */ + r += Math.log(z); + break; + } + /* 8.0 <= x < 2**58 */ + } + else if (ix < 0x43900000) { + t = Math.log(x); + z = one / x; + y = z * z; + w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); + r = (x - half) * (t - one) + w; + } + else + /* 2**58 <= x <= inf */ + r = x * (Math.log(x) - one); + return r; + } + + /** + * Calculates the log10 of the gamma function for x using the efficient FDLIBM + * implementation to avoid overflows and guarantees high accuracy even for large + * numbers. + * + * @param x the x parameter + * @return the log10 of the gamma function at x. 
+ */ + public static double log10Gamma(final double x) { + return lnToLog10(lnGamma(x)); + } + + public static double factorial(final int x) { + // avoid rounding errors caused by fact that 10^log(x) might be slightly lower than x and flooring may produce 1 less than real value + return (double)Math.round(Math.pow(10, log10Factorial(x))); + } + + public static double log10Factorial(final int x) { + if (x >= log10FactorialCache.length || x < 0) + return log10Gamma(x + 1); + else + return log10FactorialCache[x]; + } + + /** + * Adds two arrays together and returns a new array with the sum. + * + * @param a one array + * @param b another array + * @return a new array with the sum of a and b + */ + @Requires("a.length == b.length") + @Ensures("result.length == a.length") + public static int[] addArrays(final int[] a, final int[] b) { + int[] c = new int[a.length]; + for (int i = 0; i < a.length; i++) + c[i] = a[i] + b[i]; + return c; + } + + /** Same routine, unboxed types for efficiency + * + * @param x First vector + * @param y Second vector + * @return Vector of same length as x and y so that z[k] = x[k]+y[k] + */ + public static double[] vectorSum(final double[]x, final double[] y) { + if (x.length != y.length) + throw new ReviewedStingException("BUG: Lengths of x and y must be the same"); + + double[] result = new double[x.length]; + for (int k=0; k log10LinearRange(final int start, final int stop, final double eps) { + final LinkedList values = new LinkedList<>(); + final double log10range = Math.log10(stop - start); + + if ( start == 0 ) + values.add(0); + + double i = 0.0; + while ( i <= log10range ) { + final int index = (int)Math.round(Math.pow(10, i)) + start; + if ( index < stop && (values.peekLast() == null || values.peekLast() != index ) ) + values.add(index); + i += eps; + } + + if ( values.peekLast() == null || values.peekLast() != stop ) + values.add(stop); + + return values; + } + + /** + * Compute in a numerical correct way the quantity log10(1-x) + 
* + * Uses the approximation log10(1-x) = log10(1/x - 1) + log10(x) to avoid very quick underflow + * in 1-x when x is very small + * + * @param x a positive double value between 0.0 and 1.0 + * @return an estimate of log10(1-x) + */ + @Requires("x >= 0.0 && x <= 1.0") + @Ensures("result <= 0.0") + public static double log10OneMinusX(final double x) { + if ( x == 1.0 ) + return Double.NEGATIVE_INFINITY; + else if ( x == 0.0 ) + return 0.0; + else { + final double d = Math.log10(1 / x - 1) + Math.log10(x); + return Double.isInfinite(d) || d > 0.0 ? 0.0 : d; + } + } + + /** + * Draw N random elements from list + * @param list - the list from which to draw randomly + * @param N - the number of elements to draw + */ + public static List randomSubset(final List list, final int N) { + if (list.size() <= N) { + return list; + } + + return sliceListByIndices(sampleIndicesWithoutReplacement(list.size(),N),list); + } + + /** + * Draw N random elements from list with replacement + * @param list - the list from which to draw randomly + * @param N - the number of elements to draw + */ + public static List randomSample(final List list, final int N) { + return sliceListByIndices(sampleIndicesWithReplacement(list.size(),N),list); + } + + /** + * Return the likelihood of observing the counts of categories having sampled a population + * whose categorial frequencies are distributed according to a Dirichlet distribution + * @param dirichletParams - params of the prior dirichlet distribution + * @param dirichletSum - the sum of those parameters + * @param counts - the counts of observation in each category + * @param countSum - the sum of counts (number of trials) + * @return - associated likelihood + */ + public static double dirichletMultinomial(final double[] dirichletParams, final double dirichletSum, + final int[] counts, final int countSum) { + if ( dirichletParams.length != counts.length ) { + throw new IllegalStateException("The number of dirichlet parameters must match the 
number of categories"); + } + // todo -- lots of lnGammas here. At some point we can safely switch to x * ( ln(x) - 1) + double likelihood = log10MultinomialCoefficient(countSum,counts); + likelihood += log10Gamma(dirichletSum); + likelihood -= log10Gamma(dirichletSum+countSum); + for ( int idx = 0; idx < counts.length; idx++ ) { + likelihood += log10Gamma(counts[idx] + dirichletParams[idx]); + likelihood -= log10Gamma(dirichletParams[idx]); + } + + return likelihood; + } + + public static double dirichletMultinomial(double[] params, int[] counts) { + return dirichletMultinomial(params,sum(params),counts,(int) sum(counts)); + } + + public static ExponentialDistribution exponentialDistribution( final double mean ) { + return new ExponentialDistributionImpl(mean); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/Median.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/Median.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/Median.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/Median.java diff --git a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MendelianViolation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MendelianViolation.java diff --git a/public/java/src/org/broadinstitute/sting/utils/MultiThreadedErrorTracker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MultiThreadedErrorTracker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/MultiThreadedErrorTracker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MultiThreadedErrorTracker.java diff --git a/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/NGSPlatform.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/NGSPlatform.java diff --git a/public/java/src/org/broadinstitute/sting/utils/PathUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/PathUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/PathUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/PathUtils.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/QualityUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/QualityUtils.java new file mode 100644 index 000000000..543923dd6 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/QualityUtils.java @@ -0,0 +1,397 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils; + +import com.google.java.contract.Ensures; +import net.sf.samtools.SAMUtils; + +/** + * QualityUtils is a static class (no instantiation allowed!) with some utility methods for manipulating + * quality scores. + * + * @author Kiran Garimella, Mark DePristo + * @since Way back + */ +public class QualityUtils { + /** + * Maximum quality score that can be encoded in a SAM/BAM file + */ + public final static byte MAX_SAM_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; + + + private final static double RAW_MIN_PHRED_SCALED_QUAL = Math.log10(Double.MIN_VALUE); + protected final static double MIN_PHRED_SCALED_QUAL = -10.0 * RAW_MIN_PHRED_SCALED_QUAL; + + /** + * bams containing quals above this value are extremely suspicious and we should warn the user + */ + public final static byte MAX_REASONABLE_Q_SCORE = 60; + + /** + * The lowest quality score for a base that is considered reasonable for statistical analysis. This is + * because Q 6 => you stand a 25% of being right, which means all bases are equally likely + */ + public final static byte MIN_USABLE_Q_SCORE = 6; + public final static int MAPPING_QUALITY_UNAVAILABLE = 255; + + /** + * Maximum sense quality value. + */ + public static final int MAX_QUAL = 254; + + /** + * Cached values for qual as byte calculations so they are very fast + */ + private static double qualToErrorProbCache[] = new double[MAX_QUAL + 1]; + private static double qualToProbLog10Cache[] = new double[MAX_QUAL + 1]; + + + static { + for (int i = 0; i <= MAX_QUAL; i++) { + qualToErrorProbCache[i] = qualToErrorProb((double) i); + qualToProbLog10Cache[i] = Math.log10(1.0 - qualToErrorProbCache[i]); + } + } + + /** + * Private constructor. 
No instantiating this class! + */ + private QualityUtils() {} + + // ---------------------------------------------------------------------- + // + // These are all functions to convert a phred-scaled quality score to a probability + // + // ---------------------------------------------------------------------- + + /** + * Convert a phred-scaled quality score to its probability of being true (Q30 => 0.999) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * Because the input is a discretized byte value, this function uses a cache so is very efficient + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param qual a quality score (0-255) + * @return a probability (0.0-1.0) + */ + @Ensures("result >= 0.0 && result <= 1.0") + public static double qualToProb(final byte qual) { + return 1.0 - qualToErrorProb(qual); + } + + /** + * Convert a phred-scaled quality score to its probability of being true (Q30 => 0.999) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * Because the input is a double value, this function must call Math.pow so can be quite expensive + * + * @param qual a phred-scaled quality score encoded as a double. Can be non-integer values (30.5) + * @return a probability (0.0-1.0) + */ + @Ensures("result >= 0.0 && result <= 1.0") + public static double qualToProb(final double qual) { + if ( qual < 0.0 ) throw new IllegalArgumentException("qual must be >= 0.0 but got " + qual); + return 1.0 - qualToErrorProb(qual); + } + + /** + * Convert a phred-scaled quality score to its log10 probability of being true (Q30 => log10(0.999)) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. 
+ * + * Because the input is a double value, this function must call Math.pow so can be quite expensive + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param qual a phred-scaled quality score encoded as a double. Can be non-integer values (30.5) + * @return a probability (0.0-1.0) + */ + @Ensures("result <= 0.0") + public static double qualToProbLog10(final byte qual) { + return qualToProbLog10Cache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. + } + + /** + * Convert a phred-scaled quality score to its probability of being wrong (Q30 => 0.001) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * Because the input is a double value, this function must call Math.pow so can be quite expensive + * + * @param qual a phred-scaled quality score encoded as a double. Can be non-integer values (30.5) + * @return a probability (0.0-1.0) + */ + @Ensures("result >= 0.0 && result <= 1.0") + public static double qualToErrorProb(final double qual) { + if ( qual < 0.0 ) throw new IllegalArgumentException("qual must be >= 0.0 but got " + qual); + return Math.pow(10.0, qual / -10.0); + } + + /** + * Convert a phred-scaled quality score to its probability of being wrong (Q30 => 0.001) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * Because the input is a byte value, this function uses a cache so is very efficient + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. 
The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param qual a phred-scaled quality score encoded as a byte + * @return a probability (0.0-1.0) + */ + @Ensures("result >= 0.0 && result <= 1.0") + public static double qualToErrorProb(final byte qual) { + return qualToErrorProbCache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. + } + + + /** + * Convert a phred-scaled quality score to its log10 probability of being wrong (Q30 => log10(0.001)) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * The calculation is extremely efficient + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param qual a phred-scaled quality score encoded as a byte + * @return a probability (0.0-1.0) + */ + @Ensures("result <= 0.0") + public static double qualToErrorProbLog10(final byte qual) { + return qualToErrorProbLog10((double)(qual & 0xFF)); + } + + /** + * Convert a phred-scaled quality score to its log10 probability of being wrong (Q30 => log10(0.001)) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * The calculation is extremely efficient + * + * @param qual a phred-scaled quality score encoded as a double + * @return a probability (0.0-1.0) + */ + @Ensures("result <= 0.0") + public static double qualToErrorProbLog10(final double qual) { + if ( qual < 0.0 ) throw new IllegalArgumentException("qual must be >= 0.0 but got " + qual); + return qual / -10.0; + } + + // ---------------------------------------------------------------------- + // + // Functions to convert a probability to a phred-scaled quality score + // + // ---------------------------------------------------------------------- + + /** + * Convert a probability of being wrong to a phred-scaled quality score (0.01 => 20). 
+ * + * Note, this function caps the resulting quality score by the public static value MAX_SAM_QUAL_SCORE + * and by 1 at the low-end. + * + * @param errorRate a probability (0.0-1.0) of being wrong (i.e., 0.01 is 1% chance of being wrong) + * @return a quality score (0-MAX_SAM_QUAL_SCORE) + */ + public static byte errorProbToQual(final double errorRate) { + return errorProbToQual(errorRate, MAX_SAM_QUAL_SCORE); + } + + /** + * Convert a probability of being wrong to a phred-scaled quality score (0.01 => 20). + * + * Note, this function caps the resulting quality score by the provided maxQual argument + * and by 1 at the low-end. + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param errorRate a probability (0.0-1.0) of being wrong (i.e., 0.01 is 1% chance of being wrong) + * @return a quality score (0-maxQual) + */ + public static byte errorProbToQual(final double errorRate, final byte maxQual) { + if ( ! MathUtils.goodProbability(errorRate) ) throw new IllegalArgumentException("errorRate must be good probability but got " + errorRate); + final double d = Math.round(-10.0*Math.log10(errorRate)); + return boundQual((int)d, maxQual); + } + + /** + * @see #errorProbToQual(double, byte) with proper conversion of maxQual integer to a byte + */ + public static byte errorProbToQual(final double prob, final int maxQual) { + if ( maxQual < 0 || maxQual > 255 ) throw new IllegalArgumentException("maxQual must be between 0-255 but got " + maxQual); + return errorProbToQual(prob, (byte)(maxQual & 0xFF)); + } + + /** + * Convert a probability of being right to a phred-scaled quality score (0.99 => 20). + * + * Note, this function caps the resulting quality score by the public static value MAX_SAM_QUAL_SCORE + * and by 1 at the low-end. 
+ * + * @param prob a probability (0.0-1.0) of being right + * @return a quality score (0-MAX_SAM_QUAL_SCORE) + */ + public static byte trueProbToQual(final double prob) { + return trueProbToQual(prob, MAX_SAM_QUAL_SCORE); + } + + /** + * Convert a probability of being right to a phred-scaled quality score (0.99 => 20). + * + * Note, this function caps the resulting quality score by the min probability allowed (EPS). + * So for example, if prob is 1e-6, which would imply a Q-score of 60, and EPS is 1e-4, + * the result of this function is actually Q40. + * + * Note that the resulting quality score, regardless of EPS, is capped by MAX_SAM_QUAL_SCORE and + * bounded on the low-side by 1. + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param trueProb a probability (0.0-1.0) of being right + * @param maxQual the maximum quality score we are allowed to emit here, regardless of the error rate + * @return a phred-scaled quality score (0-maxQualScore) as a byte + */ + @Ensures("(result & 0xFF) >= 1 && (result & 0xFF) <= (maxQual & 0xFF)") + public static byte trueProbToQual(final double trueProb, final byte maxQual) { + if ( ! 
MathUtils.goodProbability(trueProb) ) throw new IllegalArgumentException("trueProb must be good probability but got " + trueProb); + final double lp = Math.round(-10.0*MathUtils.log10OneMinusX(trueProb)); + return boundQual((int)lp, maxQual); + } + + /** + * @see #trueProbToQual(double, byte) with proper conversion of maxQual to a byte + */ + public static byte trueProbToQual(final double prob, final int maxQual) { + if ( maxQual < 0 || maxQual > 255 ) throw new IllegalArgumentException("maxQual must be between 0-255 but got " + maxQual); + return trueProbToQual(prob, (byte)(maxQual & 0xFF)); + } + + /** + * Convert a probability of being right to a phred-scaled quality score of being wrong as a double + * + * This is a very generic method, that simply computes a phred-scaled double quality + * score given an error rate. It has the same precision as a normal double operation + * + * @param trueRate the probability of being right (0.0-1.0) + * @return a phred-scaled version of the error rate implied by trueRate + */ + @Ensures("result >= 0.0") + public static double phredScaleCorrectRate(final double trueRate) { + return phredScaleLog10ErrorRate(MathUtils.log10OneMinusX(trueRate)); + } + + /** + * Convert a log10 probability of being right to a phred-scaled quality score of being wrong as a double + * + * This is a very generic method, that simply computes a phred-scaled double quality + * score given an error rate. It has the same precision as a normal double operation + * + * @param trueRateLog10 the log10 probability of being right (0.0-1.0). 
Can be -Infinity to indicate + * that the result is impossible in which MIN_PHRED_SCALED_QUAL is returned + * @return a phred-scaled version of the error rate implied by trueRate + */ + @Ensures("result >= 0.0") + public static double phredScaleLog10CorrectRate(final double trueRateLog10) { + return phredScaleCorrectRate(Math.pow(10.0, trueRateLog10)); + } + + /** + * Convert a probability of being wrong to a phred-scaled quality score of being wrong as a double + * + * This is a very generic method, that simply computes a phred-scaled double quality + * score given an error rate. It has the same precision as a normal double operation + * + * @param errorRate the probability of being wrong (0.0-1.0) + * @return a phred-scaled version of the error rate + */ + @Ensures("result >= 0.0") + public static double phredScaleErrorRate(final double errorRate) { + return phredScaleLog10ErrorRate(Math.log10(errorRate)); + } + + /** + * Convert a log10 probability of being wrong to a phred-scaled quality score of being wrong as a double + * + * This is a very generic method, that simply computes a phred-scaled double quality + * score given an error rate. It has the same precision as a normal double operation + * + * @param errorRateLog10 the log10 probability of being wrong (0.0-1.0). Can be -Infinity, in which case + * the result is MIN_PHRED_SCALED_QUAL + * @return a phred-scaled version of the error rate + */ + @Ensures("result >= 0.0") + public static double phredScaleLog10ErrorRate(final double errorRateLog10) { + if ( ! 
MathUtils.goodLog10Probability(errorRateLog10) ) throw new IllegalArgumentException("errorRateLog10 must be good probability but got " + errorRateLog10); + // abs is necessary for edge base with errorRateLog10 = 0 producing -0.0 doubles + return Math.abs(-10.0 * Math.max(errorRateLog10, RAW_MIN_PHRED_SCALED_QUAL)); + } + + // ---------------------------------------------------------------------- + // + // Routines to bound a quality score to a reasonable range + // + // ---------------------------------------------------------------------- + + /** + * Return a quality score that bounds qual by MAX_SAM_QUAL_SCORE and 1 + * + * @param qual the uncapped quality score as an integer + * @return the bounded quality score + */ + @Ensures("(result & 0xFF) >= 1 && (result & 0xFF) <= (MAX_SAM_QUAL_SCORE & 0xFF)") + public static byte boundQual(int qual) { + return boundQual(qual, MAX_SAM_QUAL_SCORE); + } + + /** + * Return a quality score that bounds qual by maxQual and 1 + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param qual the uncapped quality score as an integer. 
Can be < 0 (which may indicate an error in the + * client code), which will be brought back to 1, but this isn't an error, as some + * routines may use this functionality (BaseRecalibrator, for example) + * @param maxQual the maximum quality score, must be less < 255 + * @return the bounded quality score + */ + @Ensures("(result & 0xFF) >= 1 && (result & 0xFF) <= (maxQual & 0xFF)") + public static byte boundQual(final int qual, final byte maxQual) { + return (byte) (Math.max(Math.min(qual, maxQual & 0xFF), 1) & 0xFF); + } + + } + + diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/R/RScriptExecutor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/R/RScriptExecutor.java diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutorException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/R/RScriptExecutorException.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutorException.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/R/RScriptExecutorException.java diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RScriptLibrary.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/R/RScriptLibrary.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/R/RScriptLibrary.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/R/RScriptLibrary.java diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/R/RUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/R/RUtils.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/R/RUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SampleUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/SampleUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SampleUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SimpleTimer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SimpleTimer.java new file mode 100644 index 000000000..59516f196 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SimpleTimer.java @@ -0,0 +1,261 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils; + + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import org.apache.log4j.Logger; + +import java.text.NumberFormat; +import java.util.concurrent.TimeUnit; +import static java.lang.Math.abs; + +/** + * A useful simple system for timing code with nano second resolution + * + * Note that this code is not thread-safe. If you have a single timer + * being started and stopped by multiple threads you will need to protect the + * calls to avoid meaningless results of having multiple starts and stops + * called sequentially. + * + * This timer has been modified to provide better semantics for dealing with + * system-level checkpoint and restarting. Such events can cause the internal JVM + * clock to be reset, breaking timings based upon it. Whilst this is difficult to + * counter without getting explicit notice of checkpoint events, we try to moderate + * the symptoms through tracking the offset between the system clock and the JVM clock. + * If this offset grows drastically (greater than CLOCK_DRIFT), we infer a JVM restart + * and reset the timer. 
+ * + * User: depristo + * Date: Dec 10, 2010 + * Time: 9:07:44 AM + */ +public class SimpleTimer { + private final static Logger logger = Logger.getLogger(SimpleTimer.class); + protected static final double NANO_TO_SECOND_DOUBLE = 1.0 / TimeUnit.SECONDS.toNanos(1); + private static final long MILLI_TO_NANO= TimeUnit.MILLISECONDS.toNanos(1); + private static final ThreadLocal NUMBER_FORMAT = new ThreadLocal() { + @Override + protected NumberFormat initialValue() { + return NumberFormat.getIntegerInstance(); + } + }; + + /** + * Allowable clock drift in nanoseconds. + */ + private static final long CLOCK_DRIFT = TimeUnit.SECONDS.toNanos(5); + private final String name; + + /** + * The difference between system time and JVM time at last sync. + * This is used to detect JVM checkpoint/restart events, and should be + * reset when a JVM checkpoint/restart is detected. + */ + private long nanoTimeOffset; + + /** + * The elapsedTimeNano time in nanoSeconds of this timer. The elapsedTimeNano time is the + * sum of times between starts/restrats and stops. + */ + private long elapsedTimeNano = 0l; + + /** + * The start time of the last start/restart in nanoSeconds + */ + private long startTimeNano = 0l; + + /** + * Is this timer currently running (i.e., the last call was start/restart) + */ + private boolean running = false; + + /** + * Creates an anonymous simple timer + */ + public SimpleTimer() { + this("Anonymous"); + } + + /** + * Creates a simple timer named name + * @param name of the timer, must not be null + */ + public SimpleTimer(final String name) { + if ( name == null ) throw new IllegalArgumentException("SimpleTimer name cannot be null"); + this.name = name; + + this.nanoTimeOffset = getNanoOffset(); + } + + /** + * @return the name associated with this timer + */ + public synchronized String getName() { + return name; + } + + /** + * Starts the timer running, and sets the elapsedTimeNano time to 0. 
This is equivalent to + * resetting the time to have no history at all. + * + * @return this object, for programming convenience + */ + @Ensures("elapsedTimeNano == 0l") + public synchronized SimpleTimer start() { + elapsedTimeNano = 0l; + return restart(); + } + + /** + * Starts the timer running, without resetting the elapsedTimeNano time. This function may be + * called without first calling start(). The only difference between start and restart + * is that start resets the elapsedTimeNano time, while restart does not. + * + * @return this object, for programming convenience + */ + public synchronized SimpleTimer restart() { + running = true; + startTimeNano = currentTimeNano(); + nanoTimeOffset = getNanoOffset(); + return this; + } + + /** + * @return is this timer running? + */ + public synchronized boolean isRunning() { + return running; + } + + /** + * @return A convenience function to obtain the current time in milliseconds from this timer + */ + public long currentTime() { + return System.currentTimeMillis(); + } + + /** + * @return A convenience function to obtain the current time in nanoSeconds from this timer + */ + public long currentTimeNano() { + return System.nanoTime(); + } + + /** + * Stops the timer. Increases the elapsedTimeNano time by difference between start and now. + * This method calls `ensureClockSync` to make sure that the JVM and system clocks + * are roughly in sync since the start of the timer. If they are not, then the time + * elapsed since the previous 'stop' will not be added to the timer. + * + * It's ok to call stop on a timer that's not running. It has no effect on the timer. 
+ * + * @return this object, for programming convenience + */ + @Requires("startTimeNano != 0l") + public synchronized SimpleTimer stop() { + if ( running ) { + running = false; + if (ensureClockSync()) { + elapsedTimeNano += currentTimeNano() - startTimeNano; + } + } + return this; + } + + /** + * Returns the total elapsedTimeNano time of all start/stops of this timer. If the timer is currently + * running, includes the difference from currentTime() and the start as well + * + * @return this time, in seconds + */ + public synchronized double getElapsedTime() { + return nanoToSecondsAsDouble(getElapsedTimeNano()); + } + + protected static double nanoToSecondsAsDouble(final long nano) { + return nano * NANO_TO_SECOND_DOUBLE; + } + + /** + * @see #getElapsedTime() but returns the result in nanoseconds + * + * @return the elapsed time in nanoseconds + */ + public synchronized long getElapsedTimeNano() { + if (running && ensureClockSync()) { + return currentTimeNano() - startTimeNano + elapsedTimeNano; + } else { + return elapsedTimeNano; + } + } + + /** + * Add the elapsed time from toAdd to this elapsed time + * + * @param toAdd the timer whose elapsed time we want to add to this timer + */ + public synchronized void addElapsed(final SimpleTimer toAdd) { + elapsedTimeNano += toAdd.getElapsedTimeNano(); + } + + /** + * Get the current offset of nano time from system time. + */ + private static long getNanoOffset() { + return System.nanoTime() - (System.currentTimeMillis() * MILLI_TO_NANO); + } + + /** + * Ensure that the JVM time has remained in sync with system time. + * This will also reset the clocks to avoid gradual drift. 
+ * + * @return true if the clocks are in sync, false otherwise + */ + private boolean ensureClockSync() { + final long currentOffset = getNanoOffset(); + final long diff = abs(currentOffset - nanoTimeOffset); + final boolean ret = (diff <= CLOCK_DRIFT); + if (!ret) { + final NumberFormat numberFormat = NUMBER_FORMAT.get(); + final String msg = String.format( + "Clock drift of %s - %s = %s nanoseconds detected, vs. max allowable drift of %s. " + + "Assuming checkpoint/restart event.", + numberFormat.format(currentOffset), + numberFormat.format(nanoTimeOffset), + numberFormat.format(diff), + numberFormat.format(CLOCK_DRIFT)); + // Log message + logger.warn(msg); + } + // Reset the drift meter to stay in sync. + this.nanoTimeOffset = currentOffset; + return ret; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/UnvalidatingGenomeLoc.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/UnvalidatingGenomeLoc.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/UnvalidatingGenomeLoc.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/UnvalidatingGenomeLoc.java diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/Utils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/Utils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/Utils.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java new file mode 100644 index 000000000..0c819b4fb --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -0,0 +1,500 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, 
to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.activeregion; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.HasGenomeLocation; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.*; + +/** + * Represents a single active region created by the Active Region Traversal for processing + * + * An active region is a single contiguous span of bases on the genome that should be operated + * on as a single unit for the active region traversal. 
The action may contains a list of + * reads that overlap the region (may because there may be no reads in the region). The region + * is tagged as being either active or inactive, depending on the probabilities provided by + * the isActiveProb results from the ART walker. Each region carries with it the + * exact span of the region (bases which are the core of the isActiveProbs from the walker) as + * well as an extended size, that includes the ART walker's extension size. Reads in the region + * provided by ART include all reads overlapping the extended span, not the raw span. + * + * User: rpoplin + * Date: 1/4/12 + */ +@Invariant({ + "extension >= 0", + "activeRegionLoc != null", + "genomeLocParser != null", + "spanIncludingReads != null", + "extendedLoc != null" +}) +public class ActiveRegion implements HasGenomeLocation { + /** + * The reads included in this active region. May be empty upon creation, and expand / contract + * as reads are added or removed from this region. + */ + private final List reads = new ArrayList(); + + /** + * An ordered list (by genomic coordinate) of the ActivityProfileStates that went + * into this active region. May be empty, which says that no supporting states were + * provided when this region was created. + */ + private final List supportingStates; + + /** + * The raw span of this active region, not including the active region extension + */ + private final GenomeLoc activeRegionLoc; + + /** + * The span of this active region on the genome, including the active region extension + */ + private final GenomeLoc extendedLoc; + + /** + * The extension, in bp, of this active region. + */ + private final int extension; + + /** + * A genomeLocParser so we can create genomeLocs + */ + private final GenomeLocParser genomeLocParser; + + /** + * Does this region represent an active region (all isActiveProbs above threshold) or + * an inactive region (all isActiveProbs below threshold)? 
+ */ + private final boolean isActive; + + /** + * The span of this active region, including the bp covered by all reads in this + * region. This union of extensionLoc and the loc of all reads in this region. + * + * Must be at least as large as extendedLoc, but may be larger when reads + * partially overlap this region. + */ + private GenomeLoc spanIncludingReads; + + + /** + * Indicates whether the active region has been finalized + */ + private boolean hasBeenFinalized; + + /** + * Create a new ActiveRegion containing no reads + * + * @param activeRegionLoc the span of this active region + * @param supportingStates the states that went into creating this region, or null / empty if none are available. + * If not empty, must have exactly one state for each bp in activeRegionLoc + * @param isActive indicates whether this is an active region, or an inactve one + * @param genomeLocParser a non-null parser to let us create new genome locs + * @param extension the active region extension to use for this active region + */ + public ActiveRegion( final GenomeLoc activeRegionLoc, final List supportingStates, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) { + if ( activeRegionLoc == null ) throw new IllegalArgumentException("activeRegionLoc cannot be null"); + if ( activeRegionLoc.size() == 0 ) throw new IllegalArgumentException("Active region cannot be of zero size, but got " + activeRegionLoc); + if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null"); + if ( extension < 0 ) throw new IllegalArgumentException("extension cannot be < 0 but got " + extension); + + this.activeRegionLoc = activeRegionLoc; + this.supportingStates = supportingStates == null ? 
Collections.emptyList() : new ArrayList(supportingStates); + this.isActive = isActive; + this.genomeLocParser = genomeLocParser; + this.extension = extension; + this.extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); + this.spanIncludingReads = extendedLoc; + + if ( ! this.supportingStates.isEmpty() ) { + if ( this.supportingStates.size() != activeRegionLoc.size() ) + throw new IllegalArgumentException("Supporting states wasn't empty but it doesn't have exactly one state per bp in the active region: states " + this.supportingStates.size() + " vs. bp in region = " + activeRegionLoc.size()); + GenomeLoc lastStateLoc = null; + for ( final ActivityProfileState state : this.supportingStates ) { + if ( lastStateLoc != null ) { + if ( state.getLoc().getStart() != lastStateLoc.getStart() + 1 || state.getLoc().getContigIndex() != lastStateLoc.getContigIndex()) + throw new IllegalArgumentException("Supporting state has an invalid sequence: last state was " + lastStateLoc + " but next state was " + state); + } + lastStateLoc = state.getLoc(); + } + } + } + + /** + * Simple interface to create an active region that isActive without any profile state + */ + public ActiveRegion( final GenomeLoc activeRegionLoc, final GenomeLocParser genomeLocParser, final int extension ) { + this(activeRegionLoc, Collections.emptyList(), true, genomeLocParser, extension); + } + + @Override + public String toString() { + return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive() + " nReads=" + reads.size(); + } + + /** + * See #getActiveRegionReference but with padding == 0 + */ + public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) { + return getActiveRegionReference(referenceReader, 0); + } + + /** + * Get the reference bases from referenceReader spanned by the extended location of this active region, + * including additional 
padding bp on either side. If this expanded region would exceed the boundaries + * of the active region's contig, the returned result will be truncated to only include on-genome reference + * bases + * @param referenceReader the source of the reference genome bases + * @param padding the padding, in BP, we want to add to either side of this active region extended region + * @return a non-null array of bytes holding the reference bases in referenceReader + */ + @Ensures("result != null") + public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + return getReference(referenceReader, padding, extendedLoc); + } + + /** + * See #getActiveRegionReference but using the span including regions not the extended span + */ + public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) { + return getFullReference(referenceReader, 0); + } + + /** + * See #getActiveRegionReference but using the span including regions not the extended span + */ + public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + return getReference(referenceReader, padding, spanIncludingReads); + } + + /** + * Get the reference bases from referenceReader spanned by the extended location of this active region, + * including additional padding bp on either side. 
If this expanded region would exceed the boundaries + * of the active region's contig, the returned result will be truncated to only include on-genome reference + * bases + * @param referenceReader the source of the reference genome bases + * @param padding the padding, in BP, we want to add to either side of this active region extended region + * @param genomeLoc a non-null genome loc indicating the base span of the bp we'd like to get the reference for + * @return a non-null array of bytes holding the reference bases in referenceReader + */ + @Ensures("result != null") + public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { + if ( referenceReader == null ) throw new IllegalArgumentException("referenceReader cannot be null"); + if ( padding < 0 ) throw new IllegalArgumentException("padding must be a positive integer but got " + padding); + if ( genomeLoc == null ) throw new IllegalArgumentException("genomeLoc cannot be null"); + if ( genomeLoc.size() == 0 ) throw new IllegalArgumentException("GenomeLoc must have size > 0 but got " + genomeLoc); + + final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(), + Math.max(1, genomeLoc.getStart() - padding), + Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases(); + + return reference; + } + + /** + * Get the raw span of this active region (excluding the extension) + * @return a non-null genome loc + */ + @Override + @Ensures("result != null") + public GenomeLoc getLocation() { return activeRegionLoc; } + + /** + * Get the span of this active region including the extension value + * @return a non-null GenomeLoc + */ + @Ensures("result != null") + public GenomeLoc getExtendedLoc() { return extendedLoc; } + + /** + * Get the span of this active region including the extension and the projects on the + * genome of all reads in this active 
region. That is, returns the bp covered by this + * region and all reads in the region. + * @return a non-null genome loc + */ + @Ensures("result != null") + public GenomeLoc getReadSpanLoc() { return spanIncludingReads; } + + /** + * Get the active profile states that went into creating this region, if possible + * @return an unmodifiable list of states that led to the creation of this region, or an empty + * list if none were provided + */ + @Ensures("result != null") + public List getSupportingStates() { + return Collections.unmodifiableList(supportingStates); + } + + /** + * Get the active region extension applied to this region + * + * The extension is >= 0 bp in size, and indicates how much padding this art walker wanted for its regions + * + * @return the size in bp of the region extension + */ + @Ensures("result >= 0") + public int getExtension() { return extension; } + + /** + * Get an unmodifiable list of reads currently in this active region. + * + * The reads are sorted by their coordinate position + * + * @return an unmodifiable list of reads in this active region + */ + @Ensures("result != null") + public List getReads() { + return Collections.unmodifiableList(reads); + } + + /** + * Get the number of reads currently in this active region + * @return an integer >= 0 + */ + @Ensures("result >= 0") + public int size() { return reads.size(); } + + /** + * Add read to this active region + * + * Read must have alignment start >= than the last read currently in this active region. + * + * @throws IllegalArgumentException if read doesn't overlap the extended region of this active region + * + * @param read a non-null GATKSAMRecord + */ + @Ensures("reads.size() == old(reads.size()) + 1") + public void add( final GATKSAMRecord read ) { + if ( read == null ) throw new IllegalArgumentException("Read cannot be null"); + + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc( read ); + if ( ! 
readOverlapsRegion(read) ) + throw new IllegalArgumentException("Read location " + readLoc + " doesn't overlap with active region extended span " + extendedLoc); + + spanIncludingReads = spanIncludingReads.union( readLoc ); + + if ( ! reads.isEmpty() ) { + final GATKSAMRecord lastRead = reads.get(size() - 1); + if ( ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) ) + throw new IllegalArgumentException("Attempting to add a read to ActiveRegion not on the same contig as other reads: lastRead " + lastRead + " attempting to add " + read); + + if ( read.getAlignmentStart() < lastRead.getAlignmentStart() ) + throw new IllegalArgumentException("Attempting to add a read to ActiveRegion out of order w.r.t. other reads: lastRead " + lastRead + " at " + lastRead.getAlignmentStart() + " attempting to add " + read + " at " + read.getAlignmentStart()); + } + + reads.add( read ); + } + + /** + * Returns true if read would overlap the extended extent of this region + * @param read the read we want to test + * @return true if read can be added to this region, false otherwise + */ + public boolean readOverlapsRegion(final GATKSAMRecord read) { + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc( read ); + return readLoc.overlapsP(extendedLoc); + } + + /** + * Add all reads to this active region + * @param reads a collection of reads to add to this active region + */ + public void addAll(final Collection reads) { + if ( reads == null ) throw new IllegalArgumentException("reads cannot be null"); + for ( final GATKSAMRecord read : reads ) + add(read); + } + + /** + * Clear all of the reads currently in this active region + */ + @Ensures("size() == 0") + public void clearReads() { + spanIncludingReads = extendedLoc; + reads.clear(); + } + + /** + * Remove all of the reads in readsToRemove from this active region + * @param readsToRemove the set of reads we want to remove + */ + public void removeAll( final Set readsToRemove ) { + final Iterator it = 
reads.iterator(); + spanIncludingReads = extendedLoc; + while ( it.hasNext() ) { + final GATKSAMRecord read = it.next(); + if ( readsToRemove.contains(read) ) + it.remove(); + else + spanIncludingReads = spanIncludingReads.union( genomeLocParser.createGenomeLoc(read) ); + } + } + + /** + * Is this region equal to other, excluding any reads in either region in the comparison + * @param other the other active region we want to test + * @return true if this region is equal, excluding any reads and derived values, to other + */ + protected boolean equalExceptReads(final ActiveRegion other) { + if ( activeRegionLoc.compareTo(other.activeRegionLoc) != 0 ) return false; + if ( isActive() != other.isActive()) return false; + if ( genomeLocParser != other.genomeLocParser ) return false; + if ( extension != other.extension ) return false; + if ( extendedLoc.compareTo(other.extendedLoc) != 0 ) return false; + return true; + } + + /** + * Does this region represent an active region (all isActiveProbs above threshold) or + * an inactive region (all isActiveProbs below threshold)? 
+     */
+    public boolean isActive() {
+        return isActive;
+    }
+
+    /**
+     * Intersect this active region with the allowed intervals, returning a list of active regions
+     * that only contain locations present in intervals
+     *
+     * Note that the returned list may be empty, if this active region doesn't overlap the set at all
+     *
+     * Note that the resulting regions are all empty, regardless of whether the current active region has reads
+     *
+     * @param intervals a non-null set of intervals that are allowed
+     * @return an ordered list of active regions where each interval is contained within intervals
+     */
+    @Ensures("result != null")
+    protected List splitAndTrimToIntervals(final GenomeLocSortedSet intervals) {
+        final List allOverlapping = intervals.getOverlapping(getLocation());
+        final List clippedRegions = new LinkedList();
+
+        for ( final GenomeLoc overlapping : allOverlapping ) {
+            clippedRegions.add(trim(overlapping, extension));
+        }
+
+        return clippedRegions;
+    }
+
+    /**
+     * Trim this active region to just the span, producing a new active region without any reads that has only
+     * the extent of span intersected with the current extent
+     * @param span the new extent of the active region we want
+     * @param extension the extension size we want for the newly trimmed active region
+     * @return a non-null, empty active region
+     */
+    public ActiveRegion trim(final GenomeLoc span, final int extension) {
+        if ( span == null ) throw new IllegalArgumentException("Active region extent cannot be null");
+        if ( extension < 0) throw new IllegalArgumentException("the extension size must be 0 or greater");
+        final int extendStart = Math.max(1,span.getStart() - extension);
+        final int maxStop = genomeLocParser.getContigs().getSequence(span.getContigIndex()).getSequenceLength();
+        final int extendStop = Math.min(span.getStop() + extension, maxStop);
+        final GenomeLoc extendedSpan = genomeLocParser.createGenomeLoc(span.getContig(), extendStart, extendStop);
+        return trim(span, extendedSpan);
+
+//TODO - Inconsistent support of substates trimming. Check lack of consistency!!!!
+//        final GenomeLoc subLoc = getLocation().intersect(span);
+//        final int subStart = subLoc.getStart() - getLocation().getStart();
+//        final int subEnd = subStart + subLoc.size();
+//        final List subStates = supportingStates.isEmpty() ? supportingStates : supportingStates.subList(subStart, subEnd);
+//        return new ActiveRegion( subLoc, subStates, isActive, genomeLocParser, extension );
+
+    }
+
+    public ActiveRegion trim(final GenomeLoc span) {
+        return trim(span,span);
+    }
+
+    /**
+     * Trim this active region to no more than the span, producing a new active region with properly trimmed reads that
+     * attempts to provide the best possible representation of this active region covering the span.
+     *
+     * The challenge here is that span may (1) be larger than can be represented by this active region
+     * + its original extension and (2) the extension must be symmetric on both sides.  This algorithm
+     * therefore determines how best to represent span as a subset of the span of this
+     * region with a padding value that captures as much of the span as possible.
+     *
+     * For example, suppose this active region is
+     *
+     * Active:    100-200 with extension of 50, so that the true span is 50-250
+     * NewExtent: 150-225 saying that we'd ideally like to just have bases 150-225
+     *
+     * Here we represent the active region as an active region from 150-200 with 25 bp of padding.
+ * + * The overall constraint is that the active region can never exceed the original active region, and + * the extension is chosen to maximize overlap with the desired region + * + * @param span the new extend of the active region we want + * @return a non-null, empty active region + */ + public ActiveRegion trim(final GenomeLoc span, final GenomeLoc extendedSpan) { + if ( span == null ) throw new IllegalArgumentException("Active region extent cannot be null"); + if ( extendedSpan == null ) throw new IllegalArgumentException("Active region extended span cannot be null"); + if ( ! extendedSpan.containsP(span)) + throw new IllegalArgumentException("The requested extended must fully contain the requested span"); + + final GenomeLoc subActive = getLocation().intersect(span); + final int requiredOnRight = Math.max(extendedSpan.getStop() - subActive.getStop(), 0); + final int requiredOnLeft = Math.max(subActive.getStart() - extendedSpan.getStart(), 0); + final int requiredExtension = Math.min(Math.max(requiredOnLeft, requiredOnRight), getExtension()); + + final ActiveRegion result = new ActiveRegion( subActive, Collections.emptyList(), isActive, genomeLocParser, requiredExtension ); + + final List myReads = getReads(); + final GenomeLoc resultExtendedLoc = result.getExtendedLoc(); + final int resultExtendedLocStart = resultExtendedLoc.getStart(); + final int resultExtendedLocStop = resultExtendedLoc.getStop(); + + final List trimmedReads = new ArrayList<>(myReads.size()); + for( final GATKSAMRecord read : myReads ) { + final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion(read, + resultExtendedLocStart, resultExtendedLocStop); + if( result.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) + trimmedReads.add(clippedRead); + } + result.clearReads(); + result.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads)); + return result; + } + + public void setFinalized(final boolean value) { + hasBeenFinalized = value; + } + + public boolean 
isFinalized() { + return hasBeenFinalized; + } + +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java diff --git a/public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcid.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/analysis/AminoAcid.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcid.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/analysis/AminoAcid.java diff --git a/public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcidTable.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/analysis/AminoAcidTable.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcidTable.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/analysis/AminoAcidTable.java diff --git a/public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcidUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/analysis/AminoAcidUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcidUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/analysis/AminoAcidUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/baq/BAQ.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/baq/BAQ.java diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/classloader/JVMUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/classloader/JVMUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/classloader/PluginManager.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/classloader/PluginManager.java diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/ProtectedPackageSource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/classloader/ProtectedPackageSource.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/classloader/ProtectedPackageSource.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/classloader/ProtectedPackageSource.java diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/PublicPackageSource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/classloader/PublicPackageSource.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/classloader/PublicPackageSource.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/classloader/PublicPackageSource.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ClippingOp.java new file mode 100644 index 000000000..fd04dbc21 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -0,0 +1,634 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/
+
+package org.broadinstitute.sting.utils.clipping;
+
+import com.google.java.contract.Requires;
+import net.sf.samtools.Cigar;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.CigarOperator;
+import org.broadinstitute.sting.utils.recalibration.EventType;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.Stack;
+import java.util.Vector;
+
+/**
+ * Represents a clip on a read.  It has a type (see the enum) along with a start and stop in the bases
+ * of the read, plus an optional extraInfo (useful for carrying info where needed).
+ *

+ * Also holds the critical apply function that actually execute the clipping operation on a provided read, + * according to the wishes of the supplied ClippingAlgorithm enum. + */ +public class ClippingOp { + public final int start, stop; // inclusive + + public ClippingOp(int start, int stop) { + this.start = start; + this.stop = stop; + } + + + public int getLength() { + return stop - start + 1; + } + + /** + * Clips the bases in read according to this operation's start and stop. Uses the clipping + * representation used is the one provided by algorithm argument. + * + * @param algorithm clipping algorithm to use + * @param originalRead the read to be clipped + */ + public GATKSAMRecord apply(ClippingRepresentation algorithm, GATKSAMRecord originalRead) { + GATKSAMRecord read; + try { + read = (GATKSAMRecord) originalRead.clone(); + } catch (CloneNotSupportedException e) { + throw new ReviewedStingException("Where did the clone go?"); + } + byte[] quals = read.getBaseQualities(); + byte[] bases = read.getReadBases(); + byte[] newBases = new byte[bases.length]; + byte[] newQuals = new byte[quals.length]; + + switch (algorithm) { + // important note: + // it's not safe to call read.getReadBases()[i] = 'N' or read.getBaseQualities()[i] = 0 + // because you're not guaranteed to get a pointer to the actual array of bytes in the GATKSAMRecord + case WRITE_NS: + for (int i = 0; i < bases.length; i++) { + if (i >= start && i <= stop) { + newBases[i] = 'N'; + } + else { + newBases[i] = bases[i]; + } + } + read.setReadBases(newBases); + break; + case WRITE_Q0S: + for (int i = 0; i < quals.length; i++) { + if (i >= start && i <= stop) { + newQuals[i] = 0; + } + else { + newQuals[i] = quals[i]; + } + } + read.setBaseQualities(newQuals); + break; + case WRITE_NS_Q0S: + for (int i = 0; i < bases.length; i++) { + if (i >= start && i <= stop) { + newQuals[i] = 0; + newBases[i] = 'N'; + } + else { + newQuals[i] = quals[i]; + newBases[i] = bases[i]; + } + } + 
read.setBaseQualities(newBases); + read.setReadBases(newBases); + break; + case HARDCLIP_BASES: + read = hardClip(read, start, stop); + break; + + case SOFTCLIP_BASES: + if (read.getReadUnmappedFlag()) { + // we can't process unmapped reads + throw new UserException("Read Clipper cannot soft clip unmapped reads"); + } + + //System.out.printf("%d %d %d%n", stop, start, read.getReadLength()); + int myStop = stop; + if ((stop + 1 - start) == read.getReadLength()) { + // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone + //Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", read.getReadName())); + //break; + myStop--; // just decrement stop + } + + if (start > 0 && myStop != read.getReadLength() - 1) + throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", read.getReadName(), start, myStop)); + + Cigar oldCigar = read.getCigar(); + + int scLeft = 0, scRight = read.getReadLength(); + if (start == 0) + scLeft = myStop + 1; + else + scRight = start; + + Cigar newCigar = softClip(oldCigar, scLeft, scRight); + read.setCigar(newCigar); + + int newClippedStart = getNewAlignmentStartOffset(newCigar, oldCigar); + int newStart = read.getAlignmentStart() + newClippedStart; + read.setAlignmentStart(newStart); + + break; + + case REVERT_SOFTCLIPPED_BASES: + read = revertSoftClippedBases(read); + break; + + default: + throw new IllegalStateException("Unexpected Clipping operator type " + algorithm); + } + + return read; + } + + private GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) { + GATKSAMRecord unclipped; + + // shallow copy of the read bases and quals should be fine here because they are immutable in the original read + try { + unclipped = (GATKSAMRecord) read.clone(); + } catch (CloneNotSupportedException e) { + throw new ReviewedStingException("Where did 
the clone go?"); + } + + Cigar unclippedCigar = new Cigar(); + int matchesCount = 0; + for (CigarElement element : read.getCigar().getCigarElements()) { + if (element.getOperator() == CigarOperator.SOFT_CLIP || element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) + matchesCount += element.getLength(); + else if (matchesCount > 0) { + unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); + matchesCount = 0; + unclippedCigar.add(element); + } else + unclippedCigar.add(element); + } + if (matchesCount > 0) + unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); + + unclipped.setCigar(unclippedCigar); + final int newStart = read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), unclippedCigar); + unclipped.setAlignmentStart(newStart); + + if ( newStart <= 0 ) { + // if the start of the unclipped read occurs before the contig, + // we must hard clip away the bases since we cannot represent reads with + // negative or 0 alignment start values in the SAMRecord (e.g., 0 means unaligned) + return hardClip(unclipped, 0, - newStart); + } else { + return unclipped; + } + } + + /** + * Given a cigar string, get the number of bases hard or soft clipped at the start + */ + private int getNewAlignmentStartOffset(final Cigar __cigar, final Cigar __oldCigar) { + int num = 0; + for (CigarElement e : __cigar.getCigarElements()) { + if (!e.getOperator().consumesReferenceBases()) { + if (e.getOperator().consumesReadBases()) { + num += e.getLength(); + } + } else { + break; + } + } + + int oldNum = 0; + int curReadCounter = 0; + + for (CigarElement e : __oldCigar.getCigarElements()) { + int curRefLength = e.getLength(); + int curReadLength = e.getLength(); + if (!e.getOperator().consumesReadBases()) { + curReadLength = 0; + } + + boolean truncated = false; + if (curReadCounter + curReadLength > num) { + curReadLength = num - curReadCounter; + curRefLength = num - curReadCounter; + truncated = true; + } + 
+ if (!e.getOperator().consumesReferenceBases()) { + curRefLength = 0; + } + + curReadCounter += curReadLength; + oldNum += curRefLength; + + if (curReadCounter > num || truncated) { + break; + } + } + + return oldNum; + } + + /** + * Given a cigar string, soft clip up to startClipEnd and soft clip starting at endClipBegin + */ + private Cigar softClip(final Cigar __cigar, final int __startClipEnd, final int __endClipBegin) { + if (__endClipBegin <= __startClipEnd) { + //whole thing should be soft clipped + int cigarLength = 0; + for (CigarElement e : __cigar.getCigarElements()) { + cigarLength += e.getLength(); + } + + Cigar newCigar = new Cigar(); + newCigar.add(new CigarElement(cigarLength, CigarOperator.SOFT_CLIP)); + assert newCigar.isValid(null, -1) == null; + return newCigar; + } + + int curLength = 0; + Vector newElements = new Vector(); + for (CigarElement curElem : __cigar.getCigarElements()) { + if (!curElem.getOperator().consumesReadBases()) { + if (curElem.getOperator() == CigarOperator.HARD_CLIP || curLength > __startClipEnd && curLength < __endClipBegin) { + newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator())); + } + continue; + } + + int s = curLength; + int e = curLength + curElem.getLength(); + if (e <= __startClipEnd || s >= __endClipBegin) { + //must turn this entire thing into a clip + newElements.add(new CigarElement(curElem.getLength(), CigarOperator.SOFT_CLIP)); + } else if (s >= __startClipEnd && e <= __endClipBegin) { + //same thing + newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator())); + } else { + //we are clipping in the middle of this guy + CigarElement newStart = null; + CigarElement newMid = null; + CigarElement newEnd = null; + + int midLength = curElem.getLength(); + if (s < __startClipEnd) { + newStart = new CigarElement(__startClipEnd - s, CigarOperator.SOFT_CLIP); + midLength -= newStart.getLength(); + } + + if (e > __endClipBegin) { + newEnd = new CigarElement(e - 
__endClipBegin, CigarOperator.SOFT_CLIP); + midLength -= newEnd.getLength(); + } + assert midLength >= 0; + if (midLength > 0) { + newMid = new CigarElement(midLength, curElem.getOperator()); + } + if (newStart != null) { + newElements.add(newStart); + } + if (newMid != null) { + newElements.add(newMid); + } + if (newEnd != null) { + newElements.add(newEnd); + } + } + curLength += curElem.getLength(); + } + + Vector finalNewElements = new Vector(); + CigarElement lastElement = null; + for (CigarElement elem : newElements) { + if (lastElement == null || lastElement.getOperator() != elem.getOperator()) { + if (lastElement != null) { + finalNewElements.add(lastElement); + } + lastElement = elem; + } else { + lastElement = new CigarElement(lastElement.getLength() + elem.getLength(), lastElement.getOperator()); + } + } + if (lastElement != null) { + finalNewElements.add(lastElement); + } + + Cigar newCigar = new Cigar(finalNewElements); + assert newCigar.isValid(null, -1) == null; + return newCigar; + } + + /** + * Hard clip bases from read, from start to stop in base coordinates + * + * If start == 0, then we will clip from the front of the read, otherwise we clip + * from the right. If start == 0 and stop == 10, this would clip out the first + * 10 bases of the read. + * + * Note that this function works with reads with negative alignment starts, in order to + * allow us to hardClip reads that have had their soft clips reverted and so might have + * negative alignment starts + * + * Works properly with reduced reads and insertion/deletion base qualities + * + * @param read a non-null read + * @param start a start >= 0 and < read.length + * @param stop a stop >= 0 and < read.length. 
+ * @return a cloned version of read that has been properly trimmed down + */ + private GATKSAMRecord hardClip(GATKSAMRecord read, int start, int stop) { + + // If the read is unmapped there is no Cigar string and neither should we create a new cigar string + final CigarShift cigarShift = (read.getReadUnmappedFlag()) ? new CigarShift(new Cigar(), 0, 0) : hardClipCigar(read.getCigar(), start, stop); + + // the cigar may force a shift left or right (or both) in case we are left with insertions + // starting or ending the read after applying the hard clip on start/stop. + final int newLength = read.getReadLength() - (stop - start + 1) - cigarShift.shiftFromStart - cigarShift.shiftFromEnd; + final byte[] newBases = new byte[newLength]; + final byte[] newQuals = new byte[newLength]; + final int copyStart = (start == 0) ? stop + 1 + cigarShift.shiftFromStart : cigarShift.shiftFromStart; + + System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength); + System.arraycopy(read.getBaseQualities(), copyStart, newQuals, 0, newLength); + + final GATKSAMRecord hardClippedRead; + try { + hardClippedRead = (GATKSAMRecord) read.clone(); + } catch (CloneNotSupportedException e) { + throw new ReviewedStingException("Where did the clone go?"); + } + + hardClippedRead.resetSoftStartAndEnd(); // reset the cached soft start and end because they may have changed now that the read was hard clipped. No need to calculate them now. 
They'll be lazily calculated on the next call to getSoftStart()/End() + hardClippedRead.setBaseQualities(newQuals); + hardClippedRead.setReadBases(newBases); + hardClippedRead.setCigar(cigarShift.cigar); + if (start == 0) + hardClippedRead.setAlignmentStart(read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), cigarShift.cigar)); + + if (read.hasBaseIndelQualities()) { + final byte[] newBaseInsertionQuals = new byte[newLength]; + final byte[] newBaseDeletionQuals = new byte[newLength]; + System.arraycopy(read.getBaseInsertionQualities(), copyStart, newBaseInsertionQuals, 0, newLength); + System.arraycopy(read.getBaseDeletionQualities(), copyStart, newBaseDeletionQuals, 0, newLength); + hardClippedRead.setBaseQualities(newBaseInsertionQuals, EventType.BASE_INSERTION); + hardClippedRead.setBaseQualities(newBaseDeletionQuals, EventType.BASE_DELETION); + } + + return hardClippedRead; + + } + + @Requires({"!cigar.isEmpty()"}) + private CigarShift hardClipCigar(Cigar cigar, int start, int stop) { + Cigar newCigar = new Cigar(); + int index = 0; + int totalHardClipCount = stop - start + 1; + int alignmentShift = 0; // caused by hard clipping deletions + + // hard clip the beginning of the cigar string + if (start == 0) { + Iterator cigarElementIterator = cigar.getCigarElements().iterator(); + CigarElement cigarElement = cigarElementIterator.next(); + // Skip all leading hard clips + while (cigarElement.getOperator() == CigarOperator.HARD_CLIP) { + totalHardClipCount += cigarElement.getLength(); + if (cigarElementIterator.hasNext()) + cigarElement = cigarElementIterator.next(); + else + throw new ReviewedStingException("Read is entirely hardclipped, shouldn't be trying to clip it's cigar string"); + } + // keep clipping until we hit stop + while (index <= stop) { + int shift = 0; + if (cigarElement.getOperator().consumesReadBases()) + shift = cigarElement.getLength(); + + // we're still clipping or just finished perfectly + if (index + shift == stop + 1) 
{ + alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength()); + newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP)); + } + // element goes beyond what we need to clip + else if (index + shift > stop + 1) { + int elementLengthAfterChopping = cigarElement.getLength() - (stop - index + 1); + alignmentShift += calculateHardClippingAlignmentShift(cigarElement, stop - index + 1); + newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP)); + newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator())); + } + index += shift; + alignmentShift += calculateHardClippingAlignmentShift(cigarElement, shift); + + if (index <= stop && cigarElementIterator.hasNext()) + cigarElement = cigarElementIterator.next(); + else + break; + } + + // add the remaining cigar elements + while (cigarElementIterator.hasNext()) { + cigarElement = cigarElementIterator.next(); + newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator())); + } + } + + // hard clip the end of the cigar string + else { + Iterator cigarElementIterator = cigar.getCigarElements().iterator(); + CigarElement cigarElement = cigarElementIterator.next(); + + // Keep marching on until we find the start + while (index < start) { + int shift = 0; + if (cigarElement.getOperator().consumesReadBases()) + shift = cigarElement.getLength(); + + // we haven't gotten to the start yet, keep everything as is. 
+ if (index + shift < start) + newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator())); + + // element goes beyond our clip starting position + else { + int elementLengthAfterChopping = start - index; + alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength() - (start - index)); + + // if this last element is a HARD CLIP operator, just merge it with our hard clip operator to be added later + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) + totalHardClipCount += elementLengthAfterChopping; + // otherwise, maintain what's left of this last operator + else + newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator())); + } + index += shift; + if (index < start && cigarElementIterator.hasNext()) + cigarElement = cigarElementIterator.next(); + else + break; + } + + // check if we are hard clipping indels + while (cigarElementIterator.hasNext()) { + cigarElement = cigarElementIterator.next(); + alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength()); + + // if the read had a HardClip operator in the end, combine it with the Hard Clip we are adding + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) + totalHardClipCount += cigarElement.getLength(); + } + newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP)); + } + return cleanHardClippedCigar(newCigar); + } + + /** + * Checks if a hard clipped cigar left a read starting or ending with deletions or gap (N) + * and cleans it up accordingly. 
+ * + * @param cigar the original cigar + * @return an object with the shifts (see CigarShift class) + */ + private CigarShift cleanHardClippedCigar(final Cigar cigar) { + final Cigar cleanCigar = new Cigar(); + int shiftFromStart = 0; + int shiftFromEnd = 0; + Stack cigarStack = new Stack(); + final Stack inverseCigarStack = new Stack(); + + for (final CigarElement cigarElement : cigar.getCigarElements()) + cigarStack.push(cigarElement); + + for (int i = 1; i <= 2; i++) { + int shift = 0; + int totalHardClip = 0; + boolean readHasStarted = false; + boolean addedHardClips = false; + + while (!cigarStack.empty()) { + CigarElement cigarElement = cigarStack.pop(); + + if (!readHasStarted && + cigarElement.getOperator() != CigarOperator.DELETION && + cigarElement.getOperator() != CigarOperator.SKIPPED_REGION && + cigarElement.getOperator() != CigarOperator.HARD_CLIP) + readHasStarted = true; + + else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.HARD_CLIP) + totalHardClip += cigarElement.getLength(); + + else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION) + totalHardClip += cigarElement.getLength(); + + else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.SKIPPED_REGION) + totalHardClip += cigarElement.getLength(); + + if (readHasStarted) { + if (i == 1) { + if (!addedHardClips) { + if (totalHardClip > 0) + inverseCigarStack.push(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP)); + addedHardClips = true; + } + inverseCigarStack.push(cigarElement); + } else { + if (!addedHardClips) { + if (totalHardClip > 0) + cleanCigar.add(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP)); + addedHardClips = true; + } + cleanCigar.add(cigarElement); + } + } + } + // first pass (i=1) is from end to start of the cigar elements + if (i == 1) { + shiftFromEnd = shift; + cigarStack = inverseCigarStack; + } + // second pass (i=2) is from start to end with the end already cleaned + else { + shiftFromStart = 
shift; + } + } + return new CigarShift(cleanCigar, shiftFromStart, shiftFromEnd); + } + + /** + * Compute the offset of the first "real" position in the cigar on the genome + * + * This is defined as a first position after a run of Hs followed by a run of Ss + * + * @param cigar A non-null cigar + * @return the offset (from 0) of the first on-genome base + */ + private int calcHardSoftOffset(final Cigar cigar) { + final List elements = cigar.getCigarElements(); + + int size = 0; + int i = 0; + while ( i < elements.size() && elements.get(i).getOperator() == CigarOperator.HARD_CLIP ) { + size += elements.get(i).getLength(); + i++; + } + while ( i < elements.size() && elements.get(i).getOperator() == CigarOperator.SOFT_CLIP ) { + size += elements.get(i).getLength(); + i++; + } + + return size; + } + + private int calculateAlignmentStartShift(Cigar oldCigar, Cigar newCigar) { + final int newShift = calcHardSoftOffset(newCigar); + final int oldShift = calcHardSoftOffset(oldCigar); + return newShift - oldShift; + } + + private int calculateHardClippingAlignmentShift(CigarElement cigarElement, int clippedLength) { + // Insertions should be discounted from the total hard clip count + if (cigarElement.getOperator() == CigarOperator.INSERTION) + return -clippedLength; + + // Deletions and Ns should be added to the total hard clip count (because we want to maintain the original alignment start) + else if (cigarElement.getOperator() == CigarOperator.DELETION || cigarElement.getOperator() == CigarOperator.SKIPPED_REGION) + return cigarElement.getLength(); + + // There is no shift if we are not clipping an indel + return 0; + } + + private static class CigarShift { + private Cigar cigar; + private int shiftFromStart; + private int shiftFromEnd; + + private CigarShift(Cigar cigar, int shiftFromStart, int shiftFromEnd) { + this.cigar = cigar; + this.shiftFromStart = shiftFromStart; + this.shiftFromEnd = shiftFromEnd; + } + } +} \ No newline at end of file diff --git 
a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ReadClipper.java new file mode 100644 index 000000000..1b72503fa --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -0,0 +1,568 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.clipping; + +import com.google.java.contract.Requires; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.recalibration.EventType; +import org.broadinstitute.sting.utils.sam.CigarUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.ArrayList; +import java.util.List; + +/** + * A comprehensive clipping tool. + * + * General Contract: + * - All clipping operations return a new read with the clipped bases requested, it never modifies the original read. + * - If a read is fully clipped, return an empty GATKSAMRecord, never null. + * - When hard clipping, add cigar operator H for every *reference base* removed (i.e. Matches, SoftClips and Deletions, but *not* insertions). See Hard Clipping notes for details. + * + * + * There are several types of clipping to use: + * + * Write N's: + * Change the bases to N's in the desired region. This can be applied anywhere in the read. + * + * Write Q0's: + * Change the quality of the bases in the desired region to Q0. This can be applied anywhere in the read. + * + * Write both N's and Q0's: + * Same as the two independent operations, put together. + * + * Soft Clipping: + * Do not change the read, just mark the reads as soft clipped in the Cigar String + * and adjust the alignment start and end of the read. + * + * Hard Clipping: + * Creates a new read without the hard clipped bases (and base qualities). The cigar string + * will be updated with the cigar operator H for every reference base removed (i.e. Matches, + * Soft clipped bases and deletions, but *not* insertions). This contract with the cigar + * is necessary to allow read.getUnclippedStart() / End() to recover the original alignment + * of the read (before clipping). 
+ * + */ +public class ReadClipper { + final GATKSAMRecord read; + boolean wasClipped; + List ops = null; + + /** + * Initializes a ReadClipper object. + * + * You can set up your clipping operations using the addOp method. When you're ready to + * generate a new read with all the clipping operations, use clipRead(). + * + * Note: Use this if you want to set up multiple operations on the read using the ClippingOp + * class. If you just want to apply one of the typical modes of clipping, use the static + * clipping functions available in this class instead. + * + * @param read the read to clip + */ + public ReadClipper(final GATKSAMRecord read) { + this.read = read; + this.wasClipped = false; + } + + /** + * Add clipping operation to the read. + * + * You can add as many operations as necessary to this read before clipping. Beware that the + * order in which you add these operations matter. For example, if you hard clip the beginning + * of a read first then try to hard clip the end, the indices will have changed. Make sure you + * know what you're doing, otherwise just use the static functions below that take care of the + * ordering for you. + * + * Note: You only choose the clipping mode when you use clipRead() + * + * @param op a ClippingOp object describing the area you want to clip. + */ + public void addOp(ClippingOp op) { + if (ops == null) ops = new ArrayList(); + ops.add(op); + } + + /** + * Check the list of operations set up for this read. + * + * @return a list of the operations set up for this read. + */ + public List getOps() { + return ops; + } + + /** + * Check whether or not this read has been clipped. + * @return true if this read has produced a clipped read, false otherwise. + */ + public boolean wasClipped() { + return wasClipped; + } + + /** + * The original read. 
+ * + * @return returns the read to be clipped (original) + */ + public GATKSAMRecord getRead() { + return read; + } + + /** + * Clips a read according to ops and the chosen algorithm. + * + * @param algorithm What mode of clipping do you want to apply for the stacked operations. + * @return the read with the clipping applied. + */ + public GATKSAMRecord clipRead(ClippingRepresentation algorithm) { + if (ops == null) + return getRead(); + + GATKSAMRecord clippedRead = read; + for (ClippingOp op : getOps()) { + final int readLength = clippedRead.getReadLength(); + //check if the clipped read can still be clipped in the range requested + if (op.start < readLength) { + ClippingOp fixedOperation = op; + if (op.stop >= readLength) + fixedOperation = new ClippingOp(op.start, readLength - 1); + + clippedRead = fixedOperation.apply(algorithm, clippedRead); + } + } + wasClipped = true; + ops.clear(); + if ( clippedRead.isEmpty() ) + return GATKSAMRecord.emptyRead(clippedRead); + return clippedRead; + } + + + /** + * Hard clips the left tail of a read up to (and including) refStop using reference + * coordinates. + * + * @param refStop the last base to be hard clipped in the left tail of the read. + * @return a new read, without the left tail. + */ + @Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip + private GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(int refStop) { + return hardClipByReferenceCoordinates(-1, refStop); + } + public static GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(GATKSAMRecord read, int refStop) { + return (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, refStop); + } + + + + /** + * Hard clips the right tail of a read starting at (and including) refStart using reference + * coordinates. + * + * @param refStart refStop the first base to be hard clipped in the right tail of the read. + * @return a new read, without the right tail. 
+ */ + @Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip + private GATKSAMRecord hardClipByReferenceCoordinatesRightTail(int refStart) { + return hardClipByReferenceCoordinates(refStart, -1); + } + public static GATKSAMRecord hardClipByReferenceCoordinatesRightTail(GATKSAMRecord read, int refStart) { + return (new ReadClipper(read)).hardClipByReferenceCoordinates(refStart, -1); + } + + /** + * Hard clips a read using read coordinates. + * + * @param start the first base to clip (inclusive) + * @param stop the last base to clip (inclusive) + * @return a new read, without the clipped bases + */ + @Requires({"start >= 0 && stop <= read.getReadLength() - 1", // start and stop have to be within the read + "start == 0 || stop == read.getReadLength() - 1"}) // cannot clip the middle of the read + private GATKSAMRecord hardClipByReadCoordinates(int start, int stop) { + if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1)) + return GATKSAMRecord.emptyRead(read); + + this.addOp(new ClippingOp(start, stop)); + return clipRead(ClippingRepresentation.HARDCLIP_BASES); + } + public static GATKSAMRecord hardClipByReadCoordinates(GATKSAMRecord read, int start, int stop) { + return (new ReadClipper(read)).hardClipByReadCoordinates(start, stop); + } + + + /** + * Hard clips both tails of a read. 
+ * Left tail goes from the beginning to the 'left' coordinate (inclusive) + * Right tail goes from the 'right' coordinate (inclusive) until the end of the read + * + * @param left the coordinate of the last base to be clipped in the left tail (inclusive) + * @param right the coordinate of the first base to be clipped in the right tail (inclusive) + * @return a new read, without the clipped bases + */ + @Requires({"left <= right", // tails cannot overlap + "left >= read.getAlignmentStart()", // coordinate has to be within the mapped read + "right <= read.getAlignmentEnd()"}) // coordinate has to be within the mapped read + private GATKSAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) { + if (read.isEmpty() || left == right) + return GATKSAMRecord.emptyRead(read); + GATKSAMRecord leftTailRead = hardClipByReferenceCoordinates(right, -1); + + // after clipping one tail, it is possible that the consequent hard clipping of adjacent deletions + // make the left cut index no longer part of the read. In that case, clip the read entirely. + if (left > leftTailRead.getAlignmentEnd()) + return GATKSAMRecord.emptyRead(read); + + ReadClipper clipper = new ReadClipper(leftTailRead); + return clipper.hardClipByReferenceCoordinatesLeftTail(left); + } + public static GATKSAMRecord hardClipBothEndsByReferenceCoordinates(GATKSAMRecord read, int left, int right) { + return (new ReadClipper(read)).hardClipBothEndsByReferenceCoordinates(left, right); + } + + + /** + * Clips any contiguous tail (left, right or both) with base quality lower than lowQual using the desired algorithm. + * + * This function will look for low quality tails and hard clip them away. A low quality tail + * ends when a base has base quality greater than lowQual. + * + * @param algorithm the algorithm to use (HardClip, SoftClip, Write N's,...) 
+ * @param lowQual every base quality lower than or equal to this in the tail of the read will be hard clipped + * @return a new read without low quality tails + */ + private GATKSAMRecord clipLowQualEnds(ClippingRepresentation algorithm, byte lowQual) { + if (read.isEmpty()) + return read; + + final byte [] quals = read.getBaseQualities(); + final int readLength = read.getReadLength(); + int leftClipIndex = 0; + int rightClipIndex = readLength - 1; + + // check how far we can clip both sides + while (rightClipIndex >= 0 && quals[rightClipIndex] <= lowQual) rightClipIndex--; + while (leftClipIndex < readLength && quals[leftClipIndex] <= lowQual) leftClipIndex++; + + // if the entire read should be clipped, then return an empty read. + if (leftClipIndex > rightClipIndex) + return GATKSAMRecord.emptyRead(read); + + if (rightClipIndex < readLength - 1) { + this.addOp(new ClippingOp(rightClipIndex + 1, readLength - 1)); + } + if (leftClipIndex > 0 ) { + this.addOp(new ClippingOp(0, leftClipIndex - 1)); + } + return this.clipRead(algorithm); + } + + private GATKSAMRecord hardClipLowQualEnds(byte lowQual) { + return this.clipLowQualEnds(ClippingRepresentation.HARDCLIP_BASES, lowQual); + } + public static GATKSAMRecord hardClipLowQualEnds(GATKSAMRecord read, byte lowQual) { + return (new ReadClipper(read)).hardClipLowQualEnds(lowQual); + } + public static GATKSAMRecord clipLowQualEnds(GATKSAMRecord read, byte lowQual, ClippingRepresentation algorithm) { + return (new ReadClipper(read)).clipLowQualEnds(algorithm, lowQual); + } + + + /** + * Will hard clip every soft clipped bases in the read. 
+ * + * @return a new read without the soft clipped bases + */ + private GATKSAMRecord hardClipSoftClippedBases () { + if (read.isEmpty()) + return read; + + int readIndex = 0; + int cutLeft = -1; // first position to hard clip (inclusive) + int cutRight = -1; // first position to hard clip (inclusive) + boolean rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail + + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { + if (rightTail) { + cutRight = readIndex; + } + else { + cutLeft = readIndex + cigarElement.getLength() - 1; + } + } + else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP) + rightTail = true; + + if (cigarElement.getOperator().consumesReadBases()) + readIndex += cigarElement.getLength(); + } + + // It is extremely important that we cut the end first otherwise the read coordinates change. + if (cutRight >= 0) + this.addOp(new ClippingOp(cutRight, read.getReadLength() - 1)); + if (cutLeft >= 0) + this.addOp(new ClippingOp(0, cutLeft)); + + return clipRead(ClippingRepresentation.HARDCLIP_BASES); + } + public static GATKSAMRecord hardClipSoftClippedBases (GATKSAMRecord read) { + return (new ReadClipper(read)).hardClipSoftClippedBases(); + } + + + /** + * Hard clip the read to the variable region (from refStart to refStop) + * + * @param read the read to be clipped + * @param refStart the beginning of the variant region (inclusive) + * @param refStop the end of the variant region (inclusive) + * @return the read hard clipped to the variant region + */ + public static GATKSAMRecord hardClipToRegion( final GATKSAMRecord read, final int refStart, final int refStop ) { + final int start = read.getAlignmentStart(); + final int stop = read.getAlignmentEnd(); + return hardClipToRegion(read, refStart, refStop,start,stop); + } + + /** + * Hard clip the read to the variable region (from refStart to refStop) processing also the 
clipped bases + * + * @param read the read to be clipped + * @param refStart the beginning of the variant region (inclusive) + * @param refStop the end of the variant region (inclusive) + * @return the read hard clipped to the variant region + */ + public static GATKSAMRecord hardClipToRegionIncludingClippedBases( final GATKSAMRecord read, final int refStart, final int refStop ) { + final int start = read.getOriginalAlignmentStart(); + final int stop = start + CigarUtils.countRefBasesBasedOnCigar(read,0,read.getCigarLength()) - 1; + return hardClipToRegion(read, refStart, refStop,start,stop); + } + + private static GATKSAMRecord hardClipToRegion( final GATKSAMRecord read, final int refStart, final int refStop, final int alignmentStart, final int alignmentStop){ + // check if the read is contained in region + if (alignmentStart <= refStop && alignmentStop >= refStart) { + if (alignmentStart < refStart && alignmentStop > refStop) + return hardClipBothEndsByReferenceCoordinates(read, refStart - 1, refStop + 1); + else if (alignmentStart < refStart) + return hardClipByReferenceCoordinatesLeftTail(read, refStart - 1); + else if (alignmentStop > refStop) + return hardClipByReferenceCoordinatesRightTail(read, refStop + 1); + return read; + } else + return GATKSAMRecord.emptyRead(read); + + } + + public static List hardClipToRegion( final List reads, final int refStart, final int refStop ) { + final List returnList = new ArrayList( reads.size() ); + for( final GATKSAMRecord read : reads ) { + final GATKSAMRecord clippedRead = hardClipToRegion( read, refStart, refStop ); + if( !clippedRead.isEmpty() ) { + returnList.add( clippedRead ); + } + } + return returnList; + } + + /** + * Checks if a read contains adaptor sequences. If it does, hard clips them out. 
+ * + * Note: To see how a read is checked for adaptor sequence see ReadUtils.getAdaptorBoundary() + * + * @return a new read without adaptor sequence + */ + private GATKSAMRecord hardClipAdaptorSequence () { + final int adaptorBoundary = ReadUtils.getAdaptorBoundary(read); + + if (adaptorBoundary == ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY || !ReadUtils.isInsideRead(read, adaptorBoundary)) + return read; + + return read.getReadNegativeStrandFlag() ? hardClipByReferenceCoordinatesLeftTail(adaptorBoundary) : hardClipByReferenceCoordinatesRightTail(adaptorBoundary); + } + public static GATKSAMRecord hardClipAdaptorSequence (GATKSAMRecord read) { + return (new ReadClipper(read)).hardClipAdaptorSequence(); + } + + + /** + * Hard clips any leading insertions in the read. Only looks at the beginning of the read, not the end. + * + * @return a new read without leading insertions + */ + private GATKSAMRecord hardClipLeadingInsertions() { + if (read.isEmpty()) + return read; + + for(CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() != CigarOperator.HARD_CLIP && cigarElement.getOperator() != CigarOperator.SOFT_CLIP && + cigarElement.getOperator() != CigarOperator.INSERTION) + break; + + else if (cigarElement.getOperator() == CigarOperator.INSERTION) + this.addOp(new ClippingOp(0, cigarElement.getLength() - 1)); + + } + return clipRead(ClippingRepresentation.HARDCLIP_BASES); + } + public static GATKSAMRecord hardClipLeadingInsertions(GATKSAMRecord read) { + return (new ReadClipper(read)).hardClipLeadingInsertions(); + } + + + /** + * Turns soft clipped bases into matches + * @return a new read with every soft clip turned into a match + */ + private GATKSAMRecord revertSoftClippedBases() { + if (read.isEmpty()) + return read; + + this.addOp(new ClippingOp(0, 0)); + return this.clipRead(ClippingRepresentation.REVERT_SOFTCLIPPED_BASES); + } + + /** + * Reverts ALL soft-clipped bases + * + * @param read the read + * @return the 
read with all soft-clipped bases turned into matches + */ + public static GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) { + return (new ReadClipper(read)).revertSoftClippedBases(); + } + + /** + * Reverts only soft clipped bases with quality score greater than or equal to minQual + * + * todo -- Note: Will write a temporary field with the number of soft clips that were undone on each side (left: 'SL', right: 'SR') -- THIS HAS BEEN REMOVED TEMPORARILY SHOULD HAPPEN INSIDE THE CLIPPING ROUTINE! + * + * @param read the read + * @param minQual the minimum base quality score to revert the base (inclusive) + * @return a new read with high quality soft clips reverted + */ + public static GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read, byte minQual) { + return revertSoftClippedBases(hardClipLowQualitySoftClips(read, minQual)); + } + + /** + * Hard clips away soft clipped bases that are below the given quality threshold + * + * @param read the read + * @param minQual the minimum base quality score to revert the base (inclusive) + * @return a new read without low quality soft clipped bases + */ + public static GATKSAMRecord hardClipLowQualitySoftClips(GATKSAMRecord read, byte minQual) { + int nLeadingSoftClips = read.getAlignmentStart() - read.getSoftStart(); + if (read.isEmpty() || nLeadingSoftClips > read.getReadLength()) + return GATKSAMRecord.emptyRead(read); + + byte [] quals = read.getBaseQualities(EventType.BASE_SUBSTITUTION); + int left = -1; + + if (nLeadingSoftClips > 0) { + for (int i = nLeadingSoftClips - 1; i >= 0; i--) { + if (quals[i] >= minQual) + left = i; + else + break; + } + } + + int right = -1; + int nTailingSoftClips = read.getSoftEnd() - read.getAlignmentEnd(); + if (nTailingSoftClips > 0) { + for (int i = read.getReadLength() - nTailingSoftClips; i < read.getReadLength() ; i++) { + if (quals[i] >= minQual) + right = i; + else + break; + } + } + + GATKSAMRecord clippedRead = read; + if (right >= 0 && right + 1 < 
clippedRead.getReadLength()) // only clip if there are softclipped bases (right >= 0) and the first high quality soft clip is not the last base (right+1 < readlength) + clippedRead = hardClipByReadCoordinates(clippedRead, right+1, clippedRead.getReadLength()-1); // first we hard clip the low quality soft clips on the right tail + if (left >= 0 && left - 1 > 0) // only clip if there are softclipped bases (left >= 0) and the first high quality soft clip is not the last base (left-1 > 0) + clippedRead = hardClipByReadCoordinates(clippedRead, 0, left-1); // then we hard clip the low quality soft clips on the left tail + + return clippedRead; + } + + /** + * Generic functionality to hard clip a read, used internally by hardClipByReferenceCoordinatesLeftTail + * and hardClipByReferenceCoordinatesRightTail. Should not be used directly. + * + * Note, it REQUIRES you to give the directionality of your hard clip (i.e. whether you're clipping the + * left or right tail) by specifying either refStart < 0 or refStop < 0. 
+ * + * @param refStart first base to clip (inclusive) + * @param refStop last base to clip (inclusive) + * @return a new read, without the clipped bases + */ + @Requires({"!read.getReadUnmappedFlag()", "refStart < 0 || refStop < 0"}) // can't handle unmapped reads, as we're using reference coordinates to clip + protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { + if (read.isEmpty()) + return read; + + int start; + int stop; + + // Determine the read coordinate to start and stop hard clipping + if (refStart < 0) { + if (refStop < 0) + throw new ReviewedStingException("Only one of refStart or refStop must be < 0, not both (" + refStart + ", " + refStop + ")"); + start = 0; + stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); + } + else { + if (refStop >= 0) + throw new ReviewedStingException("Either refStart or refStop must be < 0 (" + refStart + ", " + refStop + ")"); + start = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); + stop = read.getReadLength() - 1; + } + + if (start < 0 || stop > read.getReadLength() - 1) + throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); + + if ( start > stop ) + throw new ReviewedStingException(String.format("START (%d) > (%d) STOP -- this should never happen, please check read: %s (CIGAR: %s)", start, stop, read, read.getCigarString())); + + if ( start > 0 && stop < read.getReadLength() - 1) + throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString())); + + this.addOp(new ClippingOp(start, stop)); + GATKSAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES); + this.ops = null; + return clippedRead; + } + + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapFeature.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapFeature.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapFeature.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapFeature.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java new file mode 100644 index 000000000..70241a6c4 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java @@ -0,0 +1,354 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.codecs.sampileup; + +import org.broad.tribble.AsciiFeatureCodec; +import org.broad.tribble.exception.CodecLineParsingException; +import org.broad.tribble.readers.LineIterator; +import org.broad.tribble.util.ParsingUtils; + +import java.util.ArrayList; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.VariantType; + +/** + * Decoder for SAM pileup data. + * + *

+ * From the SAMTools project documentation: + *

+ *

The Pileup format was first used by Tony Cox and Zemin Ning at + * the Sanger Institute. It describes the base-pair information at each chromosomal position. This format + * facilitates SNP/indel calling and brief alignment viewing by eye. Note that the pileup program has been replaced + * in Samtools by mpileup, which produces a slightly different output format by default. + *

+ + *

Format

+ *

There are two versions of the original pileup format: the current 6-column format produced by Samtools, and the old + * 10/13-column "consensus" format which could be obtained by using the -c argument, now deprecated.

+ *

Simple pileup: 6-column format

+ *

+ * Each line consists of chromosome, 1-based coordinate, reference base, the + * number of reads covering the site, read bases and base qualities. At the + * read base column, a dot stands for a match to the reference base on the + * forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch + * on the forward strand and `acgtn' for a mismatch on the reverse strand. + * A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between + * this reference position and the next reference position. The length of the + * insertion is given by the integer in the pattern, followed by the inserted sequence. + *

+ *
+ *     seq1 272 T 24  ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
+ *     seq1 273 T 23  ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
+ *     seq1 274 T 23  ,.$....,,.,.,...,,,.,...    7<7;<;<<<<<<<<<=<;<;<<6
+ *     seq1 275 A 23  ,$....,,.,.,...,,,.,...^l.  <+;9*<<<<<<<<<=<<:;<<<<
+ *     seq1 276 G 22  ...T,,.,.,...,,,.,....  33;+<<7=7<<7<&<<1;<<6<
+ *     seq1 277 T 22  ....,,.,.,.C.,,,.,..G.  +7<;<<<<<<<&<=<<:;<<&<
+ *     seq1 278 G 23  ....,,.,.,...,,,.,....^k.   %38*<<;<7<<7<=<<<;<<<<<
+ *     seq1 279 C 23  A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
+ * 
+ *

+ * See the Pileup format documentation for more details. + *

+ * + *

Consensus pileup: 10/13-column format

+ *

The "consensus" or extended pileup consists of the following: + *

    + *
  • original 6 columns as described above
  • + *
  • 4 extra columns representing consensus values (consensus base, consensus quality, variant quality and maximum mapping quality of the + * reads covering the sites) for all sites, inserted before the bases and quality strings
  • + *
  • 3 extra columns indicating counts of reads supporting indels (just for indel sites)
  • + *
+ *

+ *

Example of consensus pileup for SNP or non-variant sites

+ *
+ *     seq1  60  T  T  66  0  99  13  ...........^~.^~.   9<<55<;<<<<<<
+ *     seq1  61  G  G  72  0  99  15  .............^~.^y. (;975&;<<<<<<<<
+ *     seq1  62  T  T  72  0  99  15  .$..............    <;;,55;<<<<<<<<
+ *     seq1  63  G  G  72  0  99  15  .$.............^~.  4;2;<7:+<<<<<<<
+ *     seq1  64  G  G  69  0  99  14  ..............  9+5<;;;<<<<<<<
+ *     seq1  65  A  A  69  0  99  14  .$............. <5-2<;;<<<<<<;
+ *     seq1  66  C  C  66  0  99  13  .............   &*<;;<<<<<<8<
+ *     seq1  67  C  C  69  0  99  14  .............^~.    ,75<.4<<<<<-<<
+ *     seq1  68  C  C  69  0  99  14  ..............  576<;7<<<<<8<< *
+ * 
+ * + *

Example of consensus pileup for indels

+ *
+ *     Escherichia_coli_K12	3995037	*	*\/*	430	0	37	144	*	+A	143	1	0
+ *     Escherichia_coli_K12	3995279	*	*\/*	202	0	36	68	*	+A	67	1	0
+ *     Escherichia_coli_K12	3995281	*	*\/*	239	0	36	67	*	-CG	66	1	0
+ * 
+ *

+ * See Consensus pileup format (deprecated) for more details. + *

+ * + *

Caveat

+ *

Handling of indels is questionable at the moment. Proceed with care.

+ * + * + * @author Matt Hanna, Geraldine VdAuwera + * @since 2014 + */ +public class SAMPileupCodec extends AsciiFeatureCodec { + // number of tokens expected (6 or 10 are valid, anything else is wrong) + private static final int basicTokenCount = 6; + private static final int consensusSNPTokenCount = 10; + private static final int consensusIndelTokenCount = 13; + private static final char fldDelim = '\t'; + // allocate once and don't ever bother creating them again: + private static final String baseA = "A"; + private static final String baseC = "C"; + private static final String baseG = "G"; + private static final String baseT = "T"; + private static final String emptyStr = ""; // we will use this for "reference" allele in insertions + + public SAMPileupCodec() { + super(SAMPileupFeature.class); + } + + public SAMPileupFeature decode(String line) { + //+1 because we want to know if we have more than the max + String[] tokens = new String[consensusIndelTokenCount+1]; + + // split the line + final int count = ParsingUtils.split(line,tokens,fldDelim); + + SAMPileupFeature feature = new SAMPileupFeature(); + + /** + * Tokens 0, 1, 2 are the same for both formats so they will be interpreted without differentiation. + * The 10/13-format has 4 tokens inserted after token 2 compared to the 6-format, plus 3 more tokens added at + * the end for indels. We are currently not making any use of the extra indel tokens. + * + * Any token count other than basicTokenCount, consensusSNPTokenCount or consensusIndelTokenCount is wrong. 
+ */ + final String observedString, bases, quals; + + feature.setChr(tokens[0]); + feature.setStart(Integer.parseInt(tokens[1])); + + if(tokens[2].length() != 1) { + throw new CodecLineParsingException("The SAM pileup line had unexpected base " + tokens[2] + " on line = " + line); + } + feature.setRef(tokens[2].charAt(0)); + + switch (count) { + case basicTokenCount: + bases = tokens[4]; + quals = tokens[5]; + // parsing is pretty straightforward for 6-col format + if ( feature.getRef() == '*' ) { // this indicates an indel -- but it shouldn't occur with vanilla 6-col format + throw new CodecLineParsingException("Found an indel on line = " + line + " but it shouldn't happen in simple pileup format"); + } else { + parseBasesAndQuals(feature, bases, quals); + feature.setRefBases(tokens[2].toUpperCase()); + feature.setEnd(feature.getStart()); + } + break; + case consensusSNPTokenCount: // pileup called a SNP or a reference base + observedString = tokens[3].toUpperCase(); + feature.setFWDAlleles(new ArrayList(2)); + feature.setConsensusConfidence(Double.parseDouble(tokens[4])); + feature.setVariantConfidence(Double.parseDouble(tokens[5])); + bases = tokens[8]; + quals = tokens[9]; + // confirm that we have a non-variant, not a mis-parsed indel + if ( feature.getRef() == '*' ) { + throw new CodecLineParsingException("Line parsing of " + line + " says we have a SNP or non-variant but the ref base is '*', which indicates an indel"); + } + // Parse the SNP or non-variant + parseBasesAndQuals(feature, bases, quals); + if ( observedString.length() != 1 ) { + throw new CodecLineParsingException( "Line parsing of " + line + " says we have a SNP or non-variant but the genotype token is not a single letter: " + observedString); + } + feature.setRefBases(tokens[2].toUpperCase()); + feature.setEnd(feature.getStart()); + + char ch = observedString.charAt(0); + + switch ( ch ) { // record alleles (decompose ambiguous base codes) + case 'A': feature.getFWDAlleles().add(baseA); 
feature.getFWDAlleles().add(baseA); break; + case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break; + case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break; + case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break; + case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break; + case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break; + case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break; + case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break; + case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break; + case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break; + } + if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE); + else { + // we know that at least one allele is non-ref; + // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e. + // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref) + feature.setVariantType(VariantType.SNP); + if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() || + feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() || + feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1)) + ) feature.setNumNonRef(1); + else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site... 
+ } + break; + case consensusIndelTokenCount: + observedString = tokens[3].toUpperCase(); + feature.setFWDAlleles(new ArrayList(2)); + feature.setConsensusConfidence(Double.parseDouble(tokens[4])); + feature.setVariantConfidence(Double.parseDouble(tokens[5])); + // confirm that we have an indel, not a mis-parsed SNP or non-variant + if ( feature.getRef() != '*' ) { + throw new CodecLineParsingException("Line parsing of " + line + " says we have an indel but the ref base is not '*'"); + } + // Parse the indel + parseIndels(observedString,feature) ; + if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1); + else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this has got to be an insertion; otherwise the state is inconsistent!!!! + break; + default: + throw new CodecLineParsingException("The SAM pileup line didn't have the expected number of tokens " + + "(expected = " + basicTokenCount + " (basic pileup), " + consensusSNPTokenCount + + " (consensus pileup for a SNP or non-variant site) or " + consensusIndelTokenCount + + " (consensus pileup for an indel); saw = " + count + " on line = " + line + ")"); + } + return feature; + } + + @Override + public Object readActualHeader(LineIterator lineIterator) { + // No header for this format + return null; + } + + private void parseIndels(String genotype,SAMPileupFeature feature) { + String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit + + // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference; + // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases + // from what we have recorded for the deletion allele (e.g. 
"-CAC") + boolean hasRefAllele = false; + + for ( int i = 0 ; i < obs.length ; i++ ) { + if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) { + hasRefAllele = true; + feature.getFWDAlleles().add(emptyStr); + continue; + } + + String varBases = obs[i].toUpperCase(); + + switch ( obs[i].charAt(0) ) { + case '+': + if (!feature.isReference() && !feature.isInsertion()) feature.setVariantType(VariantType.INDEL); + else feature.setVariantType(VariantType.INSERTION); + feature.setRefBases(emptyStr); + break; + case '-' : + if (!feature.isReference() && !feature.isDeletion()) feature.setVariantType(VariantType.INDEL); + else feature.setVariantType(VariantType.DELETION); + feature.setRefBases(varBases); // remember what was deleted, this will be saved as "reference allele" + break; + default: throw new CodecLineParsingException("Can not interpret observed indel allele record: "+genotype); + } + feature.getFWDAlleles().add(varBases); + feature.setLength(obs[i].length()-1); // inconsistent for non-biallelic indels!! + } + if ( hasRefAllele ) { + // we got at least one ref. allele (out of two recorded) + if (feature.isReference()) { // both top theories are actually ref allele; + feature.setNumNonRef(0); // no observations of non-reference allele at all + feature.setRefBases(emptyStr); + } else { + feature.setNumNonRef(1); // hasRefAllele = true, so one allele was definitely ref, hence there is only one left + } + } else { + // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we + // fail to set data in a consistent way. + if ( feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))) feature.setNumNonRef(1); + else feature.setNumNonRef(2); + } + // DONE with indels + + } + + private void parseBasesAndQuals(SAMPileupFeature feature, final String bases, final String quals) + { + //System.out.printf("%s%n%s%n", bases, quals); + + // needs to convert the base string with its . 
and , to the ref base + StringBuilder baseBuilder = new StringBuilder(); + StringBuilder qualBuilder = new StringBuilder(); + boolean done = false; + for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) { + //System.out.printf("%d %d%n", i, j); + char c = (char)bases.charAt(i); + + switch ( c ) { + case '.': // matches reference + case ',': // matches reference + baseBuilder.append(feature.getRef()); + qualBuilder.append(quals.charAt(j++)); + break; + case '$': // end of read + break; + case '*': // end of indel? + j++; + break; + case '^': // mapping quality + i++; + break; + case '+': // start of indel + case '-': // start of indel + final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1 + final String rest = bases.substring(i+1); + //System.out.printf("sub is %s%n", rest); + Matcher match = regex.matcher(rest); + if ( ! match.matches() ) { + if ( feature.getRef() != '*' ) + throw new CodecLineParsingException("Bad pileup format: " + bases + " at position " + i); + done = true; + } + else { + String g = match.group(1); + //System.out.printf("group is %d, match is %s%n", match.groupCount(), g); + int l = Integer.parseInt(g); + i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++) + //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1)); + } + break; + default: // non reference base + baseBuilder.append(c); + qualBuilder.append(quals.charAt(j++)); + } + } + + feature.setPileupBases(baseBuilder.toString()); + feature.setPileupQuals(qualBuilder.toString()); + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java new file mode 100644 index 000000000..287363601 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java @@ -0,0 +1,276 @@ 
+/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.codecs.sampileup; + +import net.sf.samtools.util.StringUtil; +import org.broad.tribble.Feature; + +import java.util.List; + +/** + * A tribble feature representing a SAM pileup. + * + * Allows intake of both simple (6-column) or extended/consensus (10/13-column) pileups. Simple pileup features will + * contain only basic information, no observed alleles or variant/genotype inferences, and so shouldn't be used as + * input for analysis that requires that information. 
+ * + * @author mhanna + * @version 0.1 + */ +public class SAMPileupFeature implements Feature { + public enum VariantType { NONE, SNP, INSERTION, DELETION, INDEL }; + + private String contig; // genomic location of this genotyped site + private int start; + private int stop; + + private char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!) + private String refBases; // the reference base sequence according to NCBI; single base for point mutations, deleted bases for deletions, empty string for insertions + + private String pileupQuals; // the read base qualities + private String pileupBases; // the read bases themselves + + private List observedAlleles = null; // The sequences of the observed alleles (e.g. {"A","C"} for point mutation or {"","+CC"} for het. insertion + private VariantType varType = VariantType.NONE; + private int nNonref = 0; // number of non-reference alleles observed + private int eventLength = 0; // number of inserted or deleted bases + + private double consensusScore = 0; + private double variantScore = 0; + + /** + * create the pileup feature. Default protection so that only other classes in this package can create it. + */ + SAMPileupFeature() {} + + public String getChr() { + return contig; + } + + protected void setChr(String chr) { + this.contig = chr; + } + + public int getStart() { + return start; + } + + protected void setStart(int start) { + this.start = start; + } + + public int getEnd() { + return stop; + } + + protected void setEnd(int end) { + this.stop = end; + } + + public String getQualsAsString() { return pileupQuals; } + + protected void setPileupQuals(String pileupQuals) { + this.pileupQuals = pileupQuals; + } + + /** Returns reference base for point genotypes or '*' for indel genotypes, as a char. 
+ * + */ + public char getRef() { return refBaseChar; } + + protected void setRef(char ref) { + this.refBaseChar = ref; + } + + public int size() { return pileupQuals.length(); } + + /** Returns pile of observed bases over the current genomic location. + * + */ + public String getBasesAsString() { return pileupBases; } + + protected void setPileupBases(String pileupBases) { + this.pileupBases = pileupBases; + } + + /** Returns formatted pileup string for the current genomic location as + * "location: reference_base observed_base_pile observed_qual_pile" + */ + public String getPileupString() + { + if(start == stop) + return String.format("%s:%d: %s %s %s", getChr(), getStart(), getRef(), getBasesAsString(), getQualsAsString()); + else + return String.format("%s:%d-%d: %s %s %s", getChr(), getStart(), getEnd(), getRef(), getBasesAsString(), getQualsAsString()); + } + + /** + * Gets the bases in byte array form. + * @return byte array of the available bases. + */ + public byte[] getBases() { + return StringUtil.stringToBytes(getBasesAsString()); + } + + /** + * Gets the Phred base qualities without ASCII offset. + * @return Phred base qualities. + */ + public byte[] getQuals() { + byte[] quals = StringUtil.stringToBytes(getQualsAsString()); + for(int i = 0; i < quals.length; i++) quals[i] -= 33; + return quals; + } + + /** Returns bases in the reference allele as a String. For point genotypes, the string consists of a single + * character (reference base). For indel genotypes, the string is empty for insertions into + * the reference, or consists of deleted bases for deletions. 
+ * + * @return reference allele, forward strand + */ + public String getFWDRefBases() { + return refBases; + } + + protected void setRefBases(String refBases) { + this.refBases = refBases; + } + + public List getFWDAlleles() { + return observedAlleles; + } + + protected void setFWDAlleles(List alleles) { + this.observedAlleles = alleles; + } + + // ---------------------------------------------------------------------- + // + // What kind of variant are we? + // + // ---------------------------------------------------------------------- + public boolean isSNP() { return varType == VariantType.SNP; } + public boolean isInsertion() { return varType == VariantType.INSERTION; } + public boolean isDeletion() { return varType == VariantType.DELETION ; } + public boolean isIndel() { return isInsertion() || isDeletion() || varType == VariantType.INDEL; } + public boolean isReference() { return varType == VariantType.NONE; } + + protected void setVariantType(VariantType variantType) { + this.varType = variantType; + } + + public boolean isHom() { + // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype + if ( ! isIndel() ) return ( observedAlleles.get(0).equals(observedAlleles.get(1)) ); + return ( isInsertion() || isDeletion() ) && observedAlleles.get(0).equals(observedAlleles.get(1) ); + } + + public boolean isHet() { + // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype + if ( ! isIndel() ) return ( !(observedAlleles.get(0).equals(observedAlleles.get(1))) ); + return isIndel() || ( ! 
observedAlleles.get(0).equals(observedAlleles.get(1) ) ); + } + + public double getVariantConfidence() { + return variantScore; + } + + protected void setVariantConfidence(double variantScore) { + this.variantScore = variantScore; + } + + public boolean isBiallelic() { + return nNonref < 2; + } + + protected void setNumNonRef(int nNonref) { + this.nNonref = nNonref; + } + + public double getConsensusConfidence() { + return consensusScore; + } + + protected void setConsensusConfidence(double consensusScore) { + this.consensusScore = consensusScore; + } + + public int length() { + return eventLength; + } + + protected void setLength(int eventLength) { + this.eventLength = eventLength; + } + + public boolean isIndelGenotype() { + return refBaseChar == '*'; + } + + + public boolean isPointGenotype() { + return ! isIndelGenotype(); + } + + /** Implements method required by GenotypeList interface. If this object represents + * an indel genotype, then it returns itself through this method. If this object is a + * point genotype, this method returns null. + * @return + */ + public SAMPileupFeature getIndelGenotype() { + if ( isIndelGenotype() ) return this; + else return null; + } + + /** Implements method required by GenotypeList interface. If this object represents + * a point genotype, then it returns itself through this method. If this object is an + * indel genotype, this method returns null. + * @return + */ + public SAMPileupFeature getPointGenotype() { + if ( isPointGenotype() ) return this; + else return null; + } + + /** Returns true if this object \em is an indel genotype (and thus + * indel genotype is what it only has). + * @return + */ + public boolean hasIndelGenotype() { + return isIndelGenotype(); + } + + /** Returns true if this object \em is a point genotype (and thus + * point genotype is what it only has. 
+ * @return + */ + public boolean hasPointGenotype() { + return isPointGenotype(); + } + + + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/table/TableCodec.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/table/TableCodec.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/table/TableFeature.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/table/TableFeature.java diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/DefaultHashMap.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/DefaultHashMap.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/collections/DefaultHashMap.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/DefaultHashMap.java diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/ExpandingArrayList.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/ExpandingArrayList.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/collections/ExpandingArrayList.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/ExpandingArrayList.java diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java diff --git 
a/public/java/src/org/broadinstitute/sting/utils/collections/Pair.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/Pair.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/collections/Pair.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/Pair.java diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/PrimitivePair.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/PrimitivePair.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/collections/PrimitivePair.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/PrimitivePair.java diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/RODMergingIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/RODMergingIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/collections/RODMergingIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/RODMergingIterator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/crypt/CryptUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/crypt/CryptUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/crypt/CryptUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/crypt/CryptUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/crypt/GATKKey.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/crypt/GATKKey.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/crypt/GATKKey.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/crypt/GATKKey.java diff --git 
a/public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/duplicates/DupUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/duplicates/DupUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/duplicates/DuplicateComp.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/duplicates/DuplicateComp.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/duplicates/DuplicateComp.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/duplicates/DuplicateComp.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java new file mode 100644 index 000000000..0f1b473c3 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java @@ -0,0 +1,54 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.exceptions; + +import java.lang.reflect.InvocationTargetException; + +/** + * Class for handling common failures of dynamic class resolution + */ +public class DynamicClassResolutionException extends UserException { + public DynamicClassResolutionException(Class c, Exception ex) { + super(String.format("Could not create module %s because %s caused by exception %s", + c.getSimpleName(), moreInfo(ex), ex.getMessage())); + } + + private static String moreInfo(Exception ex) { + try { + throw ex; + } catch (InstantiationException e) { + return "BUG: cannot instantiate class: must be concrete class"; + } catch (NoSuchMethodException e) { + return "BUG: Cannot find expected constructor for class"; + } catch (IllegalAccessException e) { + return "Cannot instantiate class (Illegal Access)"; + } catch (InvocationTargetException e) { + return "Cannot instantiate class (Invocation failure)"; + } catch ( Exception e ) { + return String.format("an exception of type %s occurred",e.getClass().getSimpleName()); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/ReviewedStingException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/ReviewedStingException.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/exceptions/ReviewedStingException.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/ReviewedStingException.java 
diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/StingException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/StingException.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/exceptions/StingException.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/StingException.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/UserException.java new file mode 100644 index 000000000..4db6e3d69 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -0,0 +1,485 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.exceptions; + +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.io.File; + +/** + * Represents the common user errors detected by Sting / GATK + * + * Root class for all GATK user errors, as well as the container for errors themselves + */ +@DocumentedGATKFeature( + groupName = HelpConstants.DOCS_CAT_USRERR, + summary = "Errors caused by incorrect user behavior, such as bad files, bad arguments, etc." ) +public class UserException extends ReviewedStingException { + /** + * The URL where people can get help messages. Printed when an error occurs + */ + public static final String PHONE_HOME_DOCS_URL = "http://gatkforums.broadinstitute.org/discussion/1250/what-is-phone-home-and-how-does-it-affect-me#latest"; + + public UserException(String msg) { super(msg); } + public UserException(String msg, Throwable e) { super(msg, e); } + private UserException(Throwable e) { super("", e); } // cannot be called, private access + + protected static String getMessage(Throwable t) { + String message = t.getMessage(); + return message != null ? 
message : t.getClass().getName(); + } + + public static class CommandLineException extends UserException { + public CommandLineException(String message) { + super(String.format("Invalid command line: %s", message)); + } + } + + public static class MalformedReadFilterException extends CommandLineException { + public MalformedReadFilterException(String message) { + super(String.format("Malformed read filter: %s",message)); + } + } + + public static class IncompatibleReadFiltersException extends CommandLineException { + public IncompatibleReadFiltersException(final String filter1, final String filter2) { + super(String.format("Two read filters are enabled that are incompatible and cannot be used simultaneously: %s and %s", filter1, filter2)); + } + } + + public static class MalformedWalkerArgumentsException extends CommandLineException { + public MalformedWalkerArgumentsException(String message) { + super(String.format("Malformed walker argument: %s",message)); + } + } + + public static class UnsupportedCigarOperatorException extends UserException { + public UnsupportedCigarOperatorException(final CigarOperator co, final SAMRecord read, final String message) { + super(String.format( + "Unsupported CIGAR operator %s in read %s at %s:%d. 
%s", + co, + read.getReadName(), + read.getReferenceName(), + read.getAlignmentStart(), + message)); + } + } + + + public static class MalformedGenomeLoc extends UserException { + public MalformedGenomeLoc(String message, GenomeLoc loc) { + super(String.format("Badly formed genome loc: %s: %s", message, loc)); + } + + public MalformedGenomeLoc(String message) { + super(String.format("Badly formed genome loc: %s", message)); + } + } + + public static class BadInput extends UserException { + public BadInput(String message) { + super(String.format("Bad input: %s", message)); + } + } + + // todo -- fix up exception cause passing + public static class MissingArgument extends CommandLineException { + public MissingArgument(String arg, String message) { + super(String.format("Argument %s was missing: %s", arg, message)); + } + } + + public static class BadArgumentValue extends CommandLineException { + public BadArgumentValue(String arg, String message) { + super(String.format("Argument %s has a bad value: %s", arg, message)); + } + } + + public static class UnknownTribbleType extends CommandLineException { + public UnknownTribbleType(String type, String message) { + super(String.format("Unknown tribble type %s: %s", type, message)); + } + } + + + public static class BadTmpDir extends UserException { + public BadTmpDir(String message) { + super(String.format("Failure working with the tmp directory %s. Override with -Djava.io.tmpdir=X on the command line to a bigger/better file system. Exact error was %s", System.getProperties().get("java.io.tmpdir"), message)); + } + } + + public static class TooManyOpenFiles extends UserException { + public TooManyOpenFiles() { + super(String.format("There was a failure because there are too many files open concurrently; your system's open file handle limit is too small. 
See the unix ulimit command to adjust this limit")); + } + } + + public static class LocalParallelizationProblem extends UserException { + public LocalParallelizationProblem(final File file) { + super(String.format("There was a failure because temporary file %s could not be found while running the GATK with more than one thread. Possible causes for this problem include: your system's open file handle limit is too small, your output or temp directories do not have sufficient space, or just an isolated file system blip", file.getAbsolutePath())); + } + } + + public static class NotEnoughMemory extends UserException { + public NotEnoughMemory() { + super(String.format("There was a failure because you did not provide enough memory to run this program. See the -Xmx JVM argument to adjust the maximum heap size provided to Java")); + } + } + + public static class ErrorWritingBamFile extends UserException { + public ErrorWritingBamFile(String message) { + super(String.format("An error occurred when trying to write the BAM file. Usually this happens when there is not enough space in the directory to which the data is being written (generally the temp directory) or when your system's open file handle limit is too small. To tell Java to use a bigger/better file system use -Djava.io.tmpdir=X on the command line. 
The exact error was %s", message)); + } + } + + public static class NoSpaceOnDevice extends UserException { + public NoSpaceOnDevice() { + super("There is no space left on the device, so writing failed"); + } + } + + public static class CouldNotReadInputFile extends UserException { + public CouldNotReadInputFile(String message, Exception e) { + super(String.format("Couldn't read file because %s caused by %s", message, getMessage(e))); + } + + public CouldNotReadInputFile(File file) { + super(String.format("Couldn't read file %s", file.getAbsolutePath())); + } + + public CouldNotReadInputFile(File file, String message) { + super(String.format("Couldn't read file %s because %s", file.getAbsolutePath(), message)); + } + + public CouldNotReadInputFile(String file, String message) { + super(String.format("Couldn't read file %s because %s", file, message)); + } + + public CouldNotReadInputFile(File file, String message, Exception e) { + super(String.format("Couldn't read file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); + } + + public CouldNotReadInputFile(File file, Exception e) { + this(file, getMessage(e)); + } + + public CouldNotReadInputFile(String message) { + super(message); + } + } + + + public static class CouldNotCreateOutputFile extends UserException { + public CouldNotCreateOutputFile(File file, String message, Exception e) { + super(String.format("Couldn't write file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); + } + + public CouldNotCreateOutputFile(File file, String message) { + super(String.format("Couldn't write file %s because %s", file.getAbsolutePath(), message)); + } + + public CouldNotCreateOutputFile(String filename, String message, Exception e) { + super(String.format("Couldn't write file %s because %s with exception %s", filename, message, getMessage(e))); + } + + public CouldNotCreateOutputFile(File file, Exception e) { + super(String.format("Couldn't write file %s 
because exception %s", file.getAbsolutePath(), getMessage(e))); + } + + public CouldNotCreateOutputFile(String message, Exception e) { + super(message, e); + } + } + + public static class MissortedBAM extends UserException { + public MissortedBAM(SAMFileHeader.SortOrder order, File file, SAMFileHeader header) { + super(String.format("Missorted Input SAM/BAM files: %s is must be sorted in %s order but order was: %s", file, order, header.getSortOrder())); + } + + public MissortedBAM(SAMFileHeader.SortOrder order, String message) { + super(String.format("Missorted Input SAM/BAM files: files are not sorted in %s order; %s", order, message)); + } + + public MissortedBAM(SAMFileHeader.SortOrder order, SAMRecord read, String message) { + super(String.format("Missorted Input SAM/BAM file %s: file sorted in %s order but %s is required; %s", + read.getFileSource().getReader(), read.getHeader().getSortOrder(), order, message)); + } + + public MissortedBAM(String message) { + super(String.format("Missorted Input SAM/BAM files: %s", message)); + } + } + + public static class MalformedBAM extends UserException { + public MalformedBAM(SAMRecord read, String message) { + this(read.getFileSource() != null ? read.getFileSource().getReader().toString() : "(none)", message); + } + + public MalformedBAM(File file, String message) { + this(file.toString(), message); + } + + public MalformedBAM(String source, String message) { + super(String.format("SAM/BAM file %s is malformed: %s", source, message)); + } + } + + public static class MisencodedBAM extends UserException { + public MisencodedBAM(SAMRecord read, String message) { + this(read.getFileSource() != null ? 
read.getFileSource().getReader().toString() : "(none)", message); + } + + public MisencodedBAM(String source, String message) { + super(String.format("SAM/BAM file %s appears to be using the wrong encoding for quality scores: %s; please see the GATK --help documentation for options related to this error", source, message)); + } + } + + public static class MalformedVCF extends UserException { + public MalformedVCF(String message, String line) { + super(String.format("The provided VCF file is malformed at line %s: %s", line, message)); + } + + public MalformedVCF(String message) { + super(String.format("The provided VCF file is malformed: %s", message)); + } + + public MalformedVCF(String message, int lineNo) { + super(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); + } + } + + public static class MalformedBCF2 extends UserException { + public MalformedBCF2( String message ) { + super(String.format("Malformed BCF2 file: %s", message)); + } + } + + public static class MalformedVCFHeader extends UserException { + public MalformedVCFHeader(String message) { + super(String.format("The provided VCF file has a malformed header: %s", message)); + } + } + + public static class ReadMissingReadGroup extends MalformedBAM { + public ReadMissingReadGroup(final SAMRecord read) { + super(read, String.format("Read %s is missing the read group (RG) tag, which is required by the GATK. Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName())); + } + } + + public static class ReadHasUndefinedReadGroup extends MalformedBAM { + public ReadHasUndefinedReadGroup(final SAMRecord read, final String rgID) { + super(read, String.format("Read %s uses a read group (%s) that is not defined in the BAM header, which is not valid. 
Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName(), rgID)); + } + } + + public static class VariantContextMissingRequiredField extends UserException { + public VariantContextMissingRequiredField(String field, VariantContext vc) { + super(String.format("Variant at %s:%d is is missing the required field %s", vc.getChr(), vc.getStart(), field)); + } + } + + public static class MissortedFile extends UserException { + public MissortedFile(File file, String message, Exception e) { + super(String.format("Missorted Input file: %s is must be sorted in coordinate order. %s and got error %s", file, message, getMessage(e))); + } + } + + public static class FailsStrictValidation extends UserException { + public FailsStrictValidation(File f, String message) { + super(String.format("File %s fails strict validation: %s", f.getAbsolutePath(), message)); + } + } + + public static class MalformedFile extends UserException { + public MalformedFile(String message) { + super(String.format("Unknown file is malformed: %s", message)); + } + + public MalformedFile(String message, Exception e) { + super(String.format("Unknown file is malformed: %s caused by %s", message, getMessage(e))); + } + + public MalformedFile(File f, String message) { + super(String.format("File %s is malformed: %s", f.getAbsolutePath(), message)); + } + + public MalformedFile(File f, String message, Exception e) { + super(String.format("File %s is malformed: %s caused by %s", f.getAbsolutePath(), message, getMessage(e))); + } + + public MalformedFile(String name, String message) { + super(String.format("File associated with name %s is malformed: %s", name, message)); + } + + public MalformedFile(String name, String message, Exception e) { + super(String.format("File associated with name %s is malformed: %s caused by %s", name, message, getMessage(e))); + } + } + + public static class CannotExecuteRScript extends UserException { + 
public CannotExecuteRScript(String message) { + super(String.format("Unable to execute RScript command: " + message)); + } + public CannotExecuteRScript(String message, Exception e) { + super(String.format("Unable to execute RScript command: " + message), e); + } + } + + public static class DeprecatedArgument extends CommandLineException { + public DeprecatedArgument(String param, String doc) { + super(String.format("The parameter %s is deprecated. %s",param,doc)); + } + } + + + public static class IncompatibleSequenceDictionaries extends UserException { + public IncompatibleSequenceDictionaries(String message, String name1, SAMSequenceDictionary dict1, String name2, SAMSequenceDictionary dict2) { + super(String.format("Input files %s and %s have incompatible contigs: %s.\n %s contigs = %s\n %s contigs = %s", + name1, name2, message, name1, ReadUtils.prettyPrintSequenceRecords(dict1), name2, ReadUtils.prettyPrintSequenceRecords(dict2))); + } + } + + public static class LexicographicallySortedSequenceDictionary extends UserException { + public LexicographicallySortedSequenceDictionary(String name, SAMSequenceDictionary dict) { + super(String.format("Lexicographically sorted human genome sequence detected in %s." + + "\nFor safety's sake the GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs." + + "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files." 
+ + "\nYou can use the ReorderSam utility to fix this problem: " + HelpConstants.forumPost("discussion/58/companion-utilities-reordersam") + + "\n %s contigs = %s", + name, name, ReadUtils.prettyPrintSequenceRecords(dict))); + } + } + + public static class DeprecatedWalker extends UserException { + public DeprecatedWalker(String walkerName, String version) { + super(String.format("Walker %s is no longer available in the GATK; it has been deprecated since version %s", walkerName, version)); + } + } + + public static class DeprecatedAnnotation extends UserException { + public DeprecatedAnnotation(String annotationName, String version) { + super(String.format("Annotation %s is no longer available in the GATK; it has been deprecated since version %s", annotationName, version)); + } + } + + public static class CannotExecuteQScript extends UserException { + public CannotExecuteQScript(String message) { + super(String.format("Unable to execute QScript: " + message)); + } + public CannotExecuteQScript(String message, Exception e) { + super(String.format("Unable to execute QScript: " + message), e); + } + } + + public static class CannotHandleGzippedRef extends UserException { + public CannotHandleGzippedRef() { + super("The GATK cannot process compressed (.gz) reference sequences. Please unzip the file and try again. Sorry for the inconvenience."); + } + } + + public static class MissingReferenceFaiFile extends UserException { + public MissingReferenceFaiFile( final File indexFile, final File fastaFile ) { + super(String.format("Fasta index file %s for reference %s does not exist. 
Please see %s for help creating it.", + indexFile.getAbsolutePath(), fastaFile.getAbsolutePath(), + HelpConstants.forumPost("discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference"))); + } + } + + public static class MissingReferenceDictFile extends UserException { + public MissingReferenceDictFile( final File dictFile, final File fastaFile ) { + super(String.format("Fasta dict file %s for reference %s does not exist. Please see %s for help creating it.", + dictFile.getAbsolutePath(), fastaFile.getAbsolutePath(), + HelpConstants.forumPost("discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference"))); + } + } + + public static class UnreadableKeyException extends UserException { + public UnreadableKeyException ( File f, Exception e ) { + super(String.format("Key file %s cannot be read (possibly the key file is corrupt?). Error was: %s. " + + "Please see %s for help.", + f.getAbsolutePath(), getMessage(e), PHONE_HOME_DOCS_URL)); + } + + public UnreadableKeyException ( String message, Exception e ) { + this(String.format("%s. Error was: %s", message, getMessage(e))); + } + + public UnreadableKeyException ( String message ) { + super(String.format("Key file cannot be read (possibly the key file is corrupt?): %s. " + + "Please see %s for help.", + message, PHONE_HOME_DOCS_URL)); + } + } + + public static class KeySignatureVerificationException extends UserException { + public KeySignatureVerificationException ( File f ) { + super(String.format("The signature in key file %s failed cryptographic verification. " + + "If this key was valid in the past, it's likely been revoked. " + + "Please see %s for help.", + f.getAbsolutePath(), PHONE_HOME_DOCS_URL)); + } + } + + public static class GVCFIndexException extends UserException { + public GVCFIndexException (GATKVCFIndexType indexType, int indexParameter) { + super(String.format("GVCF output requires a specific indexing strategy. 
Please re-run including the arguments " + + "-variant_index_type %s -variant_index_parameter %d.", + indexType, indexParameter)); + } + } + + /** + * A special exception that happens only in the case where + * the filesystem, by design or configuration, is completely unable + * to handle locking. This exception will specifically NOT be thrown + * in the case where the filesystem handles locking but is unable to + * acquire a lock due to concurrency. + */ + public static class FileSystemInabilityToLockException extends UserException { + public FileSystemInabilityToLockException( String message ) { + super(message); + } + + public FileSystemInabilityToLockException( String message, Exception innerException ) { + super(message,innerException); + } + } + + public static class IncompatibleRecalibrationTableParameters extends UserException { + public IncompatibleRecalibrationTableParameters(String s) { + super(s); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/ArtificialFastaUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fasta/ArtificialFastaUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/fasta/ArtificialFastaUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fasta/ArtificialFastaUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fasta/package-info.java similarity 
index 100% rename from public/java/src/org/broadinstitute/sting/utils/fasta/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fasta/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/utils/file/FSLockWithShared.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/file/FSLockWithShared.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/file/FSLockWithShared.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/file/FSLockWithShared.java diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentCollection.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fragments/FragmentCollection.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/fragments/FragmentCollection.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fragments/FragmentCollection.java diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fragments/FragmentUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fragments/FragmentUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/DiploidGenotype.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/DiploidGenotype.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/genotyper/DiploidGenotype.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/DiploidGenotype.java diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java new file mode 100644 index 000000000..49ec6f20a --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -0,0 +1,384 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.genotyper; + + +import com.google.java.contract.Ensures; +import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.util.*; + +/** + * Wrapper class that holds a set of maps of the form (Read -> Map(Allele->Double)) + * For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood. + */ +public class PerReadAlleleLikelihoodMap { + /** A set of all of the allele, so we can efficiently determine if an allele is already present */ + private final Set allelesSet = new HashSet<>(); + /** A list of the unique allele, as an ArrayList so we can call get(i) efficiently */ + protected final List alleles = new ArrayList<>(); + protected final Map> likelihoodReadMap = new LinkedHashMap<>(); + + public PerReadAlleleLikelihoodMap() { } + + /** + * Add a new entry into the Read -> ( Allele -> Likelihood ) map of maps. 
+ * @param read - the GATKSAMRecord that was evaluated + * @param a - the Allele against which the GATKSAMRecord was evaluated + * @param likelihood - the likelihood score resulting from the evaluation of "read" against "a" + */ + public void add(final GATKSAMRecord read, final Allele a, final Double likelihood) { + if ( read == null ) throw new IllegalArgumentException("Cannot add a null read to the allele likelihood map"); + if ( a == null ) throw new IllegalArgumentException("Cannot add a null allele to the allele likelihood map"); + if ( likelihood == null ) throw new IllegalArgumentException("Likelihood cannot be null"); + if ( likelihood > 0.0 ) throw new IllegalArgumentException("Likelihood must be negative (L = log(p))"); + + Map likelihoodMap = likelihoodReadMap.get(read); + if (likelihoodMap == null){ + // LinkedHashMap will ensure iterating through alleles will be in consistent order + likelihoodMap = new LinkedHashMap<>(); + likelihoodReadMap.put(read,likelihoodMap); + } + + likelihoodMap.put(a,likelihood); + + if (!allelesSet.contains(a)) { + allelesSet.add(a); + alleles.add(a); + } + } + + public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) { + return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction); + } + + /** + * For each allele "a" , identify those reads whose most likely allele is "a", and remove a "downsamplingFraction" proportion + * of those reads from the "likelihoodReadMap". This is used for e.g. sample contamination + * @param downsamplingFraction - the fraction of supporting reads to remove from each allele. If <=0 all reads kept, if >=1 all reads tossed. 
+ */ + public void performPerAlleleDownsampling(final double downsamplingFraction) { + // special case removal of all or no reads + if ( downsamplingFraction <= 0.0 ) + return; + if ( downsamplingFraction >= 1.0 ) { + likelihoodReadMap.clear(); + return; + } + + // start by stratifying the reads by the alleles they represent at this position + final Map> alleleReadMap = getAlleleStratifiedReadMap(); + + // compute the reads to remove and actually remove them + final List readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction); + for ( final GATKSAMRecord read : readsToRemove ) + likelihoodReadMap.remove(read); + } + + /** + * Convert the @likelihoodReadMap to a map of alleles to reads, where each read is mapped uniquely to the allele + * for which it has the greatest associated likelihood + * @return a map from each allele to a list of reads that 'support' the allele + */ + protected Map> getAlleleStratifiedReadMap() { + final Map> alleleReadMap = new HashMap<>(alleles.size()); + for ( final Allele allele : alleles ) + alleleReadMap.put(allele, new ArrayList()); + + for ( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { + final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue()); + if ( bestAllele.isInformative() ) + alleleReadMap.get(bestAllele.getMostLikelyAllele()).add(entry.getKey()); + } + + return alleleReadMap; + } + + @Ensures("result >=0") + public int size() { + return likelihoodReadMap.size(); + } + + /** + * Helper function to add the read underneath a pileup element to the map + * @param p Pileup element + * @param a Corresponding allele + * @param likelihood Allele likelihood + */ + public void add(PileupElement p, Allele a, Double likelihood) { + if (p==null) + throw new IllegalArgumentException("Pileup element cannot be null"); + if ( p.getRead()==null ) + throw new IllegalArgumentException("Read underlying pileup element cannot be null"); + if ( a == null ) + throw new 
IllegalArgumentException("Allele for add() cannot be null"); + + add(p.getRead(), a, likelihood); + } + + /** + * Does the current map contain the key associated with a particular SAM record in pileup? + * @param p Pileup element + * @return true if the map contains pileup element, else false + */ + public boolean containsPileupElement(final PileupElement p) { + return likelihoodReadMap.containsKey(p.getRead()); + } + + public boolean isEmpty() { + return likelihoodReadMap.isEmpty(); + } + + public Map> getLikelihoodReadMap() { + return likelihoodReadMap; + } + + public void clear() { + allelesSet.clear(); + alleles.clear(); + likelihoodReadMap.clear(); + } + + public Set getStoredElements() { + return likelihoodReadMap.keySet(); + } + +// public Collection> getLikelihoodMapValues() { +// return likelihoodReadMap.values(); +// } + + public int getNumberOfStoredElements() { + return likelihoodReadMap.size(); + } + + public Map getLikelihoodsAssociatedWithPileupElement(final PileupElement p) { + if (!likelihoodReadMap.containsKey(p.getRead())) + return null; + + return likelihoodReadMap.get(p.getRead()); + } + + + /** + * Get the log10 likelihood associated with an individual read/allele + * + * @param read the read whose likelihood we want + * @param allele the allele whose likelihood we want + * @return the log10 likelihood that this read matches this allele + */ + public double getLikelihoodAssociatedWithReadAndAllele(final GATKSAMRecord read, final Allele allele){ + if (!allelesSet.contains(allele) || !likelihoodReadMap.containsKey(read)) + return 0.0; + + return likelihoodReadMap.get(read).get(allele); + } + + /** + * Get the most likely alleles estimated across all reads in this object + * + * Takes the most likely two alleles according to their diploid genotype likelihoods. That is, for + * each allele i and j we compute p(D | i,j) where D is the read likelihoods. 
We track the maximum + * i,j likelihood and return an object that contains the alleles i and j as well as the max likelihood. + * + * Note that the second most likely diploid genotype is not tracked so the resulting MostLikelyAllele + * doesn't have a meaningful get best likelihood. + * + * @return a MostLikelyAllele object, or null if this map is empty + */ + public MostLikelyAllele getMostLikelyDiploidAlleles() { + if ( isEmpty() ) return null; + + int hap1 = 0; + int hap2 = 0; + double maxElement = Double.NEGATIVE_INFINITY; + for( int iii = 0; iii < alleles.size(); iii++ ) { + final Allele iii_allele = alleles.get(iii); + for( int jjj = 0; jjj <= iii; jjj++ ) { + final Allele jjj_allele = alleles.get(jjj); + + double haplotypeLikelihood = 0.0; + for( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + final GATKSAMRecord read = entry.getKey(); + final double likelihood_iii = entry.getValue().get(iii_allele); + final double likelihood_jjj = entry.getValue().get(jjj_allele); + haplotypeLikelihood += MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + MathUtils.LOG_ONE_HALF; + + // fast exit. If this diploid pair is already worse than the max, just stop and look at the next pair + if ( haplotypeLikelihood < maxElement ) break; + } + + // keep track of the max element and associated indices + if ( haplotypeLikelihood > maxElement ) { + hap1 = iii; + hap2 = jjj; + maxElement = haplotypeLikelihood; + } + } + } + + if ( maxElement == Double.NEGATIVE_INFINITY ) + throw new IllegalStateException("max likelihood is " + maxElement + " indicating something has gone wrong"); + + return new MostLikelyAllele(alleles.get(hap1), alleles.get(hap2), maxElement, maxElement); + } + + /** + * Given a map from alleles to likelihoods, find the allele with the largest likelihood. 
+ * + * @param alleleMap - a map from alleles to likelihoods + * @return - a MostLikelyAllele object + */ + @Ensures("result != null") + public static MostLikelyAllele getMostLikelyAllele( final Map alleleMap ) { + return getMostLikelyAllele(alleleMap, null); + } + + /** + * Given a map from alleles to likelihoods, find the allele with the largest likelihood. + * + * @param alleleMap - a map from alleles to likelihoods + * @param onlyConsiderTheseAlleles if not null, we will only consider alleles in this set for being one of the best. + * this is useful for the case where you've selected a subset of the alleles that + * the reads have been computed for further analysis. If null totally ignored + * @return - a MostLikelyAllele object + */ + public static MostLikelyAllele getMostLikelyAllele( final Map alleleMap, final Set onlyConsiderTheseAlleles ) { + if ( alleleMap == null ) throw new IllegalArgumentException("The allele to likelihood map cannot be null"); + double maxLike = Double.NEGATIVE_INFINITY; + double prevMaxLike = Double.NEGATIVE_INFINITY; + Allele mostLikelyAllele = Allele.NO_CALL; + Allele secondMostLikely = null; + + for (final Map.Entry el : alleleMap.entrySet()) { + if ( onlyConsiderTheseAlleles != null && ! 
onlyConsiderTheseAlleles.contains(el.getKey()) ) + continue; + + if (el.getValue() > maxLike) { + prevMaxLike = maxLike; + maxLike = el.getValue(); + secondMostLikely = mostLikelyAllele; + mostLikelyAllele = el.getKey(); + } else if( el.getValue() > prevMaxLike ) { + secondMostLikely = el.getKey(); + prevMaxLike = el.getValue(); + } + } + + return new MostLikelyAllele(mostLikelyAllele, secondMostLikely, maxLike, prevMaxLike); + } + + /** + * Debug method to dump contents of object into string for display + */ + public String toString() { + final StringBuilder sb = new StringBuilder(); + + sb.append("Alelles in map:"); + for (final Allele a:alleles) { + sb.append(a.getDisplayString()+","); + } + sb.append("\n"); + for (final Map.Entry > el : getLikelihoodReadMap().entrySet() ) { + for (final Map.Entry eli : el.getValue().entrySet()) { + sb.append("Read "+el.getKey().getReadName()+". Allele:"+eli.getKey().getDisplayString()+" has likelihood="+Double.toString(eli.getValue())+"\n"); + } + + } + return sb.toString(); + } + + /** + * Remove reads from this map that are poorly modelled w.r.t. their per allele likelihoods + * + * Goes through each read in this map, and if it is poorly modelled removes it from the map. + * + * @see #readIsPoorlyModelled(org.broadinstitute.sting.utils.sam.GATKSAMRecord, java.util.Collection, double) + * for more information about the poorly modelled test. 
+ * + * @param maxErrorRatePerBase see equivalent parameter in #readIsPoorlyModelled + * @return the list of reads removed from this map because they are poorly modelled + */ + public List filterPoorlyModelledReads(final double maxErrorRatePerBase) { + final List removedReads = new LinkedList<>(); + final Iterator>> it = likelihoodReadMap.entrySet().iterator(); + while ( it.hasNext() ) { + final Map.Entry> record = it.next(); + if ( readIsPoorlyModelled(record.getKey(), record.getValue().values(), maxErrorRatePerBase) ) { + it.remove(); + removedReads.add(record.getKey()); + } + } + + return removedReads; + } + + /** + * Is this read poorly modelled by all of the alleles in this map? + * + * A read is poorly modeled when it's likelihood is below what would be expected for a read + * originating from one of the alleles given the maxErrorRatePerBase of the reads in general. + * + * This function makes a number of key assumptions. First, that the likelihoods reflect the total likelihood + * of the read. In other words, that the read would be fully explained by one of the alleles. This means + * that the allele should be something like the full haplotype from which the read might originate. + * + * It further assumes that each error in the read occurs with likelihood of -3 (Q30 confidence per base). So + * a read with a 10% error rate with Q30 bases that's 100 bp long we'd expect to see 10 real Q30 errors + * even against the true haplotype. So for this read to be well modelled by at least one allele we'd expect + * a likelihood to be >= 10 * -3. + * + * @param read the read we want to evaluate + * @param log10Likelihoods a list of the log10 likelihoods of the read against a set of haplotypes. + * @param maxErrorRatePerBase the maximum error rate we'd expect for this read per base, in real space. 
So + * 0.01 means a 1% error rate + * @return true if none of the log10 likelihoods imply that the read truly originated from one of the haplotypes + */ + protected boolean readIsPoorlyModelled(final GATKSAMRecord read, final Collection log10Likelihoods, final double maxErrorRatePerBase) { + final double maxErrorsForRead = Math.min(2.0, Math.ceil(read.getReadLength() * maxErrorRatePerBase)); + final double log10QualPerBase = -4.0; + final double log10MaxLikelihoodForTrueAllele = maxErrorsForRead * log10QualPerBase; + + for ( final double log10Likelihood : log10Likelihoods ) + if ( log10Likelihood >= log10MaxLikelihoodForTrueAllele ) + return false; + + return true; + } + + /** + * Get an unmodifiable set of the unique alleles in this PerReadAlleleLikelihoodMap + * @return a non-null unmodifiable map + */ + public Set getAllelesSet() { + return Collections.unmodifiableSet(allelesSet); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/EventMap.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/EventMap.java diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/Haplotype.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/Haplotype.java diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparator.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/help/ApplicationDetails.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/ApplicationDetails.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/ApplicationDetails.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/ApplicationDetails.java diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocletUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocletUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/DocletUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocletUtils.java diff --git 
a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java new file mode 100644 index 000000000..0afcdae02 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java @@ -0,0 +1,50 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.help; + +import java.lang.annotation.*; + +/** + * An annotation to identify a class as a GATK capability for documentation + * + * @author depristo + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface DocumentedGATKFeature { + /** Should we actually document this feature, even though it's annotated? 
*/ + public boolean enable() default true; + /** The overall group name (walkers, readfilters) this feature is associated with */ + public String groupName(); + /** A human readable summary of the purpose of this group of features */ + public String summary() default ""; + /** Are there links to other docs that we should include? CommandLineGATK.class for walkers, for example? */ + public Class[] extraDocs() default {}; + /** Who is the go-to developer for operation/documentation issues? */ + public String gotoDev() default "NA"; +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java new file mode 100644 index 000000000..ad0959bfe --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java @@ -0,0 +1,61 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies 
or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.help; + +/** + * Documentation unit. Effectively a class version of the DocumentedGATKFeature. + * Immutable data structure. + * + * @author depristo + */ +class DocumentedGATKFeatureObject { + /** Which class are we documenting. Specific to each class being documented */ + private final Class classToDoc; + /** Are we enabled? */ + private final boolean enable; + private final String groupName, summary, gotoDev; + private final Class[] extraDocs; + + public DocumentedGATKFeatureObject(Class classToDoc, final boolean enable, final String groupName, final String summary, final Class[] extraDocs, final String gotoDev) { + this.classToDoc = classToDoc; + this.enable = enable; + this.groupName = groupName; + this.summary = summary; + this.extraDocs = extraDocs; + this.gotoDev = gotoDev; + } + + public DocumentedGATKFeatureObject(Class classToDoc, final String groupName, final String summary, final String gotoDev) { + this(classToDoc, true, groupName, summary, new Class[]{}, gotoDev); + } + + public Class getClassToDoc() { return classToDoc; } + public boolean enable() { return enable; } + public String groupName() { return groupName; } + public String summary() { return summary; } + public Class[] extraDocs() { return extraDocs; } + public String gotoDev() { return gotoDev; } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/ForumAPIUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/ForumAPIUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/help/ForumDiscussion.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/ForumDiscussion.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/ForumDiscussion.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/ForumDiscussion.java diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GATKDocUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GATKDocUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GATKDoclet.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GATKDoclet.java new file mode 100644 index 000000000..f0166bc9c --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GATKDoclet.java @@ -0,0 +1,538 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated 
documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.help; + +import com.sun.javadoc.ClassDoc; +import com.sun.javadoc.RootDoc; +import freemarker.template.Configuration; +import freemarker.template.DefaultObjectWrapper; +import freemarker.template.Template; +import freemarker.template.TemplateException; +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.broad.tribble.FeatureCodec; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.walkers.qc.DocumentationTest; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; + +import java.io.*; +import java.util.*; + +/** + * Javadoc Doclet that combines javadoc, GATK ParsingEngine annotations, and FreeMarker + * templates to produce html formatted GATKDocs for walkers + * and other classes. + *

+ * This document has the following workflow: + *

+ * 1 -- walk the javadoc hierarchy, looking for class that have the + * DocumentedGATKFeature annotation or are in the type hierarchy in the + * static list of things to document, and are to be documented + * 2 -- construct for each a GATKDocWorkUnit, resulting in the complete + * set of things to document + * 3 -- for each unit, actually generate an html page documenting it + * as well as links to related features via their units. Writing + * of a specific class HTML is accomplished by a generate DocumentationHandler + * 4 -- write out an index of all units, organized by group + *

+ * The documented classes are restricted to only those with @DocumentedGATKFeature + * annotation or are in the STATIC_DOCS class. + */ +public class GATKDoclet { + final protected static Logger logger = Logger.getLogger(GATKDoclet.class); + + /** + * Where we find the help FreeMarker templates + */ + final protected static File SETTINGS_DIR = new File("settings/helpTemplates"); + + /** + * Where we write the GATKDoc html directory + */ + final protected static File DESTINATION_DIR = new File("gatkdocs"); + + final private static String FORUM_KEY_PATH = "/local/gsa-engineering/gatkdocs_publisher/forum.key"; + // ---------------------------------------------------------------------- + // + // Global variables that are set on the command line by javadoc + // + // ---------------------------------------------------------------------- + protected static File settingsDir = SETTINGS_DIR; + protected static File destinationDir = DESTINATION_DIR; + protected static String forumKeyPath = FORUM_KEY_PATH; + protected static String buildTimestamp = null, absoluteVersion = null; + protected static boolean showHiddenFeatures = false; + + protected static boolean testOnly = false; + + /** + * Any class that's in this list will be included in the documentation + * when the -test argument is provided. Useful for debugging. + */ + private static final List> testOnlyKeepers = Arrays.asList( + DocumentationTest.class, CommandLineGATK.class, UserException.class); + + /** + * The javadoc root doc + */ + RootDoc rootDoc; + + /** + * The set of all things we are going to document + */ + Set myWorkUnits; + + /** + * A static list of DocumentedGATKFeatureObjects. Any class that is as or extends + * one of the DocumentedGATKFeatureObjects.clazz of this collection will also + * be documented, even if it doesn't have the @DocumentedGATKFeature annotation. 
Useful + * when you want to document things that implement an interface (annotations on java + * interfaces aren't inherited) or whose base class isn't under your control (tribble + * codecs). + */ + final static Collection STATIC_DOCS = new ArrayList(); + + static { + STATIC_DOCS.add(new DocumentedGATKFeatureObject(FeatureCodec.class, + HelpConstants.DOCS_CAT_RODCODECS, + "Tribble codecs for reading reference ordered data (ROD) files such as VCF or BED", + "NA")); + } + + + /** + * Extracts the contents of certain types of javadoc and adds them to an XML file. + * + * @param rootDoc The documentation root. + * @return Whether the JavaDoc run succeeded. + * @throws java.io.IOException if output can't be written. + */ + public static boolean start(RootDoc rootDoc) throws IOException { + logger.setLevel(Level.INFO); + + // load arguments + for (String[] options : rootDoc.options()) { + if (options[0].equals("-settings-dir")) + settingsDir = new File(options[1]); + if (options[0].equals("-destination-dir")) + destinationDir = new File(options[1]); + if (options[0].equals("-forum-key-path")) + forumKeyPath = options[1]; + if (options[0].equals("-build-timestamp")) + buildTimestamp = options[1]; + if (options[0].equals("-absolute-version")) + absoluteVersion = options[1]; + if (options[0].equals("-include-hidden")) + showHiddenFeatures = true; + if (options[0].equals("-test")) + testOnly = true; + } + + if (!settingsDir.exists()) + throw new RuntimeException("-settings-dir " + settingsDir.getPath() + " does not exist"); + else if (!settingsDir.isDirectory()) + throw new RuntimeException("-settings-dir " + settingsDir.getPath() + " is not a directory"); + + // process the docs + new GATKDoclet().processDocs(rootDoc); + + + return true; + } + + /** + * Validate the given options against options supported by this doclet. + * + * @param option Option to validate. + * @return Number of potential parameters; 0 if not supported. 
+ */ + public static int optionLength(String option) { + if (option.equals("-settings-dir") || + option.equals("-destination-dir") || + option.equals("-forum-key-path") || + option.equals("-build-timestamp") || + option.equals("-absolute-version") || + option.equals("-include-hidden")) { + return 2; + } else if (option.equals("-test")) + return 1; + else + return 0; + } + + /** + * Are we supposed to include @Hidden annotations in our documented output? + * + * @return + */ + public boolean showHiddenFeatures() { + return showHiddenFeatures; + } + + /** + * @param rootDoc + */ + private void processDocs(RootDoc rootDoc) { + // setup the global access to the root + this.rootDoc = rootDoc; + + try { + // basic setup + destinationDir.mkdirs(); + FileUtils.copyFile(new File(settingsDir + "/bootstrap.min.css"), new File(destinationDir + "/bootstrap.min.css")); + FileUtils.copyFile(new File(settingsDir + "/bootstrap.min.js"), new File(destinationDir + "/bootstrap.min.js")); + FileUtils.copyFile(new File(settingsDir + "/jquery.min.js"), new File(destinationDir + "/jquery.min.js")); + // print the Version number + FileUtils.writeByteArrayToFile(new File(destinationDir + "/current.version.txt"), getSimpleVersion(absoluteVersion).getBytes()); + + /* ------------------------------------------------------------------- */ + /* You should do this ONLY ONCE in the whole application life-cycle: */ + + Configuration cfg = new Configuration(); + // Specify the data source where the template files come from. + cfg.setDirectoryForTemplateLoading(settingsDir); + // Specify how templates will see the data-model. This is an advanced topic... 
+ cfg.setObjectWrapper(new DefaultObjectWrapper()); + + myWorkUnits = computeWorkUnits(); + + List> groups = new ArrayList>(); + Set seenDocumentationFeatures = new HashSet(); + List> data = new ArrayList>(); + for (GATKDocWorkUnit workUnit : myWorkUnits) { + data.add(workUnit.indexDataMap()); + if (!seenDocumentationFeatures.contains(workUnit.annotation.groupName())) { + groups.add(toMap(workUnit.annotation)); + seenDocumentationFeatures.add(workUnit.annotation.groupName()); + } + } + + for (GATKDocWorkUnit workUnit : myWorkUnits) { + processDocWorkUnit(cfg, workUnit, groups, data); + } + + processIndex(cfg, new ArrayList(myWorkUnits)); + + File forumKeyFile = new File(forumKeyPath); + if (forumKeyFile.exists()) { + String forumKey = null; + // Read in a one-line file so we can do a for loop + for (String line : new XReadLines(forumKeyFile)) + forumKey = line; + updateForum(myWorkUnits, forumKey); + } + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private void updateForum(Set docWorkUnits, String forumKey) { + //first get list of posts that need to be added + List old = ForumAPIUtils.getPostedTools(forumKey); + + for (String s : old) + System.out.println(s); + + System.out.printf("Forum has %d items%n", old.size()); + System.out.printf("Docs have %d items%n", docWorkUnits.size()); + + List toAdd = new ArrayList(); + for (GATKDocWorkUnit tool : docWorkUnits) { + if (!old.contains(tool.name)) { + System.out.println("WILL POST: " + tool.name + " TO FORUM"); + toAdd.add(tool); + } + } + + //update using list + for (GATKDocWorkUnit tool : toAdd) { + //if ( tool.name.equals("ApplyRecalibration") ) + ForumAPIUtils.postToForum(tool, forumKey); + } + } + + /** + * Returns the set of all GATKDocWorkUnits that we are going to generate docs for. 
+ * + * @return + */ + private Set computeWorkUnits() { + TreeSet m = new TreeSet(); + + for (ClassDoc doc : rootDoc.classes()) { + //logger.debug("Considering " + doc); + Class clazz = getClassForClassDoc(doc); + + // don't add anything that's not DocumentationTest if we are in test mode + if (clazz != null && testOnly && !testOnlyKeepers.contains(clazz)) + continue; + + //if ( clazz != null && clazz.getName().equals("org.broadinstitute.sting.gatk.walkers.annotator.AlleleBalance")) + // logger.debug("foo"); + + DocumentedGATKFeatureObject feature = getFeatureForClassDoc(doc); + DocumentedGATKFeatureHandler handler = createHandler(doc, feature); + if (handler != null && handler.includeInDocs(doc)) { + //logger.info("Generating documentation for class " + doc); + String filename = handler.getDestinationFilename(doc, clazz); + GATKDocWorkUnit unit = new GATKDocWorkUnit(doc.name(), + filename, feature.groupName(), feature, handler, doc, clazz, + buildTimestamp, absoluteVersion); + m.add(unit); + } + } + + return m; + } + + /** + * Create a handler capable of documenting the class doc according to feature. Returns + * null if no appropriate handler is found or doc shouldn't be documented at all. + * + * @param doc + * @param feature + * @return + */ + private DocumentedGATKFeatureHandler createHandler(ClassDoc doc, DocumentedGATKFeatureObject feature) { + if (feature != null) { + if (feature.enable()) { + DocumentedGATKFeatureHandler handler = new GenericDocumentationHandler(); + handler.setDoclet(this); + return handler; + } else { + logger.info("Skipping disabled Documentation for " + doc); + } + } + + return null; + } + + /** + * Returns the instantiated DocumentedGATKFeatureObject that describes the GATKDoc + * structure we will apply to Doc. 
+ * + * @param doc + * @return null if this proves inappropriate or doc shouldn't be documented + */ + private DocumentedGATKFeatureObject getFeatureForClassDoc(ClassDoc doc) { + Class docClass = getClassForClassDoc(doc); + + if (docClass == null) + return null; // not annotated so it shouldn't be documented + + if (docClass.isAnnotationPresent(DocumentedGATKFeature.class)) { + DocumentedGATKFeature f = docClass.getAnnotation(DocumentedGATKFeature.class); + return new DocumentedGATKFeatureObject(docClass, f.enable(), f.groupName(), f.summary(), f.extraDocs(), f.gotoDev()); + } else { + for (DocumentedGATKFeatureObject staticDocs : STATIC_DOCS) { + if (staticDocs.getClassToDoc().isAssignableFrom(docClass)) { + return new DocumentedGATKFeatureObject(docClass, staticDocs.enable(), staticDocs.groupName(), staticDocs.summary(), staticDocs.extraDocs(), staticDocs.gotoDev()); + } + } + return null; + } + } + + /** + * Return the Java class described by the ClassDoc doc + * + * @param doc + * @return + */ + private Class getClassForClassDoc(ClassDoc doc) { + try { + // todo -- what do I need the ? extends Object to pass the compiler? + return (Class) DocletUtils.getClassForDoc(doc); + } catch (ClassNotFoundException e) { + //logger.warn("Couldn't find class for ClassDoc " + doc); + // we got a classdoc for a class we can't find. 
Maybe in a library or something + return null; + } catch (NoClassDefFoundError e) { + return null; + } catch (UnsatisfiedLinkError e) { + return null; // naughty BWA bindings + } + } + + /** + * Create the html index listing all of the GATKDocs features + * + * @param cfg + * @param indexData + * @throws IOException + */ + private void processIndex(Configuration cfg, List indexData) throws IOException { + /* Get or create a template */ + Template temp = cfg.getTemplate("generic.index.template.html"); + + /* Merge data-model with template */ + Writer out = new OutputStreamWriter(new FileOutputStream(new File(destinationDir + "/index.html"))); + try { + temp.process(groupIndexData(indexData), out); + out.flush(); + } catch (TemplateException e) { + throw new ReviewedStingException("Failed to create GATK documentation", e); + } + } + + /** + * Helpful function to create the html index. Given all of the already run GATKDocWorkUnits, + * create the high-level grouping data listing individual features by group. + * + * @param indexData + * @return + */ + private Map groupIndexData(List indexData) { + // + // root -> data -> { summary -> y, filename -> z }, etc + // -> groups -> group1, group2, etc. 
+ Map root = new HashMap(); + + Collections.sort(indexData); + + List> groups = new ArrayList>(); + Set seenDocumentationFeatures = new HashSet(); + List> data = new ArrayList>(); + for (GATKDocWorkUnit workUnit : indexData) { + data.add(workUnit.indexDataMap()); + if (!seenDocumentationFeatures.contains(workUnit.annotation.groupName())) { + groups.add(toMap(workUnit.annotation)); + seenDocumentationFeatures.add(workUnit.annotation.groupName()); + } + } + + //System.out.printf(groups.toString()); + + root.put("data", data); + root.put("groups", groups); + root.put("timestamp", buildTimestamp); + root.put("version", absoluteVersion); + + return root; + } + + /** + * Trivial helper routine that returns the map of name and summary given the annotation + * AND adds a super-category so that we can custom-order the categories in the index + * + * @param annotation + * @return + */ + private static final Map toMap(DocumentedGATKFeatureObject annotation) { + Map root = new HashMap(); + root.put("id", annotation.groupName().replaceAll("\\W", "")); + root.put("name", annotation.groupName()); + root.put("summary", annotation.summary()); + + /** + * Add-on super-category definitions. The assignments depend on parsing the names + * defined in HelpConstants.java so be careful of changing anything. + * Also, the super-category value strings need to be the same as used in the + * Freemarker template. This is all fairly clunky but the best I could do without + * making major changes to the DocumentedGATKFeatureObject. Doesn't help that + * Freemarker makes any scripting horribly awkward. 
+ */ + final String supercatValue; + if (annotation.groupName().endsWith(" Tools")) supercatValue = "tools"; + else if (annotation.groupName().endsWith(" Utilities")) supercatValue = "utilities"; + else if (annotation.groupName().startsWith("Engine ")) supercatValue = "engine"; + else if (annotation.groupName().endsWith(" (DevZone)")) supercatValue = "dev"; + else supercatValue = "other"; + + root.put("supercat", supercatValue); + + return root; + } + + /** + * Helper function that finding the GATKDocWorkUnit associated with class from among all of the work units + * + * @param c the class we are looking for + * @return the GATKDocWorkUnit whose .clazz.equals(c), or null if none could be found + */ + public final GATKDocWorkUnit findWorkUnitForClass(Class c) { + for (final GATKDocWorkUnit unit : this.myWorkUnits) + if (unit.clazz.equals(c)) + return unit; + return null; + } + + /** + * Return the ClassDoc associated with clazz + * + * @param clazz + * @return + */ + public ClassDoc getClassDocForClass(Class clazz) { + return rootDoc.classNamed(clazz.getName()); + } + + /** + * High-level function that processes a single DocWorkUnit unit using its handler + * + * @param cfg + * @param unit + * @param data + * @throws IOException + */ + private void processDocWorkUnit(Configuration cfg, GATKDocWorkUnit unit, List> groups, List> data) + throws IOException { + //System.out.printf("Processing documentation for class %s%n", unit.classDoc); + + unit.handler.processOne(unit); + unit.forTemplate.put("groups", groups); + unit.forTemplate.put("data", data); + // Get or create a template + Template temp = cfg.getTemplate(unit.handler.getTemplateName(unit.classDoc)); + + // Merge data-model with template + File outputPath = new File(destinationDir + "/" + unit.filename); + try { + Writer out = new OutputStreamWriter(new FileOutputStream(outputPath)); + temp.process(unit.forTemplate, out); + out.flush(); + } catch (TemplateException e) { + throw new 
ReviewedStingException("Failed to create GATK documentation", e); + } + } + + private static String getSimpleVersion(String absoluteVersion) { + String[] parts = absoluteVersion.split("-"); + + // by skipping i=0, there is no trailing separator + for (int i = 1; i < 2; i++) { + parts[0] = parts[0].concat("-"); + parts[0] = parts[0].concat(parts[i]); + } + + return parts[0]; + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java new file mode 100644 index 000000000..06c0e1c26 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java @@ -0,0 +1,934 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.help; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import com.sun.javadoc.ClassDoc; +import com.sun.javadoc.FieldDoc; +import com.sun.javadoc.Tag; +import org.apache.commons.lang.StringUtils; +import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.classloader.JVMUtils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.io.IOException; +import java.lang.annotation.Annotation; +import java.lang.reflect.*; +import java.util.*; + +/** + * + */ +public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { + private static Logger logger = Logger.getLogger(GenericDocumentationHandler.class); + + /** + * The max. length of the longest of --fullName -shortName argument name + * before we prefer the shorter option. 
+ */ + private static final int MAX_DISPLAY_NAME = 30; + + /** + * The Class we are documenting + */ + private GATKDocWorkUnit toProcess; + + @Override + public boolean includeInDocs(ClassDoc doc) { + try { + Class type = DocletUtils.getClassForDoc(doc); + boolean hidden = !getDoclet().showHiddenFeatures() && type.isAnnotationPresent(Hidden.class); + return !hidden && JVMUtils.isConcrete(type); + } catch (ClassNotFoundException e) { + return false; + } + } + + + @Override + public String getTemplateName(ClassDoc doc) throws IOException { + return "generic.template.html"; + } + + @Override + public void processOne(GATKDocWorkUnit toProcessArg) { + this.toProcess = toProcessArg; + + //System.out.printf("%s class %s%n", toProcess.group, toProcess.classDoc); + Map root = new HashMap(); + + addHighLevelBindings(root); + addArgumentBindings(root); + addRelatedBindings(root); + root.put("group", toProcess.group); + + // Adding in retrieval of peripheral info (rf annotations etc) + getClazzAnnotations(toProcess.clazz, root); + + toProcess.setHandlerContent((String) root.get("summary"), root); + } + + /** + * Add high-level summary information about toProcess to root, such as its + * name, summary, description, version, etc. + * + * @param root + */ + protected void addHighLevelBindings(Map root) { + root.put("name", toProcess.classDoc.name()); + + // Extract overrides from the doc tags. 
+ StringBuilder summaryBuilder = new StringBuilder(); + for (Tag tag : toProcess.classDoc.firstSentenceTags()) + summaryBuilder.append(tag.text()); + root.put("summary", summaryBuilder.toString()); + root.put("description", toProcess.classDoc.commentText().substring(summaryBuilder.toString().length())); + root.put("timestamp", toProcess.buildTimestamp); + root.put("version", toProcess.absoluteVersion); + + for (Tag tag : toProcess.classDoc.tags()) { + root.put(tag.name(), tag.text()); + } + + root.put("gotoDev", toProcess.annotation.gotoDev()); + } + + /** + * Add bindings describing related GATK capabilites to toProcess + * + * @param root + */ + protected void addRelatedBindings(Map root) { + List> extraDocsData = new ArrayList>(); + + // add in all of the explicitly related items + for (final Class extraDocClass : toProcess.annotation.extraDocs()) { + final GATKDocWorkUnit otherUnit = getDoclet().findWorkUnitForClass(extraDocClass); + if (otherUnit == null) + throw new ReviewedStingException("Requested extraDocs for class without any documentation: " + extraDocClass); + extraDocsData.add( + new HashMap() {{ + put("filename", otherUnit.filename); + put("name", otherUnit.name); + }}); + } + root.put("extradocs", extraDocsData); + } + + /** + * Add information about all of the arguments available to toProcess to root + * + * @param root + */ + protected void addArgumentBindings(Map root) { + ParsingEngine parsingEngine = createStandardGATKParsingEngine(); + + Map>> args = createArgumentMap(); + root.put("arguments", args); + try { + // loop over all of the arguments according to the parsing engine + for (final ArgumentSource argumentSource : parsingEngine.extractArgumentSources(DocletUtils.getClassForDoc(toProcess.classDoc))) { + ArgumentDefinition argDef = argumentSource.createArgumentDefinitions().get(0); + FieldDoc fieldDoc = getFieldDoc(toProcess.classDoc, argumentSource.field.getName()); + Map argBindings = docForArgument(fieldDoc, argumentSource, argDef); + 
if (!argumentSource.isHidden() || getDoclet().showHiddenFeatures()) { + final String kind = docKindOfArg(argumentSource); + // Retrieve default value + final Object value = argumentValue(toProcess.clazz, argumentSource); + if (value != null) + argBindings.put("defaultValue", prettyPrintValueString(value)); + // Retrieve min and max / hard and soft value thresholds for numeric args + if (value instanceof Number) { + if (argumentSource.field.isAnnotationPresent(Argument.class)) { + argBindings.put("minValue", argumentSource.field.getAnnotation(Argument.class).minValue()); + argBindings.put("maxValue", argumentSource.field.getAnnotation(Argument.class).maxValue()); + if (argumentSource.field.getAnnotation(Argument.class).minRecommendedValue() != Double.NEGATIVE_INFINITY) { + argBindings.put("minRecValue", argumentSource.field.getAnnotation(Argument.class).minRecommendedValue()); + } + if (argumentSource.field.getAnnotation(Argument.class).maxRecommendedValue() != Double.POSITIVE_INFINITY) { + argBindings.put("maxRecValue", argumentSource.field.getAnnotation(Argument.class).maxRecommendedValue()); + } + } + } + // Finalize argument bindings + args.get(kind).add(argBindings); + args.get("all").add(argBindings); + } + } + + // sort the arguments + for (Map.Entry>> entry : args.entrySet()) { + entry.setValue(sortArguments(entry.getValue())); + } + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + } + + /** + * Return the argument kind (required, advanced, hidden, etc) of this argumentSource + * + * @param argumentSource + * @return + */ + @Requires("argumentSource != null") + @Ensures("result != null") + private String docKindOfArg(ArgumentSource argumentSource) { + if (argumentSource.isRequired()) { + if (argumentSource.isInput()) return "required_in"; + else if (argumentSource.isOutput()) return "required_out"; + else if (argumentSource.isFlag()) return "required_flag"; + else return "required_param"; + } + else if (argumentSource.isAdvanced()) 
{ + if (argumentSource.isInput()) return "advanced_in"; + else if (argumentSource.isOutput()) return "advanced_out"; + else if (argumentSource.isFlag()) return "advanced_flag"; + else return "advanced_param"; + } + else if (argumentSource.isHidden()) return "hidden"; + else if (argumentSource.isDeprecated()) return "deprecated"; + else { + if (argumentSource.isInput()) return "optional_in"; + else if (argumentSource.isOutput()) return "optional_out"; + else if (argumentSource.isFlag()) return "optional_flag"; + else return "optional_param"; + } + } + + /** + * Attempts to determine the value of argumentSource in an instantiated version of c + * + * @param c + * @param argumentSource + * @return value of argumentSource, or null if this isn't possible + */ + @Requires({"c != null", "argumentSource != null"}) + private Object argumentValue(Class c, ArgumentSource argumentSource) { + // get the value of the field + // attempt to instantiate the class + final Object instance = makeInstanceIfPossible(toProcess.clazz); + if (instance != null) { + final Object value = getFieldValue(instance, argumentSource.field.getName()); + if (value != null) + return value; + + if (argumentSource.createsTypeDefault()) { + try { // handle the case where there's an implicit default + return argumentSource.typeDefaultDocString(); + } catch (ReviewedStingException e) { + ; // failed to create type default, don't worry about it + } + } + } + + return null; + } + + /** + * Create the argument map for holding class arguments + * + * @return + */ + private Map>> createArgumentMap() { + Map>> args = new HashMap>>(); + args.put("all", new ArrayList>()); + args.put("required_in", new ArrayList>()); + args.put("required_out", new ArrayList>()); + args.put("required_param", new ArrayList>()); + args.put("required_flag", new ArrayList>()); + args.put("optional_in", new ArrayList>()); + args.put("optional_out", new ArrayList>()); + args.put("optional_param", new ArrayList>()); + 
args.put("optional_flag", new ArrayList>()); + args.put("advanced_in", new ArrayList>()); + args.put("advanced_out", new ArrayList>()); + args.put("advanced_param", new ArrayList>()); + args.put("advanced_flag", new ArrayList>()); + args.put("hidden", new ArrayList>()); + args.put("deprecated", new ArrayList>()); + return args; + } + + + /** + * Sorts the individual argument list in unsorted according to CompareArgumentsByName + * + * @param unsorted + * @return + */ + private List> sortArguments(List> unsorted) { + Collections.sort(unsorted, new CompareArgumentsByName()); + return unsorted; + } + + /** + * Sort arguments by case-insensitive comparison ignoring the -- and - prefixes + */ + private class CompareArgumentsByName implements Comparator> { + public int compare(Map x, Map y) { + return elt(x).compareTo(elt(y)); + } + + private String elt(Map m) { + String v = m.get("name").toString().toLowerCase(); + if (v.startsWith("--")) + return v.substring(2); + else if (v.startsWith("-")) + return v.substring(1); + else + throw new RuntimeException("Expect to see arguments beginning with at least one -, but found " + v); + } + } + + /** + * Umbrella function that groups the collection of values for specific annotations applied to an + * instance of class c. Lists of collected values are added directly to the "toProcess" object. + * Requires being able to instantiate the class. 
+ * + * @param classToProcess the object to instantiate and query for the annotation + * @param root the root of the document handler, to which we'll store collected annotations + */ + private void getClazzAnnotations(Class classToProcess, Map root) { + // + // attempt to instantiate the class + final Object instance = makeInstanceIfPossible(classToProcess); + if (instance != null) { + final Class myClass = instance.getClass(); + // Get parallelism options + final HashSet> parallelOptions = getParallelism(myClass, new HashSet>()); + root.put("parallel", parallelOptions); + // Get annotation info (what type of annotation, standard etc.) + final HashSet annotInfo = getAnnotInfo(myClass, new HashSet()); + root.put("annotinfo", StringUtils.join(annotInfo, ", ")); + // Get annotation field (whether it goes in INFO or FORMAT) + root.put("annotfield", getAnnotField(myClass)); + // Get walker type if applicable + root.put("walkertype", getWalkerType(myClass)); + // Get partition type if applicable + root.put("partitiontype", getPartitionType(myClass)); + // Get read filter annotations (ReadFilters) if applicable + final HashSet> bucket= getReadFilters(myClass, new HashSet>()); + root.put("readfilters", bucket); + // Get default downsampling settings + final HashMap dsSettings = getDownSamplingSettings(myClass, new HashMap()); + root.put("downsampling", dsSettings); + // Get reference window size settings + final HashMap refwindow = getRefWindow(myClass, new HashMap()); + root.put("refwindow", refwindow); + // Get ActiveRegion size settings + final HashMap activeRegion = getActiveRegion(myClass, new HashMap()); + root.put("activeregion", activeRegion); + // anything else? 
+ } else { + // put empty items to avoid blowups + root.put("parallel", new HashSet()); + root.put("annotinfo", ""); + root.put("annotfield", ""); + root.put("walkertype", ""); + root.put("partitiontype", ""); + root.put("readfilters", new HashSet>()); + root.put("downsampling", new HashMap()); + root.put("refwindow", new HashMap()); + root.put("activeregion", new HashMap()); + } + } + + /** + * Utility function that checks which parallelism options are available for an instance of class c. + * + * @param myClass the class to query for the interfaces + * @param parallelOptions an empty HashSet in which to collect the info + * @return a hash set of parallelism options, otherwise an empty set + */ + private HashSet> getParallelism(Class myClass, HashSet> parallelOptions) { + // + // Retrieve interfaces + Class[] implementedInterfaces = myClass.getInterfaces(); + for (Class intfClass : implementedInterfaces) { + final HashMap nugget = new HashMap(); + if (intfClass.getSimpleName().equals("TreeReducible")) { + nugget.put("name", intfClass.getSimpleName()); + nugget.put("arg", HelpConstants.ARG_TREEREDUCIBLE); + nugget.put("link", HelpConstants.CMDLINE_GATK_URL + "#" + HelpConstants.ARG_TREEREDUCIBLE); + } else if (intfClass.getSimpleName().equals("NanoSchedulable")) { + nugget.put("name", intfClass.getSimpleName()); + nugget.put("arg", HelpConstants.ARG_NANOSCHEDULABLE); + nugget.put("link", HelpConstants.CMDLINE_GATK_URL + "#" + HelpConstants.ARG_NANOSCHEDULABLE); + } else { + continue; + } + parallelOptions.add(nugget); + } + // Look up superclasses recursively + final Class mySuperClass = myClass.getSuperclass(); + if (mySuperClass.getSimpleName().equals("Object")) { + return parallelOptions; + } + return getParallelism(mySuperClass, parallelOptions); + } + + /** + * Utility function that looks up whether the annotation goes in INFO or FORMAT field. 
+ * + * @param myClass the class to query for the interfaces + * @return a String specifying the annotation field + */ + private final String getAnnotField(Class myClass) { + // + // Look up superclasses recursively until we find either + // GenotypeAnnotation or InfoFieldAnnotation + final Class mySuperClass = myClass.getSuperclass(); + if (mySuperClass == InfoFieldAnnotation.class) { + return "INFO (variant-level)"; + } else if (mySuperClass == GenotypeAnnotation.class) { + return "FORMAT (sample genotype-level)"; + } else if (mySuperClass.getSimpleName().equals("Object")) { + return ""; + } + return getAnnotField(mySuperClass); + } + + /** + * Utility function that determines the annotation type for an instance of class c. + * + * @param myClass the class to query for the interfaces + * @param annotInfo an empty HashSet in which to collect the info + * @return a hash set of the annotation types, otherwise an empty set + */ + private HashSet getAnnotInfo(Class myClass, HashSet annotInfo) { + // + // Retrieve interfaces + Class[] implementedInterfaces = myClass.getInterfaces(); + for (Class intfClass : implementedInterfaces) { + if (intfClass.getName().contains("Annotation")) { + annotInfo.add(intfClass.getSimpleName()); + } + } + // Look up superclasses recursively + final Class mySuperClass = myClass.getSuperclass(); + if (mySuperClass.getSimpleName().equals("Object")) { + return annotInfo; + } + return getAnnotInfo(mySuperClass, annotInfo); + } + + /** + * Utility function that determines the default downsampling settings for an instance of class c. 
+ * + * @param myClass the class to query for the settings + * @param dsSettings an empty HashMap in which to collect the info + * @return a hash set of the downsampling settings, otherwise an empty set + */ + private HashMap getDownSamplingSettings(Class myClass, HashMap dsSettings) { + // + // Retrieve annotation + if (myClass.isAnnotationPresent(Downsample.class)) { + final Annotation thisAnnotation = myClass.getAnnotation(Downsample.class); + if(thisAnnotation instanceof Downsample) { + final Downsample dsAnnotation = (Downsample) thisAnnotation; + dsSettings.put("by", dsAnnotation.by().toString()); + dsSettings.put("to_cov", dsAnnotation.toCoverage()); + } + } + return dsSettings; + } + + /** + * Utility function that determines the reference window size for an instance of class c. + * + * @param myClass the class to query for the settings + * @param refWindow an empty HashMap in which to collect the info + * @return a HashMap of the window start and stop, otherwise an empty HashMap + */ + private HashMap getRefWindow(Class myClass, HashMap refWindow) { + // + // Retrieve annotation + if (myClass.isAnnotationPresent(Reference.class)) { + final Annotation thisAnnotation = myClass.getAnnotation(Reference.class); + if(thisAnnotation instanceof Reference) { + final Reference refAnnotation = (Reference) thisAnnotation; + refWindow.put("start", refAnnotation.window().start()); + refWindow.put("stop", refAnnotation.window().stop()); + } + } + return refWindow; + } + + /** + * Utility function that determines the ActiveRegion settings for an instance of class c. 
+ * + * @param myClass the class to query for the settings + * @param activeRegion an empty HashMap in which to collect the info + * @return a HashMap of the ActiveRegion parameters, otherwise an empty HashMap + */ + private HashMap getActiveRegion(Class myClass, HashMap activeRegion) { + // + // Retrieve annotation + if (myClass.isAnnotationPresent(ActiveRegionTraversalParameters.class)) { + final Annotation thisAnnotation = myClass.getAnnotation(ActiveRegionTraversalParameters.class); + if(thisAnnotation instanceof ActiveRegionTraversalParameters) { + final ActiveRegionTraversalParameters arAnnotation = (ActiveRegionTraversalParameters) thisAnnotation; + activeRegion.put("ext", arAnnotation.extension()); + activeRegion.put("max", arAnnotation.maxRegion()); + activeRegion.put("min", arAnnotation.minRegion()); + } + } + return activeRegion; + } + + /** + * Utility function that determines the partition type of an instance of class c. + * + * @param myClass the class to query for the annotation + * @return the partition type if applicable, otherwise an empty string + */ + private String getPartitionType(Class myClass) { + // + // Retrieve annotation + if (myClass.isAnnotationPresent(PartitionBy.class)) { + final Annotation thisAnnotation = myClass.getAnnotation(PartitionBy.class); + if(thisAnnotation instanceof PartitionBy) { + final PartitionBy partAnnotation = (PartitionBy) thisAnnotation; + return partAnnotation.value().toString(); + } + } + return ""; + } + + /** + * Utility function that determines the type of walker subclassed by an instance of class c. 
+ * + * @param myClass the class to query for the annotation + * @return the type of walker if applicable, otherwise an empty string + */ + private String getWalkerType(Class myClass) { + // + // Look up superclasses recursively until we find either Walker or Object + final Class mySuperClass = myClass.getSuperclass(); + if (mySuperClass.getSimpleName().equals("Walker")) { + return myClass.getSimpleName(); + } else if (mySuperClass.getSimpleName().equals("Object")) { + return ""; + } + return getWalkerType(mySuperClass); + } + + /** + * Utility function that finds the values of ReadFilters annotation applied to an instance of class c. + * + * @param myClass the class to query for the annotation + * @param bucket a container in which we store the annotations collected + * @return a hash set of values, otherwise an empty set + */ + private HashSet> getReadFilters(Class myClass, HashSet> bucket) { + // + // Retrieve annotation + if (myClass.isAnnotationPresent(ReadFilters.class)) { + final Annotation thisAnnotation = myClass.getAnnotation(ReadFilters.class); + if(thisAnnotation instanceof ReadFilters) { + final ReadFilters rfAnnotation = (ReadFilters) thisAnnotation; + for (Class filter : rfAnnotation.value()) { + // make hashmap of simplename and url + final HashMap nugget = new HashMap(); + nugget.put("name", filter.getSimpleName()); + nugget.put("filename", GATKDocUtils.htmlFilenameForClass(filter)); + bucket.add(nugget); + } + } + } + // Look up superclasses recursively + final Class mySuperClass = myClass.getSuperclass(); + if (mySuperClass.getSimpleName().equals("Object")) { + return bucket; + } + return getReadFilters(mySuperClass, bucket); + } + + + /** + * Utility function that finds the value of fieldName in any fields of ArgumentCollection fields in + * instance of class c. 
+ * + * @param instance the object to query for the field value + * @param fieldName the name of the field we are looking for in instance + * @return The value assigned to field in the ArgumentCollection, otherwise null + */ + private Object getFieldValue(Object instance, String fieldName) { + // + // subtle note. If you have a field named X that is an ArgumentCollection that + // contains a field X as well, you need only consider fields in the argumentCollection, not + // matching the argument itself. + // + // @ArgumentCollection + // protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + // + + for (Field field : JVMUtils.getAllFields(instance.getClass())) { + if (field.isAnnotationPresent(ArgumentCollection.class)) { + //System.out.printf("Searching for %s in argument collection field %s%n", fieldName, field); + Object fieldValue = JVMUtils.getFieldValue(field, instance); + Object value = getFieldValue(fieldValue, fieldName); + if (value != null) + return value; + } else if (field.getName().equals(fieldName)) { + return JVMUtils.getFieldValue(field, instance); + } + } + + return null; + } + + /** + * Pretty prints value + *

+ * Assumes value != null + * + * @param value + * @return + */ + private Object prettyPrintValueString(Object value) { + if (value.getClass().isArray()) { + Class type = value.getClass().getComponentType(); + if (boolean.class.isAssignableFrom(type)) + return Arrays.toString((boolean[]) value); + if (byte.class.isAssignableFrom(type)) + return Arrays.toString((byte[]) value); + if (char.class.isAssignableFrom(type)) + return Arrays.toString((char[]) value); + if (double.class.isAssignableFrom(type)) + return Arrays.toString((double[]) value); + if (float.class.isAssignableFrom(type)) + return Arrays.toString((float[]) value); + if (int.class.isAssignableFrom(type)) + return Arrays.toString((int[]) value); + if (long.class.isAssignableFrom(type)) + return Arrays.toString((long[]) value); + if (short.class.isAssignableFrom(type)) + return Arrays.toString((short[]) value); + if (Object.class.isAssignableFrom(type)) + return Arrays.toString((Object[]) value); + else + throw new RuntimeException("Unexpected array type in prettyPrintValue. Value was " + value + " type is " + type); + } else if (RodBinding.class.isAssignableFrom(value.getClass())) { + // annoying special case to handle the UnBound() constructor + return "none"; + } else if (value instanceof String) { + return value.equals("") ? "\"\"" : value; + } else { + return value.toString(); + } + } + + /** + * Attempt to instantiate class c, if possible. Returns null if this proves impossible. 
+ * + * @param c + * @return + */ + private Object makeInstanceIfPossible(Class c) { + Object instance = null; + try { + // don't try to make something where we will obviously fail + if (!c.isEnum() && !c.isAnnotation() && !c.isAnonymousClass() && + !c.isArray() && !c.isPrimitive() & JVMUtils.isConcrete(c)) { + instance = c.newInstance(); + //System.out.printf("Created object of class %s => %s%n", c, instance); + return instance; + } else + return null; + } catch (IllegalAccessException e) { + } catch (InstantiationException e) { + } catch (ExceptionInInitializerError e) { + } catch (SecurityException e) { + } + // this last one is super dangerous, but some of these methods catch ClassNotFoundExceptions + // and rethrow then as RuntimeExceptions + catch (RuntimeException e) { + } + + return instance; + } + + + /** + * Create an instance of the GATK parsing engine, for argument processing with GATKDoclet + * + * @return + */ + private ParsingEngine createStandardGATKParsingEngine() { + CommandLineProgram clp = new CommandLineGATK(); + try { + CommandLineProgram.start(clp, new String[]{}, true); + return clp.parser; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + /** + * Gets the javadocs associated with field name in classDoc. Throws a + * runtime exception if this proves impossible. 
+ * + * @param classDoc + * @param name + * @return + */ + private FieldDoc getFieldDoc(ClassDoc classDoc, String name) { + return getFieldDoc(classDoc, name, true); + } + + /** + * Recursive helper routine to getFieldDoc() + * + * @param classDoc + * @param name + * @param primary + * @return + */ + private FieldDoc getFieldDoc(ClassDoc classDoc, String name, boolean primary) { + //System.out.printf("Looking for %s in %s%n", name, classDoc.name()); + for (FieldDoc fieldDoc : classDoc.fields(false)) { + //System.out.printf("fieldDoc " + fieldDoc + " name " + fieldDoc.name()); + if (fieldDoc.name().equals(name)) + return fieldDoc; + + Field field = DocletUtils.getFieldForFieldDoc(fieldDoc); + if (field == null) + throw new RuntimeException("Could not find the field corresponding to " + fieldDoc + ", presumably because the field is inaccessible"); + if (field.isAnnotationPresent(ArgumentCollection.class)) { + ClassDoc typeDoc = getRootDoc().classNamed(fieldDoc.type().qualifiedTypeName()); + if (typeDoc == null) + throw new ReviewedStingException("Tried to get javadocs for ArgumentCollection field " + fieldDoc + " but could't find the class in the RootDoc"); + else { + FieldDoc result = getFieldDoc(typeDoc, name, false); + if (result != null) + return result; + // else keep searching + } + } + } + + // if we didn't find it here, wander up to the superclass to find the field + if (classDoc.superclass() != null) { + return getFieldDoc(classDoc.superclass(), name, false); + } + + if (primary) + throw new RuntimeException("No field found for expected field " + name); + else + return null; + } + + /** + * Returns a Pair of (main, synonym) names for argument with fullName s1 and + * shortName s2. + * + * Previously we had it so the main name was selected to be the longest of the two, provided + * it didn't exceed MAX_DISPLAY_NAME, in which case the shorter was taken. 
But we now disable + * the length-based name rearrangement in order to maintain consistency in the GATKDocs table. + * + * This may cause messed up spacing in the CLI-help display but we don't care as much about that + * since more users use the online GATKDocs for looking up arguments. + * + * @param s1 the short argument name without -, or null if not provided + * @param s2 the long argument name without --, or null if not provided + * @return A pair of fully qualified names (with - or --) for the argument. The first + * element is the primary display name while the second (potentially null) is a + * synonymous name. + */ + Pair displayNames(String s1, String s2) { + s1 = s1 == null ? null : "-" + s1; + s2 = s2 == null ? null : "--" + s2; + + if (s1 == null) return new Pair(s2, null); + if (s2 == null) return new Pair(s1, null); + + return new Pair(s2, s1); + } + + /** + * Returns a human readable string that describes the Type type of a GATK argument. + *

+ * This will include parameterized types, so that Set{T} shows up as Set(T) and not + * just Set in the docs. + * + * @param type + * @return + */ + protected String argumentTypeString(Type type) { + if (type instanceof ParameterizedType) { + ParameterizedType parameterizedType = (ParameterizedType) type; + List subs = new ArrayList(); + for (Type actualType : parameterizedType.getActualTypeArguments()) + subs.add(argumentTypeString(actualType)); + return argumentTypeString(((ParameterizedType) type).getRawType()) + "[" + Utils.join(",", subs) + "]"; + } else if (type instanceof GenericArrayType) { + return argumentTypeString(((GenericArrayType) type).getGenericComponentType()) + "[]"; + } else if (type instanceof WildcardType) { + throw new RuntimeException("We don't support wildcards in arguments: " + type); + } else if (type instanceof Class) { + return ((Class) type).getSimpleName(); + } else { + throw new StingException("Unknown type: " + type); + } + } + + /** + * Helper routine that returns the Feature.class required by a RodBinding, + * either T for RodBinding{T} or List{RodBinding{T}}. Returns null if + * the Type doesn't fit either model. + * + * @param type + * @return + */ + protected Class getFeatureTypeIfPossible(Type type) { + if (type instanceof ParameterizedType) { + ParameterizedType paramType = (ParameterizedType) type; + if (RodBinding.class.isAssignableFrom((Class) paramType.getRawType())) { + return (Class) JVMUtils.getParameterizedTypeClass(type); + } else { + for (Type paramtype : paramType.getActualTypeArguments()) { + Class x = getFeatureTypeIfPossible(paramtype); + if (x != null) + return x; + } + } + } + + return null; + } + + /** + * High-level entry point for creating a FreeMarker map describing the GATK argument + * source with definition def, with associated javadoc fieldDoc. 
+ * + * @param fieldDoc + * @param source + * @param def + * @return a non-null Map binding argument keys with their values + */ + protected Map docForArgument(FieldDoc fieldDoc, ArgumentSource source, ArgumentDefinition def) { + Map root = new HashMap(); + Pair names = displayNames(def.shortName, def.fullName); + + root.put("name", names.getFirst()); + + if (names.getSecond() != null) + root.put("synonyms", names.getSecond()); + + root.put("required", def.required ? "yes" : "no"); + + // type of the field + root.put("type", argumentTypeString(source.field.getGenericType())); + + Class featureClass = getFeatureTypeIfPossible(source.field.getGenericType()); + if (featureClass != null) { + // deal with the allowable types + FeatureManager manager = new FeatureManager(); + List rodTypes = new ArrayList(); + for (FeatureManager.FeatureDescriptor descriptor : manager.getByFeature(featureClass)) { + rodTypes.add(String.format("%s", + GATKDocUtils.htmlFilenameForClass(descriptor.getCodecClass()), + descriptor.getName())); + } + + root.put("rodTypes", Utils.join(", ", rodTypes)); + } + + // summary and fulltext + root.put("summary", def.doc != null ? def.doc : ""); + root.put("fulltext", fieldDoc.commentText()); + + // What are our enum options? + if (def.validOptions != null) + root.put("options", docForEnumArgument(source.field.getType())); + + // general attributes + List attributes = new ArrayList(); + if (def.required) attributes.add("required"); + if (source.isDeprecated()) attributes.add("deprecated"); + if (attributes.size() > 0) + root.put("attributes", Utils.join(", ", attributes)); + + return root; + } + + /** + * Helper routine that provides a FreeMarker map for an enumClass, grabbing the + * values of the enum and their associated javadoc documentation. 
+ * + * @param enumClass + * @return + */ + @Requires("enumClass.isEnum()") + private List> docForEnumArgument(final Class enumClass) { + final ClassDoc doc = this.getDoclet().getClassDocForClass(enumClass); + if ( doc == null ) + throw new RuntimeException("Tried to get docs for enum " + enumClass + " but got null instead"); + + final Set enumConstantFieldNames = enumConstantsNames(enumClass); + + final List> bindings = new ArrayList>(); + for (final FieldDoc fieldDoc : doc.fields(false)) { + if (enumConstantFieldNames.contains(fieldDoc.name()) ) + bindings.add( + new HashMap() {{ + put("name", fieldDoc.name()); + put("summary", fieldDoc.commentText()); + }}); + } + + return bindings; + } + + /** + * Returns the name of the fields that are enum constants according to reflection + * + * @return a non-null set of fields that are enum constants + */ + private Set enumConstantsNames(final Class enumClass) { + final Set enumConstantFieldNames = new HashSet(); + + for ( final Field field : enumClass.getFields() ) { + if ( field.isEnumConstant() ) + enumConstantFieldNames.add(field.getName()); + } + + return enumConstantFieldNames; + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/HelpConstants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/HelpConstants.java new file mode 100644 index 000000000..783e7aa90 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/HelpConstants.java @@ -0,0 +1,83 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject 
to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.help; + +public class HelpConstants { + + public final static String BASE_GATK_URL = "http://www.broadinstitute.org/gatk"; + public final static String GATK_DOCS_URL = BASE_GATK_URL + "/gatkdocs/"; + public final static String GATK_FORUM_URL = "http://gatkforums.broadinstitute.org/"; + public final static String GATK_FORUM_API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; + + /** + * Arguments for parallelism options + */ + public final static String ARG_TREEREDUCIBLE = "-nt"; + public final static String ARG_NANOSCHEDULABLE = "-nct"; + public final static String CMDLINE_GATK_URL = GATK_DOCS_URL + "org_broadinstitute_sting_gatk_CommandLineGATK.html"; + + /** + * Definition of the group names / categories of tools. 
+ * The names get parsed to make supercategories in the doc index, + * so be careful when making big changes -- see GATKDoclet.java toMap() + */ + public final static String DOCS_CAT_DATA = "Sequence Data Processing Tools"; + public final static String DOCS_CAT_QC = "Diagnostics and Quality Control Tools"; + public final static String DOCS_CAT_ENGINE = "Engine Parameters (available to all tools)"; + public final static String DOCS_CAT_RF = "Read Filters"; + public final static String DOCS_CAT_REFUTILS = "Reference Utilities"; + public final static String DOCS_CAT_RODCODECS = "ROD Codecs"; + public final static String DOCS_CAT_USRERR = "User Exceptions (DevZone)"; + public final static String DOCS_CAT_VALIDATION = "Validation Utilities"; + public final static String DOCS_CAT_ANNOT = "Variant Annotations"; + public final static String DOCS_CAT_VARDISC = "Variant Discovery Tools"; + public final static String DOCS_CAT_VARMANIP = "Variant Evaluation and Manipulation Tools"; + public final static String DOCS_CAT_TOY = "Toy Walkers (DevZone)"; + public final static String DOCS_CAT_HELPUTILS = "Help Utilities"; + + public static String forumPost(String post) { + return GATK_FORUM_URL + post; + } + + /** + * Go-to developer name codes for tracking and display purposes. Only current team members should be in this list. + * When someone leaves, their charges should be redistributed. The actual string should be closest to the dev's + * abbreviated name or two/three-letter nickname as possible. The code can be something else if necessary to + * disambiguate from other variable. 
+ */ + public final static String MC = "MC"; // Mauricio Carneiro + public final static String EB = "EB"; // Eric Banks + public final static String RP = "RP"; // Ryan Poplin + public final static String GVDA = "GG"; // Geraldine Van der Auwera + public final static String VRR = "VRR"; // Valentin Ruano-Rubio + public final static String ALM = "ALM"; // Ami Levy-Moonshine + public final static String BH = "BH"; // Bertrand Haas + public final static String JoT = "JT"; // Joel Thibault + public final static String DR = "DR"; // David Roazen + public final static String KS = "KS"; // Khalid Shakir + + +} \ No newline at end of file diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/HelpFormatter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/HelpFormatter.java new file mode 100644 index 000000000..f2e3fad4b --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/HelpFormatter.java @@ -0,0 +1,336 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.help; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.text.TextFormattingUtils; + +import java.net.InetAddress; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.*; +/** + * Print out help for Sting command-line applications. + */ + +public class HelpFormatter { + /** our log, which we want to capture anything from org.broadinstitute.sting */ + private static Logger logger = Logger.getLogger(HelpFormatter.class); + + public static final int FIELD_SEPARATION_WIDTH = 3; + + /** + * Prints the help, given a collection of argument definitions. + * @param applicationDetails Application details + * @param argumentDefinitions Argument definitions for which help should be printed. + */ + public void printHelp( ApplicationDetails applicationDetails, ArgumentDefinitions argumentDefinitions ) { + List argumentGroups = prepareArgumentGroups( argumentDefinitions ); + + List header = applicationDetails.applicationHeader; + String barrier = createBarrier(header); + + System.out.printf("%s%n",barrier); + for(String headerLine: header) + System.out.printf("%s%n",headerLine); + System.out.printf("%s%n",barrier); + for(String attributionLine: applicationDetails.attribution) + System.out.printf("%s%n",attributionLine); + System.out.printf("%s%n",barrier); + + String synopsis = getSynopsis(applicationDetails.runningInstructions,argumentGroups); + String additionalDetails = applicationDetails.additionalHelp != null ? 
applicationDetails.additionalHelp : "";
+        String detailedDescription = getDetailed(argumentGroups);
+
+        System.out.printf("%s%n%s%n%s%n",synopsis,detailedDescription,additionalDetails );
+    }
+
+    /**
+     * Gets the synopsis: the actual command to run.
+     * @param runningInstructions Instructions on how to run the application.
+     * @param argumentGroups Program arguments sorted in order of definition group displays.
+     * @return A synopsis line.
+     */
+    private String getSynopsis( String runningInstructions,
+                                List argumentGroups ) {
+        // Build out the synopsis all as one long line.
+        StringBuilder lineBuilder = new StringBuilder();
+        Formatter lineFormatter = new Formatter( lineBuilder );
+
+        lineFormatter.format("java %s", runningInstructions);
+
+        for( ArgumentDefinitionGroup argumentGroup: argumentGroups ) {
+            for( ArgumentDefinition argumentDefinition: argumentGroup.argumentDefinitions ) {
+                if(argumentDefinition.isHidden)
+                    continue;
+                lineFormatter.format(" ");
+                if( !argumentDefinition.required ) lineFormatter.format("[");
+                if( argumentDefinition.shortName != null )
+                    lineFormatter.format("-%s", argumentDefinition.shortName);
+                else
+                    lineFormatter.format("--%s", argumentDefinition.fullName);
+                if( !argumentDefinition.isFlag )
+                    lineFormatter.format(" <%s>", argumentDefinition.fullName);
+                if( !argumentDefinition.required ) lineFormatter.format("]");
+            }
+        }
+
+        // Word wrap the synopsis.
+        List wrappedSynopsis = TextFormattingUtils.wordWrap( lineBuilder.toString(), TextFormattingUtils.DEFAULT_LINE_WIDTH );
+
+        String header = "usage: ";
+        int headerLength = header.length();
+
+        StringBuilder synopsisBuilder = new StringBuilder();
+        Formatter synopsisFormatter = new Formatter(synopsisBuilder);
+        for( String synopsisLine: wrappedSynopsis ) {
+            synopsisFormatter.format("%" + headerLength + "s%s%n", header, synopsisLine);
+            header = "";
+        }
+
+        return synopsisBuilder.toString();
+    }
+
+    /**
+     * Gets detailed output about each argument type.
+ * @param argumentGroups Collection of program arguments sorted according to how they should be shown. + * @return Detailed text about all arguments. + */ + private String getDetailed( List argumentGroups ) { + StringBuilder builder = new StringBuilder(); + + for( ArgumentDefinitionGroup argumentGroup: argumentGroups ) + builder.append( getDetailForGroup( argumentGroup ) ); + + return builder.toString(); + } + + /** + * Gets a detailed description for a given argument group. + * @param argumentDefinitionGroup The group of argument definitions to render. + * @return A string giving detailed info about the contents of this group. + */ + private String getDetailForGroup( ArgumentDefinitionGroup argumentDefinitionGroup ) { + if(argumentDefinitionGroup.allHidden()) + return ""; + + StringBuilder builder = new StringBuilder(); + Formatter formatter = new Formatter( builder ); + + if( argumentDefinitionGroup.groupName != null && argumentDefinitionGroup.argumentDefinitions.size() != 0 ) + builder.append( String.format("%nArguments for %s:%n", argumentDefinitionGroup.groupName ) ); + + List argumentDefinitions = new ArrayList(); + for(ArgumentDefinition argumentDefinition: argumentDefinitionGroup.argumentDefinitions) { + if(!argumentDefinition.isHidden) + argumentDefinitions.add(argumentDefinition); + } + + // Try to fit the entire argument definition across the screen, but impose an arbitrary cap of 3/4 * + // LINE_WIDTH in case the length of the arguments gets out of control. 
+ int argWidth = Math.min( findLongestArgumentCallingInfo(argumentDefinitions), (TextFormattingUtils.DEFAULT_LINE_WIDTH*3)/4 - FIELD_SEPARATION_WIDTH ); + int docWidth = TextFormattingUtils.DEFAULT_LINE_WIDTH - argWidth - FIELD_SEPARATION_WIDTH; + + for( ArgumentDefinition argumentDefinition: argumentDefinitions ) { + Iterator wordWrappedArgs = TextFormattingUtils.wordWrap( getArgumentCallingInfo(argumentDefinition), argWidth ).iterator(); + Iterator wordWrappedDoc = TextFormattingUtils.wordWrap( getArgumentDoc(argumentDefinition), docWidth ).iterator(); + + while( wordWrappedArgs.hasNext() || wordWrappedDoc.hasNext() ) { + String arg = wordWrappedArgs.hasNext() ? wordWrappedArgs.next() : ""; + String doc = wordWrappedDoc.hasNext() ? wordWrappedDoc.next() : ""; + + String formatString = "%-" + argWidth + "s%" + FIELD_SEPARATION_WIDTH + "s%s%n"; + formatter.format( formatString, arg, "", doc ); + } + } + + return builder.toString(); + } + + /** + * Gets a string indicating how this argument should be passed to the application. + * @param argumentDefinition Argument definition for which help should be printed. + * @return Calling information for this argument. + */ + private String getArgumentCallingInfo( ArgumentDefinition argumentDefinition ) { + StringBuilder builder = new StringBuilder(); + Formatter formatter = new Formatter( builder ); + + formatter.format(" "); + if( argumentDefinition.shortName != null ) + formatter.format("-%s,", argumentDefinition.shortName); + formatter.format("--%s", argumentDefinition.fullName); + if( !argumentDefinition.isFlag ) + formatter.format(" <%s>", argumentDefinition.fullName); + + return builder.toString(); + } + + /** + * Gets a string of argument documentation. + * @param argumentDefinition Argument definition for which help should be printed. + * @return Brief description for this argument. 
+ */ + private String getArgumentDoc( ArgumentDefinition argumentDefinition ) { + StringBuilder builder = new StringBuilder(); + builder.append(argumentDefinition.doc); + if( argumentDefinition.validOptions != null ) { + builder.append(" ("); + builder.append(Utils.join("|",argumentDefinition.validOptions)); + builder.append(")"); + } + return builder.toString(); + } + + /** + * Crude implementation which finds the longest argument portion + * given a set of arguments. + * @param argumentDefinitions argument definitions to inspect. + * @return longest argument length. + */ + private int findLongestArgumentCallingInfo( Collection argumentDefinitions ) { + int longest = 0; + for( ArgumentDefinition argumentDefinition: argumentDefinitions ) { + String argumentText = getArgumentCallingInfo( argumentDefinition ); + if( longest < argumentText.length() ) + longest = argumentText.length(); + } + return longest; + } + + /** + * Extract the argument definition groups from the argument definitions and arrange them appropriately. + * For help, we want the arguments sorted as they are declared in the class. However, required arguments + * should appear before optional arguments. + * @param argumentDefinitions Argument definitions from which to extract argument groups. + * @return A list of argument groups sorted in display order. + */ + private List prepareArgumentGroups( ArgumentDefinitions argumentDefinitions ) { + // Sort the list of argument definitions according to how they should be shown. + // Put the sorted results into a new cloned data structure. 
+ Comparator definitionComparator = new Comparator() { + public int compare( ArgumentDefinition lhs, ArgumentDefinition rhs ) { + if( lhs.required && rhs.required ) return 0; + if( lhs.required ) return -1; + if( rhs.required ) return 1; + return 0; + } + }; + + List argumentGroups = new ArrayList(); + for( ArgumentDefinitionGroup argumentGroup: argumentDefinitions.getArgumentDefinitionGroups() ) { + List sortedDefinitions = new ArrayList( argumentGroup.argumentDefinitions ); + Collections.sort( sortedDefinitions, definitionComparator ); + argumentGroups.add( new ArgumentDefinitionGroup(argumentGroup.groupName,sortedDefinitions) ); + } + + // Sort the argument groups themselves with main arguments first, followed by plugins sorted in name order. + Comparator groupComparator = new Comparator() { + public int compare( ArgumentDefinitionGroup lhs, ArgumentDefinitionGroup rhs ) { + if( lhs.groupName == null && rhs.groupName == null ) return 0; + if( lhs.groupName == null ) return -1; + if( rhs.groupName == null ) return 1; + return lhs.groupName.compareTo(rhs.groupName); + } + }; + Collections.sort( argumentGroups, groupComparator ); + + + return argumentGroups; + } + + /** + * generateHeaderInformation + *

+ *

+ * Generate a standard header for the logger + * + * @param applicationDetails details of the application to run. + * @param parsedArgs the arguments passed in + */ + public static void generateHeaderInformation(ApplicationDetails applicationDetails, Map parsedArgs) { + + DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss"); + java.util.Date date = new java.util.Date(); + + String barrier = createBarrier(applicationDetails.applicationHeader); + + logger.info(barrier); + for (String headerLine : applicationDetails.applicationHeader) + logger.info(headerLine); + logger.debug("Current directory: " + System.getProperty("user.dir")); + for (Map.Entry entry: parsedArgs.entrySet()) { + ArgumentMatchSource matchSource = entry.getKey(); + final String sourceName; + switch (matchSource.getType()) { + case CommandLine: sourceName = "Program"; break; + case Provider: sourceName = matchSource.getDescription(); break; + default: throw new RuntimeException("Unexpected argument match source type: " + matchSource.getType()); + } + + String output = sourceName + " Args: " + entry.getValue().getDescription(); + logger.info(output); + } + logger.info(generateUserHelpData()); + logger.info("Date/Time: " + dateFormat.format(date)); + logger.info(barrier); + + for(String attribution: applicationDetails.attribution) + logger.info(attribution); + logger.info(barrier); + } + + /** + * Create the user-related help information. + * @return a non-null, non-empty String with the relevant information. 
+ */ + private static String generateUserHelpData() { + try { + return "Executing as " + + System.getProperty("user.name") + "@" + InetAddress.getLocalHost().getHostName() + + " on " + System.getProperty("os.name") + " " + System.getProperty("os.version") + + " " + System.getProperty("os.arch") + "; " + System.getProperty("java.vm.name") + + " " + System.getProperty("java.runtime.version") + "."; + } catch (Exception e) { + // don't fail + return ""; + } + } + + /** + * Create a barrier to use to distinguish the header from the rest of the output. + * @param text A collection of lines to output as part of a header. + * @return A barrier consisting of the '-' character. + */ + private static String createBarrier(List text) { + int barrierWidth = 0; + for(String headerLine: text) + barrierWidth = Math.max(headerLine.length(),barrierWidth); + return String.format("%0" + barrierWidth + "d",0).replace('0','-'); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/HelpUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/HelpUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java diff --git a/public/java/src/org/broadinstitute/sting/utils/instrumentation/Sizeof.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/instrumentation/Sizeof.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/utils/instrumentation/Sizeof.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/instrumentation/Sizeof.java diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalMergingRule.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/interval/IntervalMergingRule.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/interval/IntervalMergingRule.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/interval/IntervalMergingRule.java diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalSetRule.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/interval/IntervalSetRule.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/interval/IntervalSetRule.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/interval/IntervalSetRule.java diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/interval/IntervalUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/interval/IntervalUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/io/FileExtension.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/io/FileExtension.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/io/FileExtension.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/io/FileExtension.java diff --git a/public/java/src/org/broadinstitute/sting/utils/io/HardThresholdingOutputStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/io/HardThresholdingOutputStream.java similarity 
index 100% rename from public/java/src/org/broadinstitute/sting/utils/io/HardThresholdingOutputStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/io/HardThresholdingOutputStream.java diff --git a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/io/IOUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/io/IOUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/io/Resource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/io/Resource.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/io/Resource.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/io/Resource.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java new file mode 100644 index 000000000..c4b566582 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java @@ -0,0 +1,370 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.locusiterator; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * Steps a single read along its alignment to the genome + * + * The logical model for generating extended events is as follows: the "record state" + * implements the traversal along the reference; thus stepForwardOnGenome() returns + * on every and only on actual reference bases. This can be a (mis)match or a deletion + * (in the latter case, we still return on every individual reference base the deletion spans). 
+ * + * User: depristo + * Date: 1/5/13 + * Time: 1:08 PM + */ +@Invariant({ + "nCigarElements >= 0", + "cigar != null", + "read != null", + "currentCigarElementOffset >= -1", + "currentCigarElementOffset <= nCigarElements" +}) +public class AlignmentStateMachine { + /** + * Our read + */ + private final GATKSAMRecord read; + private final Cigar cigar; + private final int nCigarElements; + private int currentCigarElementOffset = -1; + + /** + * how far are we offset from the start of the read bases? + */ + private int readOffset; + + /** + * how far are we offset from the alignment start on the genome? + */ + private int genomeOffset; + + /** + * Our cigar element + */ + private CigarElement currentElement; + + /** + * how far are we into our cigarElement? + */ + private int offsetIntoCurrentCigarElement; + + @Requires({"read != null", "read.getAlignmentStart() != -1", "read.getCigar() != null"}) + public AlignmentStateMachine(final GATKSAMRecord read) { + this.read = read; + this.cigar = read.getCigar(); + this.nCigarElements = cigar.numCigarElements(); + initializeAsLeftEdge(); + } + + /** + * Initialize the state variables to put this machine one bp before the + * start of the alignment, so that a call to stepForwardOnGenome() will advance + * us to the first proper location + */ + @Ensures("isLeftEdge()") + private void initializeAsLeftEdge() { + readOffset = offsetIntoCurrentCigarElement = genomeOffset = -1; + currentElement = null; + } + + /** + * Get the read we are aligning to the genome + * @return a non-null GATKSAMRecord + */ + @Ensures("result != null") + public GATKSAMRecord getRead() { + return read; + } + + /** + * Get the reference index of the underlying read + * + * @return the reference index of the read + */ + @Ensures("result == getRead().getReferenceIndex()") + public int getReferenceIndex() { + return getRead().getReferenceIndex(); + } + + /** + * Is this the left edge state? I.e., one that is before or after the current read? 
+ * @return true if this state is an edge state, false otherwise + */ + public boolean isLeftEdge() { + return readOffset == -1; + } + + /** + * Are we on the right edge? I.e., is the current state off the right of the alignment? + * @return true if off the right edge, false if otherwise + */ + public boolean isRightEdge() { + return readOffset == read.getReadLength(); + } + + /** + * What is our current offset in the read's bases that aligns us with the reference genome? + * + * @return the current read offset position. If an edge will be == -1 + */ + @Ensures("result >= -1") + public int getReadOffset() { + return readOffset; + } + + /** + * What is the current offset w.r.t. the alignment state that aligns us to the readOffset? + * + * @return the current offset from the alignment start on the genome. If this state is + * at the left edge the result will be -1; + */ + @Ensures("result >= -1") + public int getGenomeOffset() { + return genomeOffset; + } + + /** + * Get the position (1-based as standard) of the current alignment on the genome w.r.t. the read's alignment start + * @return the position on the genome of the current state in absolute coordinates + */ + @Ensures("result > 0") + public int getGenomePosition() { + return read.getAlignmentStart() + getGenomeOffset(); + } + + /** + * Gets #getGenomePosition but as a 1 bp GenomeLoc + * @param genomeLocParser the parser to use to create the genome loc + * @return a non-null genome location with start position of getGenomePosition + */ + @Requires("genomeLocParser != null") + @Ensures("result != null") + public GenomeLoc getLocation(final GenomeLocParser genomeLocParser) { + // TODO -- may return wonky results if on an edge (could be 0 or could be beyond genome location) + return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition()); + } + + /** + * Get the cigar element we're currently aligning with. 
+ * + * For example, if the cigar string is 2M2D2M and we're in the second step of the + * first 2M, then this function returns the element 2M. After calling stepForwardOnGenome + * this function would return 2D. + * + * @return the cigar element, or null if we're the left edge + */ + @Ensures("result != null || isLeftEdge() || isRightEdge()") + public CigarElement getCurrentCigarElement() { + return currentElement; + } + + /** + * Get the offset of the current cigar element among all cigar elements in the read + * + * Suppose our read's cigar is 1M2D3M, and we're at the first 1M. This would + * return 0. Stepping forward puts us in the 2D, so our offset is 1. Another + * step forward would result in a 1 again (we're in the second position of the 2D). + * Finally, one more step forward brings us to 2 (for the 3M element) + * + * @return the offset of the current cigar element in the reads's cigar. Will return -1 for + * when the state is on the left edge, and be == the number of cigar elements in the + * read when we're past the last position on the genome + */ + @Ensures({"result >= -1", "result <= nCigarElements"}) + public int getCurrentCigarElementOffset() { + return currentCigarElementOffset; + } + + /** + * Get the offset of the current state into the current cigar element + * + * That is, suppose we have a read with cigar 2M3D4M, and we're right at + * the second M position. offsetIntoCurrentCigarElement would be 1, as + * it's two elements into the 2M cigar. Now stepping forward we'd be + * in cigar element 3D, and our offsetIntoCurrentCigarElement would be 0. + * + * @return the offset (from 0) of the current state in the current cigar element. + * Will be 0 on the right edge, and -1 on the left. 
+ */ + @Ensures({"result >= 0 || (result == -1 && isLeftEdge())", "!isRightEdge() || result == 0"}) + public int getOffsetIntoCurrentCigarElement() { + return offsetIntoCurrentCigarElement; + } + + /** + * Convenience accessor of the CigarOperator of the current cigar element + * + * Robust to the case where we're on the edge, and currentElement is null, in which + * case this function returns null as well + * + * @return null if this is an edge state + */ + @Ensures("result != null || isLeftEdge() || isRightEdge()") + public CigarOperator getCigarOperator() { + return currentElement == null ? null : currentElement.getOperator(); + } + + @Override + public String toString() { + return String.format("%s ro=%d go=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, offsetIntoCurrentCigarElement, currentElement); + } + + // ----------------------------------------------------------------------------------------------- + // + // Code for setting up prev / next states + // + // ----------------------------------------------------------------------------------------------- + + /** + * Step the state machine forward one unit + * + * Takes the current state of this machine, and advances the state until the next on-genome + * cigar element (M, X, =, D) is encountered, at which point this function returns with the + * cigar operator of the current element. + * + * Assumes that the AlignmentStateMachine is in the left edge state at the start, so that + * stepForwardOnGenome() can be called to move the machine to the first alignment position. 
That + * is, the normal use of this code is: + * + * AlignmentStateMachine machine = new AlignmentStateMachine(read) + * machine.stepForwardOnGenome() + * // now the machine is at the first position on the genome + * + * When stepForwardOnGenome() advances off the right edge of the read, the state machine is + * left in a state such that isRightEdge() returns true and returns null, indicating the + * the machine cannot advance further. The machine may explode, though this is not contracted, + * if stepForwardOnGenome() is called after a previous call returned null. + * + * @return the operator of the cigar element that machine stopped at, null if we advanced off the end of the read + */ + @Ensures("result != null || isRightEdge()") + public CigarOperator stepForwardOnGenome() { + // loop until we either find a cigar element step that moves us one base on the genome, or we run + // out of cigar elements + while ( true ) { + // we enter this method with readOffset = index of the last processed base on the read + // (-1 if we did not process a single base yet); this can be last matching base, + // or last base of an insertion + if (currentElement == null || (offsetIntoCurrentCigarElement + 1) >= currentElement.getLength()) { + currentCigarElementOffset++; + if (currentCigarElementOffset < nCigarElements) { + currentElement = cigar.getCigarElement(currentCigarElementOffset); + offsetIntoCurrentCigarElement = -1; + // next line: guards against cigar elements of length 0; when new cigar element is retrieved, + // we reenter in order to re-check offsetIntoCurrentCigarElement against currentElement's length + continue; + } else { + if (currentElement != null && currentElement.getOperator() == CigarOperator.D) + throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. 
If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); + + // we're done, so set the offset of the cigar to 0 for cleanliness, as well as the current element + offsetIntoCurrentCigarElement = 0; + readOffset = read.getReadLength(); + currentElement = null; + + // Reads that contain indels model the genomeOffset as the following base in the reference. Because + // we fall into this else block only when indels end the read, increment genomeOffset such that the + // current offset of this read is the next ref base after the end of the indel. This position will + // model a point on the reference somewhere after the end of the read. + genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here: + + // we do step forward on the ref, and by returning null we also indicate that we are past the read end. + return null; + } + } + + offsetIntoCurrentCigarElement++; + boolean done = false; + switch (currentElement.getOperator()) { + case H: // ignore hard clips + case P: // ignore pads + offsetIntoCurrentCigarElement = currentElement.getLength(); + break; + case I: // insertion w.r.t. the reference + case S: // soft clip + offsetIntoCurrentCigarElement = currentElement.getLength(); + readOffset += currentElement.getLength(); + break; + case D: // deletion w.r.t. the reference + if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string + throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. 
If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); + // should be the same as N case + genomeOffset++; + done = true; + break; + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + genomeOffset++; + done = true; + break; + case M: + case EQ: + case X: + readOffset++; + genomeOffset++; + done = true; + break; + default: + throw new IllegalStateException("Case statement didn't deal with cigar op: " + currentElement.getOperator()); + } + + if ( done ) + return currentElement.getOperator(); + } + } + + /** + * Create a new PileupElement based on the current state of this element + * + * Must not be a left or right edge + * + * @return a pileup element + */ + @Ensures("result != null") + public final PileupElement makePileupElement() { + if ( isLeftEdge() || isRightEdge() ) + throw new IllegalStateException("Cannot make a pileup element from an edge alignment state"); + return new PileupElement(read, + getReadOffset(), + getCurrentCigarElement(), + getCurrentCigarElementOffset(), + getOffsetIntoCurrentCigarElement()); + } +} + diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSDownsamplingInfo.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/LIBSDownsamplingInfo.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSDownsamplingInfo.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/LIBSDownsamplingInfo.java diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/LocusIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/LocusIterator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/EOFMarkedValue.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/EOFMarkedValue.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/EOFMarkedValue.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/EOFMarkedValue.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResultsQueue.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/MapResultsQueue.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResultsQueue.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/MapResultsQueue.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java 
similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java diff --git a/public/java/src/org/broadinstitute/sting/utils/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/BatchPairHMM.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/BatchPairHMM.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/pairhmm/BatchPairHMM.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/BatchPairHMM.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java new file mode 100644 index 000000000..0ee08e560 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java @@ -0,0 +1,220 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.Arrays; + +import static java.lang.Math.log10; +import static org.broadinstitute.sting.utils.pairhmm.PairHMMModel.*; + +/** + * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. + * + * User: rpoplin, carneiro + * Date: 3/1/12 + */ +public class Log10PairHMM extends N2MemoryPairHMM { + /** + * Should we use exact log10 calculation (true), or an approximation (false)? + */ + private final boolean doExactLog10; + + + // we divide e by 3 because the observed base could have come from any of the non-observed alleles + protected final static double log10_3 = log10(3.0); + + /** + * Create an uninitialized PairHMM + * + * @param doExactLog10 should the log10 calculations be exact (slow) or approximate (faster) + */ + public Log10PairHMM(final boolean doExactLog10) { + this.doExactLog10 = doExactLog10; + } + + /** + * Is this HMM using exact log10 calculations? 
+ * @return true if exact, false if approximate + */ + public boolean isDoingExactLog10Calculations() { + return doExactLog10; + } + + /** + * {@inheritDoc} + */ + @Override + public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { + super.initialize(readMaxLength, haplotypeMaxLength); + + for( int iii=0; iii < paddedMaxReadLength; iii++ ) { + Arrays.fill(matchMatrix[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY); + } + } + + /** + * {@inheritDoc} + */ + @Override + public double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues, + final int nextHapStartIndex) { + + + if ( ! constantsAreInitialized || recacheReadValues ) + initializeProbabilities(insertionGOP, deletionGOP, overallGCP); + initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); + if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) { + // set the initial value (free deletions in the beginning) for the first row in the deletion matrix + initializeMatrixValues(haplotypeBases); + } + + for (int i = 1; i < paddedReadLength; i++) { + // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based + for (int j = hapStartIndex+1; j < paddedHaplotypeLength; j++) { + updateCell(i, j, prior[i][j], transition[i]); + } + } + + // final probability is the log10 sum of the last element in the Match and Insertion state arrays + // this way we ignore all paths that ended in deletions! (huge) + // but we have to sum all the paths ending in the M and I matrices, because they're no longer extended. 
+ return finalLikelihoodCalculation(); + } + + protected void initializeMatrixValues(final byte[] haplotypeBases) { + final double initialValue = Math.log10(1.0 / haplotypeBases.length); + for( int j = 0; j < paddedHaplotypeLength; j++ ) { + deletionMatrix[0][j] = initialValue; + } + } + + protected double finalLikelihoodCalculation() { + final int endI = paddedReadLength - 1; + double finalSumProbabilities = myLog10SumLog10(new double[]{matchMatrix[endI][1], insertionMatrix[endI][1]}); + for (int j = 2; j < paddedHaplotypeLength; j++) + finalSumProbabilities = myLog10SumLog10(new double[]{finalSumProbabilities, matchMatrix[endI][j], insertionMatrix[endI][j]}); + return finalSumProbabilities; + } + + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. + + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProbLog10(qual) : (QualityUtils.qualToErrorProbLog10(qual) - (doNotUseTristateCorrection ? 0.0 : log10_3)) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. 
+ * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + @Requires({ + "insertionGOP != null", + "deletionGOP != null", + "overallGCP != null" + }) + @Ensures("constantsAreInitialized") + protected void initializeProbabilities(final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { + PairHMMModel.qualToTransProbsLog10(transition,insertionGOP,deletionGOP,overallGCP); + // note that we initialized the constants + constantsAreInitialized = true; + } + + + /** + * Compute the log10SumLog10 of the values + * + * NOTE NOTE NOTE + * + * Log10PairHMM depends critically on this function tolerating values that are all -Infinity + * and the sum returning -Infinity. Note good. Needs to be fixed. + * + * NOTE NOTE NOTE + * + * @param values an array of log10 probabilities that need to be summed + * @return the log10 of the sum of the probabilities + */ + @Requires("values != null") + protected double myLog10SumLog10(final double[] values) { + return doExactLog10 ? 
MathUtils.log10sumLog10(values) : MathUtils.approximateLog10SumLog10(values); + } + + /** + * Updates a cell in the HMM matrix + * + * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the + * initial conditions + + * @param indI row index in the matrices to update + * @param indJ column index in the matrices to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param transition an array with the six transition relevant to this location + */ + protected void updateCell( final int indI, final int indJ, final double prior, final double[] transition) { + + matchMatrix[indI][indJ] = prior + + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ - 1] + transition[matchToMatch], + insertionMatrix[indI - 1][indJ - 1] + transition[indelToMatch], + deletionMatrix[indI - 1][indJ - 1] + transition[indelToMatch]}); + insertionMatrix[indI][indJ] = myLog10SumLog10(new double[] {matchMatrix[indI - 1][indJ] + transition[matchToInsertion], insertionMatrix[indI - 1][indJ] + transition[insertionToInsertion]}); + deletionMatrix[indI][indJ] = myLog10SumLog10(new double[] {matchMatrix[indI][indJ - 1] + transition[matchToDeletion], deletionMatrix[indI][indJ - 1] + transition[deletionToDeletion]}); + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java new file mode 100644 index 000000000..057c67a55 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java @@ -0,0 +1,97 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, 
modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Requires; + +/** + * Superclass for PairHMM that want to use a full read x haplotype matrix for their match, insertion, and deletion matrix + * + * User: rpoplin + * Date: 10/16/12 + */ +abstract class N2MemoryPairHMM extends PairHMM { + protected double[][] transition = null; // The transition probabilities cache + protected double[][] prior = null; // The prior probabilities cache + protected double[][] matchMatrix = null; + protected double[][] insertionMatrix = null; + protected double[][] deletionMatrix = null; + + // only used for debugging purposes + protected boolean doNotUseTristateCorrection = false; + + public void doNotUseTristateCorrection() { + doNotUseTristateCorrection = true; + } + + /** + * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths + * + * Note: Do not worry about padding, just provide the true max length of the read and haplotype. The HMM will take care of the padding. 
+ * + * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM + * @param readMaxLength the max length of reads we want to use with this PairHMM + */ + public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { + super.initialize(readMaxLength, haplotypeMaxLength); + + matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + + transition = PairHMMModel.createTransitionMatrix(maxReadLength); + prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + } + + /** + * Print out the core hmm matrices for debugging + */ + protected void dumpMatrices() { + dumpMatrix("matchMetricArray", matchMatrix); + dumpMatrix("insertionMatrix", insertionMatrix); + dumpMatrix("deletionMatrix", deletionMatrix); + } + + /** + * Print out in a human readable form the matrix for debugging + * @param name the name of this matrix + * @param matrix the matrix of values + */ + @Requires({"name != null", "matrix != null"}) + private void dumpMatrix(final String name, final double[][] matrix) { + System.out.printf("%s%n", name); + for ( int i = 0; i < matrix.length; i++) { + System.out.printf("\t%s[%d]", name, i); + for ( int j = 0; j < matrix[i].length; j++ ) { + if ( Double.isInfinite(matrix[i][j]) ) + System.out.printf(" %15s", String.format("%f", matrix[i][j])); + else + System.out.printf(" % 15.5e", matrix[i][j]); + } + System.out.println(); + } + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMM.java new file mode 100644 index 000000000..5762b33ba --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -0,0 +1,273 @@ +/* +* Copyright (c) 2012 The Broad 
Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; +/** + * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. 
+ * + * User: rpoplin + * Date: 10/16/12 + */ +public abstract class PairHMM { + protected final static Logger logger = Logger.getLogger(PairHMM.class); + + protected boolean constantsAreInitialized = false; + + protected byte[] previousHaplotypeBases; + protected int hapStartIndex; + + public enum HMM_IMPLEMENTATION { + /* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */ + EXACT, + /* PairHMM as implemented for the UnifiedGenotyper. Uses log10 sum functions accurate to only 1E-4 */ + ORIGINAL, + /* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */ + LOGLESS_CACHING, + /* Logless caching PairHMM that stores computations in 1D arrays instead of matrices, and which proceeds diagonally over the (read x haplotype) intersection matrix */ + ARRAY_LOGLESS + } + + protected int maxHaplotypeLength, maxReadLength; + protected int paddedMaxReadLength, paddedMaxHaplotypeLength; + protected int paddedReadLength, paddedHaplotypeLength; + protected boolean initialized = false; + + // only used for debugging purposes + protected boolean doNotUseTristateCorrection = false; + protected void doNotUseTristateCorrection() { doNotUseTristateCorrection = true; } + + /** + * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths + * + * Note: Do not worry about padding, just provide the true max length of the read and haplotype. The HMM will take care of the padding. 
+ * + * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM + * @param readMaxLength the max length of reads we want to use with this PairHMM + */ + public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { + if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength); + if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength); + + maxHaplotypeLength = haplotypeMaxLength; + maxReadLength = readMaxLength; + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + paddedMaxReadLength = readMaxLength + 1; + paddedMaxHaplotypeLength = haplotypeMaxLength + 1; + + previousHaplotypeBases = null; + + constantsAreInitialized = false; + initialized = true; + } + + protected int findMaxReadLength(final List reads) { + int listMaxReadLength = 0; + for(GATKSAMRecord read : reads){ + final int readLength = read.getReadLength(); + if( readLength > listMaxReadLength ) { listMaxReadLength = readLength; } + } + return listMaxReadLength; + } + + protected int findMaxHaplotypeLength(final Map haplotypeMap) { + int listMaxHaplotypeLength = 0; + for( final Allele a: haplotypeMap.keySet() ) { + final Haplotype h = haplotypeMap.get(a); + final int haplotypeLength = h.getBases().length; + if( haplotypeLength > listMaxHaplotypeLength ) { listMaxHaplotypeLength = haplotypeLength; } + } + return listMaxHaplotypeLength; + } + + /** + * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from + * each haplotype given base substitution, insertion, and deletion probabilities. 
+ * + * @param reads the list of reads + * @param alleleHaplotypeMap the list of haplotypes + * @param GCPArrayMap Each read is associated with an array containing the gap continuation penalties for use in the model. Length of each GCP-array must match that of its read. + * @return a PerReadAlleleLikelihoodMap containing each read, haplotype-allele, and the log10 probability of + * said read coming from the said haplotype under the provided error model + */ + public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap) { + + // (re)initialize the pairHMM only if necessary + final int readMaxLength = findMaxReadLength(reads); + final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); + if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); } + + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + for(GATKSAMRecord read : reads){ + final byte[] readBases = read.getReadBases(); + final byte[] readQuals = read.getBaseQualities(); + final byte[] readInsQuals = read.getBaseInsertionQualities(); + final byte[] readDelQuals = read.getBaseDeletionQualities(); + final byte[] overallGCP = GCPArrayMap.get(read); + + // peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) + byte[] currentHaplotypeBases = null; + boolean isFirstHaplotype = true; + Allele currentAllele = null; + double log10l; + for (final Allele allele : alleleHaplotypeMap.keySet()){ + final Haplotype haplotype = alleleHaplotypeMap.get(allele); + final byte[] nextHaplotypeBases = haplotype.getBases(); + if (currentHaplotypeBases != null) { + log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, + readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases); + 
likelihoodMap.add(read, currentAllele, log10l); + } + // update the current haplotype + currentHaplotypeBases = nextHaplotypeBases; + currentAllele = allele; + } + // process the final haplotype + if (currentHaplotypeBases != null) { + + // there is no next haplotype, so pass null for nextHaplotypeBases. + log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, + readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null); + likelihoodMap.add(read, currentAllele, log10l); + } + } + return likelihoodMap; + } + + /** + * Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion + * probabilities. + * + * Note on using hapStartIndex. This allows you to compute the exact true likelihood of a full haplotypes + * given a read, assuming that the previous calculation read over a full haplotype, recaching the read values, + * starting only at the place where the new haplotype bases and the previous haplotype bases different. This + * index is 0-based, and can be computed with findFirstPositionWhereHaplotypesDiffer given the two haplotypes. + * Note that this assumes that the read and all associated quals values are the same. + * + * @param haplotypeBases the full sequence (in standard SAM encoding) of the haplotype, must be >= than read bases in length + * @param readBases the bases (in standard encoding) of the read, must be <= haplotype bases in length + * @param readQuals the phred-scaled per base substitution quality scores of read. Must be the same length as readBases + * @param insertionGOP the phred-scaled per base insertion quality scores of read. Must be the same length as readBases + * @param deletionGOP the phred-scaled per base deletion quality scores of read. Must be the same length as readBases + * @param overallGCP the phred-scaled gap continuation penalties scores of read. 
Must be the same length as readBases + * @param recacheReadValues if false, we don't recalculate any cached results, assuming that readBases and its associated + * parameters are the same, and only the haplotype bases are changing underneath us + * @return the log10 probability of read coming from the haplotype under the provided error model + */ + protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final boolean recacheReadValues, + final byte[] nextHaploytpeBases) { + + if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10"); + if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null"); + if ( haplotypeBases.length > maxHaplotypeLength ) throw new IllegalArgumentException("Haplotype bases is too long, got " + haplotypeBases.length + " but max is " + maxHaplotypeLength); + if ( readBases == null ) throw new IllegalArgumentException("readBases cannot be null"); + if ( readBases.length > maxReadLength ) throw new IllegalArgumentException("readBases is too long, got " + readBases.length + " but max is " + maxReadLength); + if ( readQuals.length != readBases.length ) throw new IllegalArgumentException("Read bases and read quals aren't the same size: " + readBases.length + " vs " + readQuals.length); + if ( insertionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read insertion quals aren't the same size: " + readBases.length + " vs " + insertionGOP.length); + if ( deletionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read deletion quals aren't the same size: " + readBases.length + " vs " + deletionGOP.length); + if ( overallGCP.length != readBases.length ) throw new IllegalArgumentException("Read bases 
and overall GCP aren't the same size: " + readBases.length + " vs " + overallGCP.length); + + paddedReadLength = readBases.length + 1; + paddedHaplotypeLength = haplotypeBases.length + 1; + + hapStartIndex = (recacheReadValues) ? 0 : hapStartIndex; + + // Pre-compute the difference between the current haplotype and the next one to be run + // Looking ahead is necessary for the ArrayLoglessPairHMM implementation + final int nextHapStartIndex = (nextHaploytpeBases == null || haplotypeBases.length != nextHaploytpeBases.length) ? 0 : findFirstPositionWhereHaplotypesDiffer(haplotypeBases, nextHaploytpeBases); + + double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues, nextHapStartIndex); + + if ( result > 0.0) + throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f, PairHMM: %s", new String(haplotypeBases), new String(readBases), result, this.getClass().getSimpleName())); + else if (!MathUtils.goodLog10Probability(result)) + throw new IllegalStateException("Invalid Log Probability: " + result); + + // Warning: Careful if using the PairHMM in parallel! (this update has to be taken care of). + // Warning: This assumes no downstream modification of the haplotype bases (saves us from copying the array). It is okay for the haplotype caller and the Unified Genotyper. + previousHaplotypeBases = haplotypeBases; + + // For the next iteration, the hapStartIndex for the next haploytpe becomes the index for the current haplotype + // The array implementation has to look ahead to the next haplotype to store caching info. It cannot do this if nextHapStart is before hapStart + hapStartIndex = (nextHapStartIndex < hapStartIndex) ? 
0: nextHapStartIndex; + + return result; + } + + /** + * To be overloaded by subclasses to actually do calculation for #computeReadLikelihoodGivenHaplotypeLog10 + */ + @Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length", + "readBases.length == overallGCP.length", "matchMatrix!=null", "insertionMatrix!=null", "deletionMatrix!=null"}) + protected abstract double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues, + final int nextHapStartIndex); + + /** + * Compute the first position at which two haplotypes differ + * + * If the haplotypes are exact copies of each other, returns the min length of the two haplotypes. + * + * @param haplotype1 the first haplotype1 + * @param haplotype2 the second haplotype1 + * @return the index of the first position in haplotype1 and haplotype2 where the byte isn't the same + */ + public static int findFirstPositionWhereHaplotypesDiffer(final byte[] haplotype1, final byte[] haplotype2) { + if ( haplotype1 == null || haplotype1.length == 0 ) throw new IllegalArgumentException("Haplotype1 is bad " + Arrays.toString(haplotype1)); + if ( haplotype2 == null || haplotype2.length == 0 ) throw new IllegalArgumentException("Haplotype2 is bad " + Arrays.toString(haplotype2)); + + for( int iii = 0; iii < haplotype1.length && iii < haplotype2.length; iii++ ) { + if( haplotype1[iii] != haplotype2[iii] ) { + return iii; + } + } + + return Math.min(haplotype1.length, haplotype2.length); + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMMModel.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMMModel.java new file mode 100644 index 000000000..551be676a --- /dev/null 
+++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMMModel.java @@ -0,0 +1,435 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +/** + * Helper class that implement calculations required to implement the PairHMM Finite State Automation (FSA) model. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class PairHMMModel { + + + /** + * Prevents instantiation of this class + */ + private PairHMMModel() { + + } + + /** + * Length of the standard transition probability array. + */ + public static final int TRANS_PROB_ARRAY_LENGTH = 6; + + /** + * Position in the transition probability array for the Match-to-Match transition. 
+ */ + public static final int matchToMatch = 0; + + /** + * Position in the transition probability array for the Indel-to-Match transition. + */ + public static final int indelToMatch = 1; + + /** + * Position in the transition probability array for the Match-to-Insertion transition. + */ + public static final int matchToInsertion = 2; + + /** + * Position in the transition probability array for the Insertion-to-Insertion transition. + */ + public static final int insertionToInsertion = 3; + + /** + * Position in the transition probability array for the Match-to-Deletion transition. + */ + public static final int matchToDeletion = 4; + + /** + * Position in the transition probability array for the Deletion-to-Deletion transition. + */ + public static final int deletionToDeletion = 5; + + /** + * Convenient ln10 constant. + */ + private static double LN10 = Math.log(10); + + /** + * Convenient (ln10)^-1 constant. + */ + private static double INV_LN10 = 1.0 / LN10; + + /** + * Holds pre-calculated the matchToMath probability values in linear scale. + * + *

+ * This is a triangular matrix stored in a unidimensional array like so: + *

+ * (0,0), (0,1), (1,1), (0,2), (1,2), (2,2), (0,3) ... ({@link QualityUtils#MAX_QUAL},{@link QualityUtils#MAX_QUAL}) + */ + private static double[] matchToMatchProb = new double[((QualityUtils.MAX_QUAL + 1) * (QualityUtils.MAX_QUAL + 2)) >> 1]; + + /** + * Holds the pre-calculated matchToMatch probability values in log10 scale. + * + *

+ * This is a triangular matrix stored in a unidimensional array like so: + *

+ * (0,0), (0,1), (1,1), (0,2), (1,2), (2,2), (0,3) ... ({@link QualityUtils#MAX_QUAL},{@link QualityUtils#MAX_QUAL}) + */ + private static double[] matchToMatchLog10 = new double[((QualityUtils.MAX_QUAL + 1) * (QualityUtils.MAX_QUAL + 2)) >> 1]; + + /** + * Initialize matchToMatch cache tables {@link #matchToMatch} and {@link #matchToMatchLog10} + */ + static { + for (int i = 0, offset = 0; i <= QualityUtils.MAX_QUAL; offset += ++i) + for (int j = 0; j <= i; j++) { + final double log10Sum = MathUtils.approximateLog10SumLog10(-0.1 * i,-0.1 * j); + matchToMatchLog10[offset + j] = + Math.log1p( - Math.min(1,Math.pow(10,log10Sum))) * INV_LN10; + matchToMatchProb[offset + j] = Math.pow(10,matchToMatchLog10[offset + j]); + } + } + + /** + * Fills a transition probability array given the different quality scores affecting a read site + * + * @param insQual the insertion quality score as a byte. + * @param delQual the deletion quality score as a byte. + * @param gcp the gap-continuation-penalty score as a byte. + * + * @throws NullPointerException if {@code dest} is {@code null}. + * @throws ArrayIndexOutOfBoundsException if {@code dest} is not large enough. + * @throws IllegalArgumentException if {@code insQual}, {@code delQual} or {@code gcp} is less than negative. 
+ */ + public static void qualToTransProbs(final double[] dest, final byte insQual, final byte delQual, final byte gcp) { + if (insQual < 0) throw new IllegalArgumentException("insert quality cannot less than 0: " + insQual); + if (delQual < 0) throw new IllegalArgumentException("deletion quality cannot be less than 0: " + delQual); + if (gcp < 0) throw new IllegalArgumentException("gcp cannot be less than 0: " + gcp); + dest[matchToMatch] = matchToMatchProb(insQual, delQual); + dest[matchToInsertion] = QualityUtils.qualToErrorProb(insQual); + dest[matchToDeletion] = QualityUtils.qualToErrorProb(delQual); + dest[indelToMatch] = QualityUtils.qualToProb(gcp); + dest[insertionToInsertion] = dest[deletionToDeletion] = QualityUtils.qualToErrorProb(gcp); + } + + /** + * Returns a transition probability array given the different quality scores affecting a read site. + * + * @param insQual the insertion quality score as a byte. + * @param delQual the deletion quality score as a byte. + * @param gcp the gap-continuation-penalty score as a byte. + * + * @throws NullPointerException if {@code dest} is {@code null}. + * @throws ArrayIndexOutOfBoundsException if {@code dest} is not large enough. + * @throws IllegalArgumentException if {@code insQual}, {@code delQual} or {@code gcp} is less than negative. + * + * @return never {@code null}. An array of length {@link #TRANS_PROB_ARRAY_LENGTH}. + */ + @SuppressWarnings("unused") + public static double[] qualToTransProbs(final byte insQual, final byte delQual, final byte gcp) { + final double[] dest = new double[TRANS_PROB_ARRAY_LENGTH]; + qualToTransProbs(dest,insQual,delQual,gcp); + return dest; + } + + /** + * Fills ax matrix with the transition probabilities for a number of bases. + * + *

+ * The first dimension of the matrix corresponds to the different bases where the first one is stored in position 1. + * Thus the position 0 is left empty and the length of the resulting matrix is actually {@code insQual.length + 1}. + *

+ * Each entry is the transition probability array for that base with a length of {@link #TRANS_PROB_ARRAY_LENGTH}. + * + * @param dest the matrix to update + * @param insQuals insertion qualities. + * @param delQuals deletion qualities. + * @param gcps gap-continuation penalty qualities. + * + * @throws NullPointerException if any of the input arrays, matrices is {@code null} or any entry in {@code dest} is {@code null}. + * @throws IllegalArgumentException if {@code IllegalArgumentException} + * if the input array don't have the same length. + * @throws ArrayIndexOutOfBoundsException if {@code dest} or any of its elements is not large enough to contain the + * transition matrix. + */ + @SuppressWarnings("unused") + public static void qualToTransProbs(final double[][] dest, final byte[] insQuals, final byte[] delQuals, final byte[] gcps) { + final int readLength = insQuals.length; + if (delQuals.length != readLength) throw new IllegalArgumentException("deletion quality array length does not match insert quality array length: " + readLength + " != " + delQuals.length); + if (gcps.length != readLength) throw new IllegalArgumentException("deletion quality array length does not match insert quality array length: " + readLength + " != " + gcps.length); + + if (dest.length < readLength + 1) throw new IllegalArgumentException("destination length is not enough for the read length: " + dest.length + " < " + readLength + " + 1"); + + for (int i = 0; i < readLength; i++) + qualToTransProbs(dest[i + 1], insQuals[i], delQuals[i], gcps[i]); + } + + /** + * Returns a matrix with the transition probabilities for a number of bases. + * + *

+ * The first dimension of the matrix corresponds to the different bases where the first one is stored in position 1. + * Thus the position 0 is left empty and the length of the resulting matrix is actually {@code insQual.length + 1}. + *

+ * Each entry is the transition probability array for that base with a length of {@link #TRANS_PROB_ARRAY_LENGTH}. + * + * @param insQuals insertion qualities. + * @param delQuals deletion qualities. + * @param gcps gap-continuation penalty qualities. + * + * @throws NullPointerException if any of the input arrays is {@code null}. + * @throws IllegalArgumentException if {@code IllegalArgumentException} + * if the input array don't have the same length. + * + * @return never {@code null}, an matrix of the dimensions explained above. + */ + @SuppressWarnings("unused") + public static double[][] qualToTransProbs(final byte[] insQuals, final byte[] delQuals, final byte[] gcps) { + final double[][] dest = createTransitionMatrix(insQuals.length); + qualToTransProbs(dest,insQuals,delQuals,gcps); + return dest; + } + + /** + * Fills a transition log10 probability array given the different quality scores affecting a read site. + * + * @param insQual the insertion quality score as a byte. + * @param delQual the deletion quality score as a byte. + * @param gcp the gap-continuation-penalty score as a byte. + * + * @throws NullPointerException if {@code dest} is {@code null}. + * @throws ArrayIndexOutOfBoundsException if {@code dest} is not large enough. + * @throws IllegalArgumentException if {@code insQual}, {@code delQual} or {@code gcp} is less than negative. 
+ */ + public static void qualToTransProbsLog10(final double[] dest, final byte insQual, final byte delQual, final byte gcp) { + if (insQual < 0) throw new IllegalArgumentException("insert quality cannot less than 0: " + insQual); + if (delQual < 0) throw new IllegalArgumentException("deletion quality cannot be less than 0: " + delQual); + if (gcp < 0) throw new IllegalArgumentException("gcp cannot be less than 0: " + gcp); + dest[matchToMatch] = matchToMatchProbLog10(insQual, delQual); + dest[matchToInsertion] = QualityUtils.qualToErrorProbLog10(insQual); + dest[matchToDeletion] = QualityUtils.qualToErrorProbLog10(delQual); + dest[indelToMatch] = QualityUtils.qualToProbLog10(gcp); + dest[insertionToInsertion] = dest[deletionToDeletion] = QualityUtils.qualToErrorProbLog10(gcp); + } + + /** + * Returns a transition log10 probability array given the different quality scores affecting a read site. + * + * @param insQual the insertion quality score as a byte. + * @param delQual the deletion quality score as a byte. + * @param gcp the gap-continuation-penalty score as a byte. + * + * @throws NullPointerException if {@code dest} is {@code null}. + * @throws ArrayIndexOutOfBoundsException if {@code dest} is not large enough. + * @throws IllegalArgumentException if {@code insQual}, {@code delQual} or {@code gcp} is less than negative. + * + * @return never {@code null}. An array of length {@link #TRANS_PROB_ARRAY_LENGTH}. + */ + @SuppressWarnings("unused") + public static double[] qualToTransProbsLog10(final byte insQual, final byte delQual, final byte gcp) { + final double[] dest = new double[TRANS_PROB_ARRAY_LENGTH]; + qualToTransProbsLog10(dest,insQual,delQual,gcp); + return dest; + } + + /** + * Fills a matrix with the log10 transition probabilities for a number of bases. + * + *

+ * The first dimension of the matrix corresponds to the different bases where the first one is stored in position 1. + * Thus the position 0 is left empty and the length of the resulting matrix is actually {@code insQual.length + 1}. + *

+ * Each entry is the transition probability array for that base with a length of {@link #TRANS_PROB_ARRAY_LENGTH}. + * + * @param insQuals insertion qualities. + * @param delQuals deletion qualities. + * @param gcps gap-continuation penalty qualities. + * + * @throws NullPointerException if any of the input arrays, matrices is {@code null} or any entry in {@code dest} is {@code null}. + * @throws IllegalArgumentException if {@code IllegalArgumentException} + * if the input array don't have the same length. + * @throws ArrayIndexOutOfBoundsException if {@code dest} or any of its elements is not large enough to contain the + * transition matrix. + */ + @SuppressWarnings("unused") + public static void qualToTransProbsLog10(final double[][] dest, final byte[] insQuals, final byte[] delQuals, final byte[] gcps) { + final int readLength = insQuals.length; + if (delQuals.length != readLength) throw new IllegalArgumentException("deletion quality array length does not match insert quality array length: " + readLength + " != " + delQuals.length); + if (gcps.length != readLength) throw new IllegalArgumentException("deletion quality array length does not match insert quality array length: " + readLength + " != " + gcps.length); + if (dest.length < readLength + 1) throw new IllegalArgumentException("destination length is not enough for the read length: " + dest.length + " < " + readLength + " + 1"); + + for (int i = 0; i < readLength; i++) + qualToTransProbsLog10(dest[i+1],insQuals[i],delQuals[i],gcps[i]); + } + + /** + * Returns a matrix with the log10 transition probabilities for a number of bases. + * + *

+ * The first dimension of the matrix corresponds to the different bases where the first one is stored in position 1. + * Thus the position 0 is left empty and the length of the resulting matrix is actually {@code insQual.length + 1}. + *

+ * Each entry is the transition probability array for that base with a length of {@link #TRANS_PROB_ARRAY_LENGTH}. + * + * @param insQuals insertion qualities. + * @param delQuals deletion qualities. + * @param gcps gap-continuation penalty qualities. + * + * @throws NullPointerException if any of the input arrays is {@code null}. + * @throws IllegalArgumentException if {@code IllegalArgumentException} + * if the input array don't have the same length. + * + * @return never {@code null}, an matrix of the dimensions explained above. + */ + @SuppressWarnings("unused") + public static double[][] qualToTransProbsLog10(final byte[] insQuals, final byte[] delQuals, final byte[] gcps) { + final double[][] dest = createTransitionMatrix(insQuals.length); + qualToTransProbsLog10(dest,insQuals,delQuals,gcps); + return dest; + } + + /** + * Creates a transition probability matrix large enough to work with sequences of a particular length. + * + * @param maxReadLength the maximum read length for the transition matrix. + * + * @return never {@code null}. A matrix of {@code maxReadLength + 1} by {@link #TRANS_PROB_ARRAY_LENGTH} positions. + */ + public static double[][] createTransitionMatrix(final int maxReadLength) { + return new double[maxReadLength + 1][TRANS_PROB_ARRAY_LENGTH]; + } + + /** + * Returns the probability that neither of two event takes place. + *

+ * + * We assume that both events never occur together and that delQual is the conditional probability + * (qual. encoded) of the second event, given the first event didn't take place. So that the + * probability of no event is:
+ * + * We assume that both events never occur together so that the probability of no event is:
+ * + * 1 - ProbErr(insQual) - ProbErr(delQual)
+ * + * @param insQual PhRED scaled quality/probability of the first event. + * @param delQual PhRED scaled quality/probability of the second event. + * + * @return a value between 0 and 1. + */ + public static double matchToMatchProb(final byte insQual, final byte delQual) { + return matchToMatchProb((insQual & 0xFF), (delQual & 0xFF)); + } + + /** + * Returns the probability (log 10 scaled) that neither of two event, insertion and deletion, takes place. + *

+ * + * We assume that both event never occur together so that the probability of no event is:
+ * + * 1 - ProbErr(insQual) - ProbErr(delQual)
+ * + * @param insQual PhRED scaled quality/probability of an insertion. + * @param delQual PhRED scaled quality/probability of a deletion. + * + * @return a value between 0 and -Inf. + */ + public static double matchToMatchProbLog10(final byte insQual, final byte delQual) { + return matchToMatchProbLog10((insQual & 0xFF), (delQual & 0xFF)); + } + + /** + * Returns the probability that neither of two events, insertion and deletion, takes place. + *

+ * + * We assume that both event never occur together and that delQual is the conditional probability + * (qual. encoded) of the second event, given the first event didn't took place. So that the + * probability of no event is:
+ * + * We assume that both events never occur together and that delQual is the conditional probability + * (qual. encoded) of the second event, given the first event didn't take place. So that the + * probability of no event is:
+ * + * 1 - ProbErr(insQual) - ProbErr(delQual)
+ * + * @param insQual PhRED scaled quality/probability of an insertion. + * @param delQual PhRED scaled quality/probability of a deletion. + * @return a value between 0 and 1. + */ + public static double matchToMatchProb(final int insQual, final int delQual) { + final int minQual; + final int maxQual; + if (insQual <= delQual) { + minQual = insQual; + maxQual = delQual; + } else { + minQual = delQual; + maxQual = insQual; + } + + if (minQual < 0) throw new IllegalArgumentException("quality cannot be negative: " + minQual + " and " + maxQual); + + return (QualityUtils.MAX_QUAL < maxQual) ? 1.0 - Math.pow(10, MathUtils.approximateLog10SumLog10(-0.1 * minQual, -0.1 * maxQual)) : + matchToMatchProb[((maxQual * (maxQual + 1)) >> 1) + minQual]; + } + + /** + * Returns the probability (log 10 scaled) that neither of two event takes place. + *

+ * + * We assume that both event never occur together and that delQual is the conditional probability (qual. encoded) + * of the second event, given the first event didn't took place. So that the probability of no event is:
+ * + * We assume that both events never occur together and that delQual is the conditional probability (qual. encoded) + * of the second event, given the first event didn't take place. So that the probability of no event is:
+ * + * 1 - ProbErr(insQual) - ProbErr(delQual)
+ * + * @param insQual PhRED scaled quality/probability of an insertion. + * @param delQual PhRED scaled quality/probability of a deletion. + * + * @return a value between 0 and -Inf. + */ + public static double matchToMatchProbLog10(final int insQual, final int delQual) { + final int minQual; + final int maxQual; + if (insQual <= delQual) { + minQual = insQual; + maxQual = delQual; + } else { + minQual = delQual; + maxQual = insQual; + } + return (QualityUtils.MAX_QUAL < maxQual) ? Math.log1p ( + - Math.min(1,Math.pow(10, + MathUtils.approximateLog10SumLog10(-.1 * minQual, -.1 * maxQual)))) * INV_LN10 : + matchToMatchLog10[((maxQual * (maxQual + 1)) >> 1) + minQual]; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMReadyHaplotypes.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMMReadyHaplotypes.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMReadyHaplotypes.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMMReadyHaplotypes.java diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElement.java new file mode 100644 index 000000000..42cfc9492 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -0,0 +1,539 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* 
Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.pileup; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.Arrays; +import java.util.EnumSet; +import java.util.LinkedList; +import java.util.List; + +/** + * Created by IntelliJ IDEA. 
+ * User: depristo + * Date: Apr 14, 2009 + * Time: 8:54:05 AM + */ +public class PileupElement implements Comparable { + private final static LinkedList EMPTY_LINKED_LIST = new LinkedList<>(); + + private final static EnumSet ON_GENOME_OPERATORS = + EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.D); + + public static final byte DELETION_BASE = BaseUtils.Base.D.base; + public static final byte DELETION_QUAL = (byte) 16; + public static final byte A_FOLLOWED_BY_INSERTION_BASE = (byte) 87; + public static final byte C_FOLLOWED_BY_INSERTION_BASE = (byte) 88; + public static final byte T_FOLLOWED_BY_INSERTION_BASE = (byte) 89; + public static final byte G_FOLLOWED_BY_INSERTION_BASE = (byte) 90; + + protected final GATKSAMRecord read; // the read this base belongs to + protected final int offset; // the offset in the bases array for this base + + private final CigarElement currentCigarElement; + private final int currentCigarOffset; + private final int offsetInCurrentCigar; + + /** + * Create a new pileup element + * + * @param read a non-null read to pileup + * @param baseOffset the offset into the read's base / qual vector aligned to this position on the genome. If the + * current cigar element is a deletion, offset should be the offset of the last M/=/X position. + * @param currentElement a non-null CigarElement that indicates the cigar element aligning the read to the genome + * @param currentCigarOffset the offset of currentElement in read.getCigar().getElement(currentCigarOffset) == currentElement) + * @param offsetInCurrentCigar how far into the currentElement are we in our alignment to the genome? 
+ */ + public PileupElement(final GATKSAMRecord read, final int baseOffset, + final CigarElement currentElement, final int currentCigarOffset, + final int offsetInCurrentCigar) { + assert currentElement != null; + + this.read = read; + this.offset = baseOffset; + this.currentCigarElement = currentElement; + this.currentCigarOffset = currentCigarOffset; + this.offsetInCurrentCigar = offsetInCurrentCigar; + + // for performance regions these are assertions + assert this.read != null; + assert this.offset >= 0 && this.offset < this.read.getReadLength(); + assert this.currentCigarOffset >= 0; + assert this.currentCigarOffset < read.getCigarLength(); + assert this.offsetInCurrentCigar >= 0; + assert this.offsetInCurrentCigar < currentElement.getLength(); + } + + /** + * Create a new PileupElement that's a copy of toCopy + * @param toCopy the element we want to copy + */ + public PileupElement(final PileupElement toCopy) { + this(toCopy.read, toCopy.offset, toCopy.currentCigarElement, toCopy.currentCigarOffset, toCopy.offsetInCurrentCigar); + } + + /** + * Is this element a deletion w.r.t. the reference genome? + * + * @return true if this is a deletion, false otherwise + */ + public boolean isDeletion() { + return currentCigarElement.getOperator() == CigarOperator.D; + } + + /** + * Is the current element immediately before a deletion, but itself not a deletion? + * + * Suppose we are aligning a read with cigar 3M2D1M. This function is true + * if we are in the last cigar position of the 3M, but not if we are in the 2D itself. + * + * @return true if the next alignment position is a deletion w.r.t. the reference genome + */ + public boolean isBeforeDeletionStart() { + return ! isDeletion() && atEndOfCurrentCigar() && hasOperator(getNextOnGenomeCigarElement(), CigarOperator.D); + } + + /** + * Is the current element immediately after a deletion, but itself not a deletion? + * + * Suppose we are aligning a read with cigar 1M2D3M. 
This function is true + * if we are in the first cigar position of the 3M, but not if we are in the 2D itself or + * in any but the first position of the 3M. + * + * @return true if the previous alignment position is a deletion w.r.t. the reference genome + */ + public boolean isAfterDeletionEnd() { + return ! isDeletion() && atStartOfCurrentCigar() && hasOperator(getPreviousOnGenomeCigarElement(), CigarOperator.D); + } + + /** + * Get the read for this pileup element + * @return a non-null GATKSAMRecord + */ + @Ensures("result != null") + public GATKSAMRecord getRead() { + return read; + } + + /** + * Get the offset of the this element into the read that aligns that read's base to this genomic position. + * + * If the current element is a deletion then offset is the offset of the last base containing offset. + * + * @return a valid offset into the read's bases + */ + @Ensures({"result >= 0", "result <= read.getReadLength()"}) + public int getOffset() { + return offset; + } + + /** + * Get the base aligned to the genome at this location + * + * If the current element is a deletion returns DELETION_BASE + * + * @return a base encoded as a byte + */ + @Ensures("result != DELETION_BASE || (isDeletion() && result == DELETION_BASE)") + public byte getBase() { + return isDeletion() ? DELETION_BASE : read.getReadBases()[offset]; + } + + @Deprecated + public int getBaseIndex() { + return BaseUtils.simpleBaseToBaseIndex(getBase()); + } + + /** + * Get the base quality score of the base at this aligned position on the genome + * @return a phred-scaled quality score as a byte + */ + public byte getQual() { + return isDeletion() ? DELETION_QUAL : read.getBaseQualities()[offset]; + } + + /** + * Get the Base Insertion quality at this pileup position + * @return a phred-scaled quality score as a byte + */ + public byte getBaseInsertionQual() { + return isDeletion() ? 
DELETION_QUAL : read.getBaseInsertionQualities()[offset]; + } + + /** + * Get the Base Deletion quality at this pileup position + * @return a phred-scaled quality score as a byte + */ + public byte getBaseDeletionQual() { + return isDeletion() ? DELETION_QUAL : read.getBaseDeletionQualities()[offset]; + } + + /** + * Get the length of an immediately following insertion or deletion event, or 0 if no such event exists + * + * Only returns a positive value when this pileup element is immediately before an indel. Being + * immediately before a deletion means that this pileup element isn't an deletion, and that the + * next genomic alignment for this read is a deletion. For the insertion case, this means + * that an insertion cigar occurs immediately after this element, between this one and the + * next genomic position. + * + * Note this function may be expensive, so multiple uses should be cached by the caller + * + * @return length of the event (number of inserted or deleted bases), or 0 + */ + @Ensures("result >= 0") + public int getLengthOfImmediatelyFollowingIndel() { + final CigarElement element = getNextIndelCigarElement(); + return element == null ? 0 : element.getLength(); + } + + /** + * Helpful function to get the immediately following cigar element, for an insertion or deletion + * + * if this state precedes a deletion (i.e., next position on genome) or insertion (immediately between + * this and the next position) returns the CigarElement corresponding to this event. Otherwise returns + * null. + * + * @return a CigarElement, or null if the next alignment state ins't an insertion or deletion. 
+ */ + private CigarElement getNextIndelCigarElement() { + if ( isBeforeDeletionStart() ) { + final CigarElement element = getNextOnGenomeCigarElement(); + if ( element == null || element.getOperator() != CigarOperator.D ) + throw new IllegalStateException("Immediately before deletion but the next cigar element isn't a deletion " + element); + return element; + } else if ( isBeforeInsertion() ) { + final CigarElement element = getBetweenNextPosition().get(0); + if ( element.getOperator() != CigarOperator.I ) + throw new IllegalStateException("Immediately before insertion but the next cigar element isn't an insertion " + element); + return element; + } else { + return null; + } + } + + /** + * Get the bases for an insertion that immediately follows this alignment state, or null if none exists + * + * @see #getLengthOfImmediatelyFollowingIndel() for details on the meaning of immediately. + * + * If the immediately following state isn't an insertion, returns null + * + * @return actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read. 
+ */ + @Ensures("result == null || result.length() == getLengthOfImmediatelyFollowingIndel()") + public String getBasesOfImmediatelyFollowingInsertion() { + final CigarElement element = getNextIndelCigarElement(); + if ( element != null && element.getOperator() == CigarOperator.I ) { + final int getFrom = offset + 1; + final byte[] bases = Arrays.copyOfRange(read.getReadBases(), getFrom, getFrom + element.getLength()); + return new String(bases); + } else + return null; + } + + /** + * Get the mapping quality of the read of this element + * @return the mapping quality of the underlying SAM record + */ + public int getMappingQual() { + return read.getMappingQuality(); + } + + @Ensures("result != null") + public String toString() { + return String.format("%s @ %d = %c Q%d", getRead().getReadName(), getOffset(), (char) getBase(), getQual()); + } + + @Override + public int compareTo(final PileupElement pileupElement) { + if (offset < pileupElement.offset) + return -1; + else if (offset > pileupElement.offset) + return 1; + else if (read.getAlignmentStart() < pileupElement.read.getAlignmentStart()) + return -1; + else if (read.getAlignmentStart() > pileupElement.read.getAlignmentStart()) + return 1; + else + return 0; + } + + // -------------------------------------------------------------------------- + // + // Reduced read accessors + // + // -------------------------------------------------------------------------- + + /** + * Get the cigar element aligning this element to the genome + * @return a non-null CigarElement + */ + @Ensures("result != null") + public CigarElement getCurrentCigarElement() { + return currentCigarElement; + } + + /** + * Get the offset of this cigar element in the Cigar of the current read (0-based) + * + * Suppose the cigar is 1M2D3I4D. If we are in the 1M state this function returns + * 0. If we are in 2D, the result is 1. If we are in the 4D, the result is 3. 
+ * + * @return an offset into the read.getCigar() that brings us to the current cigar element + */ + public int getCurrentCigarOffset() { + return currentCigarOffset; + } + + /** + * Get the offset into the *current* cigar element for this alignment position + * + * We can be anywhere from offset 0 (first position) to length - 1 of the current + * cigar element aligning us to this genomic position. + * + * @return a valid offset into the current cigar element + */ + @Ensures({"result >= 0", "result < getCurrentCigarElement().getLength()"}) + public int getOffsetInCurrentCigar() { + return offsetInCurrentCigar; + } + + /** + * Get the cigar elements that occur before the current position but after the previous position on the genome + * + * For example, if we are in the 3M state of 1M2I3M state then 2I occurs before this position. + * + * Note that this function does not care where we are in the current cigar element. In the previous + * example this list of elements contains the 2I state regardless of where you are in the 3M. + * + * Note this returns the list of all elements that occur between this and the prev site, so for + * example we might have 5S10I2M and this function would return [5S, 10I]. + * + * @return a non-null list of CigarElements + */ + @Ensures("result != null") + public LinkedList getBetweenPrevPosition() { + return atStartOfCurrentCigar() ? getBetween(Direction.PREV) : EMPTY_LINKED_LIST; + } + + /** + * Get the cigar elements that occur after the current position but before the next position on the genome + * + * @see #getBetweenPrevPosition() for more details + * + * @return a non-null list of CigarElements + */ + @Ensures("result != null") + public LinkedList getBetweenNextPosition() { + return atEndOfCurrentCigar() ? 
getBetween(Direction.NEXT) : EMPTY_LINKED_LIST; + } + + /** for some helper functions */ + private enum Direction { PREV, NEXT } + + /** + * Helper function to get cigar elements between this and either the prev or next genomic position + * + * @param direction PREVIOUS if we want before, NEXT if we want after + * @return a non-null list of cigar elements between this and the neighboring position in direction + */ + @Ensures("result != null") + private LinkedList getBetween(final Direction direction) { + final int increment = direction == Direction.NEXT ? 1 : -1; + LinkedList elements = null; + final int nCigarElements = read.getCigarLength(); + for ( int i = currentCigarOffset + increment; i >= 0 && i < nCigarElements; i += increment) { + final CigarElement elt = read.getCigar().getCigarElement(i); + if ( ON_GENOME_OPERATORS.contains(elt.getOperator()) ) + break; + else { + // optimization: don't allocate list if not necessary + if ( elements == null ) + elements = new LinkedList(); + + if ( increment > 0 ) + // to keep the list in the right order, if we are incrementing positively add to the end + elements.add(elt); + else + // counting down => add to front + elements.addFirst(elt); + } + } + + // optimization: elements is null because nothing got added, just return the empty list + return elements == null ? EMPTY_LINKED_LIST : elements; + } + + /** + * Get the cigar element of the previous genomic aligned position + * + * For example, we might have 1M2I3M, and be sitting at the someone in the 3M. This + * function would return 1M, as the 2I isn't on the genome. Note this function skips + * all of the positions that would occur in the current element. So the result + * is always 1M regardless of whether we're in the first, second, or third position of the 3M + * cigar. 
+ * + * @return a CigarElement, or null (indicating that no previous element exists) + */ + @Ensures("result == null || ON_GENOME_OPERATORS.contains(result.getOperator())") + public CigarElement getPreviousOnGenomeCigarElement() { + return getNeighboringOnGenomeCigarElement(Direction.PREV); + } + + /** + * Get the cigar element of the next genomic aligned position + * + * @see #getPreviousOnGenomeCigarElement() for more details + * + * @return a CigarElement, or null (indicating that no next element exists) + */ + @Ensures("result == null || ON_GENOME_OPERATORS.contains(result.getOperator())") + public CigarElement getNextOnGenomeCigarElement() { + return getNeighboringOnGenomeCigarElement(Direction.NEXT); + } + + /** + * Helper function to get the cigar element of the next or previous genomic position + * @param direction the direction to look in + * @return a CigarElement, or null if no such element exists + */ + @Ensures("result == null || ON_GENOME_OPERATORS.contains(result.getOperator())") + private CigarElement getNeighboringOnGenomeCigarElement(final Direction direction) { + final int increment = direction == Direction.NEXT ? 1 : -1; + final int nCigarElements = read.getCigarLength(); + + for ( int i = currentCigarOffset + increment; i >= 0 && i < nCigarElements; i += increment) { + final CigarElement elt = read.getCigar().getCigarElement(i); + if ( ON_GENOME_OPERATORS.contains(elt.getOperator()) ) + return elt; + } + + // getting here means that you didn't find anything + return null; + } + + /** + * Does the cigar element (which may be null) have operation toMatch? 
+ * + * @param maybeCigarElement a CigarElement that might be null + * @param toMatch a CigarOperator we want to match against the one in maybeCigarElement + * @return true if maybeCigarElement isn't null and has operator toMatch + */ + @Requires("toMatch != null") + private boolean hasOperator(final CigarElement maybeCigarElement, final CigarOperator toMatch) { + return maybeCigarElement != null && maybeCigarElement.getOperator() == toMatch; + } + + /** + * Does an insertion occur immediately before the current position on the genome? + * + * @return true if yes, false if no + */ + public boolean isAfterInsertion() { return isAfter(getBetweenPrevPosition(), CigarOperator.I); } + + /** + * Does an insertion occur immediately after the current position on the genome? + * + * @return true if yes, false if no + */ + public boolean isBeforeInsertion() { return isBefore(getBetweenNextPosition(), CigarOperator.I); } + + /** + * Does a soft-clipping event occur immediately before the current position on the genome? + * + * @return true if yes, false if no + */ + public boolean isAfterSoftClip() { return isAfter(getBetweenPrevPosition(), CigarOperator.S); } + + /** + * Does a soft-clipping event occur immediately after the current position on the genome? + * + * @return true if yes, false if no + */ + public boolean isBeforeSoftClip() { return isBefore(getBetweenNextPosition(), CigarOperator.S); } + + /** + * Does a soft-clipping event occur immediately before or after the current position on the genome? + * + * @return true if yes, false if no + */ + public boolean isNextToSoftClip() { return isAfterSoftClip() || isBeforeSoftClip(); } + + /** + * Is the current position at the end of the current cigar? + * + * For example, if we are in element 3M, this function returns true if we are at offsetInCurrentCigar + * of 2, but not 0 or 1. 
+ * + * @return true if we're at the end of the current cigar + */ + public boolean atEndOfCurrentCigar() { + return offsetInCurrentCigar == currentCigarElement.getLength() - 1; + } + + /** + * Is the current position at the start of the current cigar? + * + * For example, if we are in element 3M, this function returns true if we are at offsetInCurrentCigar + * of 0, but not 1 or 2. + * + * @return true if we're at the start of the current cigar + */ + public boolean atStartOfCurrentCigar() { + return offsetInCurrentCigar == 0; + } + + /** + * Is op the last element in the list of elements? + * + * @param elements the elements to examine + * @param op the op we want the last element's op to equal + * @return true if op == last(elements).op + */ + @Requires({"elements != null", "op != null"}) + private boolean isAfter(final LinkedList elements, final CigarOperator op) { + return ! elements.isEmpty() && elements.peekLast().getOperator() == op; + } + + /** + * Is op the first element in the list of elements? + * + * @param elements the elements to examine + * @param op the op we want the last element's op to equal + * @return true if op == first(elements).op + */ + @Requires({"elements != null", "op != null"}) + private boolean isBefore(final List elements, final CigarOperator op) { + return ! 
elements.isEmpty() && elements.get(0).getOperator() == op; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElementFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElementFilter.java diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java new file mode 100644 index 000000000..6ccf74e4e --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java @@ -0,0 +1,1040 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including 
without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.pileup; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.fragments.FragmentCollection; +import org.broadinstitute.sting.utils.fragments.FragmentUtils; +import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.BaseUtils; + +import java.util.*; + +public class ReadBackedPileupImpl implements ReadBackedPileup { + protected final GenomeLoc loc; + protected final PileupElementTracker pileupElementTracker; + + private final static int UNINITIALIZED_CACHED_INT_VALUE = -1; + + /** + * Different then number of elements due to reduced reads + */ + private int depthOfCoverage = UNINITIALIZED_CACHED_INT_VALUE; + private int nDeletions = UNINITIALIZED_CACHED_INT_VALUE; // cached value of the number of deletions + private int nMQ0Reads = UNINITIALIZED_CACHED_INT_VALUE; // cached value of the number 
of MQ0 reads + + /** + * Create a new version of a read backed pileup at loc, using the reads and their corresponding + * offsets. This pileup will contain a list, in order of the reads, of the piled bases at + * reads[i] for all i in offsets. Does not make a copy of the data, so it's not safe to + * go changing the reads. + * + * @param loc The genome loc to associate reads wotj + * @param reads + * @param offsets + */ + public ReadBackedPileupImpl(GenomeLoc loc, List reads, List offsets) { + this.loc = loc; + this.pileupElementTracker = readsOffsets2Pileup(reads, offsets); + } + + + /** + * Create a new version of a read backed pileup at loc without any aligned reads + */ + public ReadBackedPileupImpl(GenomeLoc loc) { + this(loc, new UnifiedPileupElementTracker()); + } + + /** + * Create a new version of a read backed pileup at loc, using the reads and their corresponding + * offsets. This lower level constructure assumes pileup is well-formed and merely keeps a + * pointer to pileup. Don't go changing the data in pileup. 
+ */ + public ReadBackedPileupImpl(GenomeLoc loc, List pileup) { + if (loc == null) throw new ReviewedStingException("Illegal null genomeloc in ReadBackedPileup"); + if (pileup == null) throw new ReviewedStingException("Illegal null pileup in ReadBackedPileup"); + + this.loc = loc; + this.pileupElementTracker = new UnifiedPileupElementTracker(pileup); + } + + /** + * Optimization of above constructor where all of the cached data is provided + * + * @param loc + * @param pileup + */ + @Deprecated + public ReadBackedPileupImpl(GenomeLoc loc, List pileup, int size, int nDeletions, int nMQ0Reads) { + this(loc, pileup); + } + + protected ReadBackedPileupImpl(GenomeLoc loc, PileupElementTracker tracker) { + this.loc = loc; + this.pileupElementTracker = tracker; + } + + public ReadBackedPileupImpl(GenomeLoc loc, Map pileupsBySample) { + this.loc = loc; + PerSamplePileupElementTracker tracker = new PerSamplePileupElementTracker(); + for (Map.Entry pileupEntry : pileupsBySample.entrySet()) { + tracker.addElements(pileupEntry.getKey(), pileupEntry.getValue().pileupElementTracker); + } + this.pileupElementTracker = tracker; + } + + public ReadBackedPileupImpl(GenomeLoc loc, List reads, int offset) { + this.loc = loc; + this.pileupElementTracker = readsOffsets2Pileup(reads, offset); + } + + /** + * Helper routine for converting reads and offset lists to a PileupElement list. 
+ * + * @param reads + * @param offsets + * @return + */ + private PileupElementTracker readsOffsets2Pileup(List reads, List offsets) { + if (reads == null) throw new ReviewedStingException("Illegal null read list in UnifiedReadBackedPileup"); + if (offsets == null) throw new ReviewedStingException("Illegal null offsets list in UnifiedReadBackedPileup"); + if (reads.size() != offsets.size()) + throw new ReviewedStingException("Reads and offset lists have different sizes!"); + + UnifiedPileupElementTracker pileup = new UnifiedPileupElementTracker(); + for (int i = 0; i < reads.size(); i++) { + GATKSAMRecord read = reads.get(i); + int offset = offsets.get(i); + pileup.add(createNewPileupElement(read, offset)); // only used to create fake pileups for testing so ancillary information is not important + } + + return pileup; + } + + /** + * Helper routine for converting reads and a single offset to a PileupElement list. + * + * @param reads + * @param offset + * @return + */ + private PileupElementTracker readsOffsets2Pileup(List reads, int offset) { + if (reads == null) throw new ReviewedStingException("Illegal null read list in UnifiedReadBackedPileup"); + if (offset < 0) throw new ReviewedStingException("Illegal offset < 0 UnifiedReadBackedPileup"); + + UnifiedPileupElementTracker pileup = new UnifiedPileupElementTracker(); + for (GATKSAMRecord read : reads) { + pileup.add(createNewPileupElement(read, offset)); // only used to create fake pileups for testing so ancillary information is not important + } + + return pileup; + } + + protected ReadBackedPileupImpl createNewPileup(GenomeLoc loc, PileupElementTracker tracker) { + return new ReadBackedPileupImpl(loc, tracker); + } + + protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset) { + return LocusIteratorByState.createPileupForReadAndOffset(read, offset); + } + + // -------------------------------------------------------- + // + // Special 'constructors' + // + // 
-------------------------------------------------------- + + /** + * Returns a new ReadBackedPileup that is free of deletion spanning reads in this pileup. Note that this + * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy + * of the pileup (just returns this) if there are no deletions in the pileup. + * + * @return + */ + @Override + public ReadBackedPileupImpl getPileupWithoutDeletions() { + if (getNumberOfDeletions() > 0) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupWithoutDeletions(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return createNewPileup(loc, filteredTracker); + + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + + for (PileupElement p : tracker) { + if (!p.isDeletion()) { + filteredTracker.add(p); + } + } + return createNewPileup(loc, filteredTracker); + } + } else { + return this; + } + } + + /** + * Returns a new ReadBackedPileup where only one read from an overlapping read + * pair is retained. If the two reads in question disagree to their basecall, + * neither read is retained. 
If they agree on the base, the read with the higher + * base quality observation is retained + * + * @return the newly filtered pileup + */ + @Override + public ReadBackedPileup getOverlappingFragmentFilteredPileup() { + return getOverlappingFragmentFilteredPileup(true, true); + } + + /** + * Returns a new ReadBackedPileup where only one read from an overlapping read + * pair is retained. If discardDiscordant and the two reads in question disagree to their basecall, + * neither read is retained. Otherwise, the read with the higher + * quality (base or mapping, depending on baseQualNotMapQual) observation is retained + * + * @return the newly filtered pileup + */ + @Override + public ReadBackedPileupImpl getOverlappingFragmentFilteredPileup(boolean discardDiscordant, boolean baseQualNotMapQual) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(discardDiscordant, baseQualNotMapQual); + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return createNewPileup(loc, filteredTracker); + } else { + Map filteredPileup = new HashMap(); + + for (PileupElement p : pileupElementTracker) { + String readName = p.getRead().getReadName(); + + // if we've never seen this read before, life is good + if (!filteredPileup.containsKey(readName)) { + filteredPileup.put(readName, p); + } else { + PileupElement existing = filteredPileup.get(readName); + + // if the reads disagree at this position, throw them both out. 
Otherwise + // keep the element with the higher quality score + if (discardDiscordant && existing.getBase() != p.getBase()) { + filteredPileup.remove(readName); + } else { + if (baseQualNotMapQual) { + if (existing.getQual() < p.getQual()) + filteredPileup.put(readName, p); + } + else { + if (existing.getMappingQual() < p.getMappingQual()) + filteredPileup.put(readName, p); + } + } + } + } + + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for (PileupElement filteredElement : filteredPileup.values()) + filteredTracker.add(filteredElement); + + return createNewPileup(loc, filteredTracker); + } + } + + + /** + * Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this + * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy + * of the pileup (just returns this) if there are no MQ0 reads in the pileup. + * + * @return + */ + @Override + public ReadBackedPileupImpl getPileupWithoutMappingQualityZeroReads() { + if (getNumberOfMappingQualityZeroReads() > 0) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupWithoutMappingQualityZeroReads(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return createNewPileup(loc, filteredTracker); + + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + + for (PileupElement p : tracker) { + if (p.getRead().getMappingQuality() > 0) { + 
filteredTracker.add(p); + } + } + return createNewPileup(loc, filteredTracker); + } + } else { + return this; + } + } + + public ReadBackedPileupImpl getPositiveStrandPileup() { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPositiveStrandPileup(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return createNewPileup(loc, filteredTracker); + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + + for (PileupElement p : tracker) { + if (!p.getRead().getReadNegativeStrandFlag()) { + filteredTracker.add(p); + } + } + return createNewPileup(loc, filteredTracker); + } + } + + /** + * Gets the pileup consisting of only reads on the negative strand. + * + * @return A read-backed pileup consisting only of reads on the negative strand. 
+ */ + public ReadBackedPileupImpl getNegativeStrandPileup() { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getNegativeStrandPileup(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return createNewPileup(loc, filteredTracker); + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + + for (PileupElement p : tracker) { + if (p.getRead().getReadNegativeStrandFlag()) { + filteredTracker.add(p); + } + } + return createNewPileup(loc, filteredTracker); + } + } + + /** + * Gets a pileup consisting of all those elements passed by a given filter. + * + * @param filter Filter to use when testing for elements. + * @return a pileup without the given filtered elements. 
+ */ + public ReadBackedPileupImpl getFilteredPileup(PileupElementFilter filter) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getFilteredPileup(filter); + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + + return createNewPileup(loc, filteredTracker); + } else { + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + + for (PileupElement p : pileupElementTracker) { + if (filter.allow(p)) + filteredTracker.add(p); + } + + return createNewPileup(loc, filteredTracker); + } + } + + /** + * Returns subset of this pileup that contains only bases with quality >= minBaseQ, coming from + * reads with mapping qualities >= minMapQ. This method allocates and returns a new instance of ReadBackedPileup. 
+ * + * @param minBaseQ + * @param minMapQ + * @return + */ + @Override + public ReadBackedPileupImpl getBaseAndMappingFilteredPileup(int minBaseQ, int minMapQ) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getBaseAndMappingFilteredPileup(minBaseQ, minMapQ); + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + + return createNewPileup(loc, filteredTracker); + } else { + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + + for (PileupElement p : pileupElementTracker) { + if (p.getRead().getMappingQuality() >= minMapQ && (p.isDeletion() || p.getQual() >= minBaseQ)) { + filteredTracker.add(p); + } + } + + return createNewPileup(loc, filteredTracker); + } + } + + /** + * Returns subset of this pileup that contains only bases with quality >= minBaseQ. + * This method allocates and returns a new instance of ReadBackedPileup. + * + * @param minBaseQ + * @return + */ + @Override + public ReadBackedPileup getBaseFilteredPileup(int minBaseQ) { + return getBaseAndMappingFilteredPileup(minBaseQ, -1); + } + + /** + * Returns subset of this pileup that contains only bases coming from reads with mapping quality >= minMapQ. + * This method allocates and returns a new instance of ReadBackedPileup. + * + * @param minMapQ + * @return + */ + @Override + public ReadBackedPileup getMappingFilteredPileup(int minMapQ) { + return getBaseAndMappingFilteredPileup(-1, minMapQ); + } + + /** + * Gets a list of the read groups represented in this pileup. 
+ * + * @return + */ + @Override + public Collection getReadGroups() { + Set readGroups = new HashSet(); + for (PileupElement pileupElement : this) + readGroups.add(pileupElement.getRead().getReadGroup().getReadGroupId()); + return readGroups; + } + + /** + * Gets the pileup for a given read group. Horrendously inefficient at this point. + * + * @param targetReadGroupId Identifier for the read group. + * @return A read-backed pileup containing only the reads in the given read group. + */ + @Override + public ReadBackedPileupImpl getPileupForReadGroup(String targetReadGroupId) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupForReadGroup(targetReadGroupId); + if (pileup != null) + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; + } else { + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for (PileupElement p : pileupElementTracker) { + GATKSAMRecord read = p.getRead(); + if (targetReadGroupId != null) { + if (read.getReadGroup() != null && targetReadGroupId.equals(read.getReadGroup().getReadGroupId())) + filteredTracker.add(p); + } else { + if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) + filteredTracker.add(p); + } + } + return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; + } + } + + /** + * Gets the pileup for a set of read groups. Horrendously inefficient at this point. + * + * @param rgSet List of identifiers for the read groups. 
+ * @return A read-backed pileup containing only the reads in the given read groups. + */ + @Override + public ReadBackedPileupImpl getPileupForReadGroups(final HashSet rgSet) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupForReadGroups(rgSet); + if (pileup != null) + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; + } else { + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for (PileupElement p : pileupElementTracker) { + GATKSAMRecord read = p.getRead(); + if (rgSet != null && !rgSet.isEmpty()) { + if (read.getReadGroup() != null && rgSet.contains(read.getReadGroup().getReadGroupId())) + filteredTracker.add(p); + } else { + if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) + filteredTracker.add(p); + } + } + return filteredTracker.size() > 0 ? 
createNewPileup(loc, filteredTracker) : null; + } + } + + @Override + public ReadBackedPileupImpl getPileupForLane(String laneID) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupForLane(laneID); + if (pileup != null) + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; + } else { + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for (PileupElement p : pileupElementTracker) { + GATKSAMRecord read = p.getRead(); + if (laneID != null) { + if (read.getReadGroup() != null && + (read.getReadGroup().getReadGroupId().startsWith(laneID + ".")) || // lane is the same, but sample identifier is different + (read.getReadGroup().getReadGroupId().equals(laneID))) // in case there is no sample identifier, they have to be exactly the same + filteredTracker.add(p); + } else { + if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) + filteredTracker.add(p); + } + } + return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; + } + } + + public Collection getSamples() { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + return new HashSet(tracker.getSamples()); + } else { + Collection sampleNames = new HashSet(); + for (PileupElement p : this) { + GATKSAMRecord read = p.getRead(); + String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; + sampleNames.add(sampleName); + } + return sampleNames; + } + } + + /** + * Returns a pileup randomly downsampled to the desiredCoverage. + * + * TODO: delete this once the experimental downsampler stabilizes + * + * @param desiredCoverage + * @return + */ + @Override + public ReadBackedPileup getDownsampledPileup(int desiredCoverage) { + if (getNumberOfElements() <= desiredCoverage) + return this; + + // randomly choose numbers corresponding to positions in the reads list + TreeSet positions = new TreeSet(); + for (int i = 0; i < desiredCoverage; /* no update */) { + if (positions.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(getNumberOfElements()))) + i++; + } + + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + + int current = 0; + UnifiedPileupElementTracker filteredPileup = new UnifiedPileupElementTracker(); + for (PileupElement p : perSampleElements) { + if (positions.contains(current)) + filteredPileup.add(p); + current++; + + } + filteredTracker.addElements(sample, filteredPileup); + } + + return createNewPileup(loc, filteredTracker); + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + + Iterator positionIter = positions.iterator(); + + while (positionIter.hasNext()) { + int nextReadToKeep = (Integer) positionIter.next(); + filteredTracker.add(tracker.get(nextReadToKeep)); + } + + return createNewPileup(getLocation(), filteredTracker); + } + } + + @Override + public ReadBackedPileup getPileupForSamples(Collection sampleNames) { + if 
(pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PileupElementTracker filteredElements = tracker.getElements(sampleNames); + return filteredElements != null ? createNewPileup(loc, filteredElements) : null; + } else { + HashSet hashSampleNames = new HashSet(sampleNames); // to speed up the "contains" access in the for loop + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for (PileupElement p : pileupElementTracker) { + GATKSAMRecord read = p.getRead(); + if (sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. + if (read.getReadGroup() != null && hashSampleNames.contains(read.getReadGroup().getSample())) + filteredTracker.add(p); + } else { + if (read.getReadGroup() == null || read.getReadGroup().getSample() == null) + filteredTracker.add(p); + } + } + return filteredTracker.size() > 0 ? 
createNewPileup(loc, filteredTracker) : null; + } + } + + @Override + public Map getPileupsForSamples(Collection sampleNames) { + Map result = new HashMap(); + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + for (String sample : sampleNames) { + PileupElementTracker filteredElements = tracker.getElements(sample); + if (filteredElements != null) + result.put(sample, createNewPileup(loc, filteredElements)); + } + } else { + Map> trackerMap = new HashMap>(); + + for (String sample : sampleNames) { // initialize pileups for each sample + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + trackerMap.put(sample, filteredTracker); + } + for (PileupElement p : pileupElementTracker) { // go through all pileup elements only once and add them to the respective sample's pileup + GATKSAMRecord read = p.getRead(); + if (read.getReadGroup() != null) { + String sample = read.getReadGroup().getSample(); + UnifiedPileupElementTracker tracker = trackerMap.get(sample); + if (tracker != null) // we only add the pileup the requested samples. Completely ignore the rest + tracker.add(p); + } + } + for (Map.Entry> entry : trackerMap.entrySet()) // create the ReadBackedPileup for each sample + result.put(entry.getKey(), createNewPileup(loc, entry.getValue())); + } + return result; + } + + + @Override + public ReadBackedPileup getPileupForSample(String sampleName) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PileupElementTracker filteredElements = tracker.getElements(sampleName); + return filteredElements != null ? 
createNewPileup(loc, filteredElements) : null; + } else { + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for (PileupElement p : pileupElementTracker) { + GATKSAMRecord read = p.getRead(); + if (sampleName != null) { + if (read.getReadGroup() != null && sampleName.equals(read.getReadGroup().getSample())) + filteredTracker.add(p); + } else { + if (read.getReadGroup() == null || read.getReadGroup().getSample() == null) + filteredTracker.add(p); + } + } + return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; + } + } + + // -------------------------------------------------------- + // + // iterators + // + // -------------------------------------------------------- + + /** + * The best way to access PileupElements where you only care about the bases and quals in the pileup. + *

+ * for (PileupElement p : this) { doSomething(p); } + *

+ * Provides efficient iteration of the data. + * + * @return + */ + @Override + public Iterator iterator() { + return new Iterator() { + private final Iterator wrappedIterator = pileupElementTracker.iterator(); + + public boolean hasNext() { + return wrappedIterator.hasNext(); + } + + public PileupElement next() { + return wrappedIterator.next(); + } + + public void remove() { + throw new UnsupportedOperationException("Cannot remove from a pileup element iterator"); + } + }; + } + + /** + * The best way to access PileupElements where you only care not only about bases and quals in the pileup + * but also need access to the index of the pileup element in the pile. + * + * for (ExtendedPileupElement p : this) { doSomething(p); } + * + * Provides efficient iteration of the data. + * + * @return + */ + + /** + * Simple useful routine to count the number of deletion bases in this pileup + * + * @return + */ + @Override + public int getNumberOfDeletions() { + if ( nDeletions == UNINITIALIZED_CACHED_INT_VALUE ) { + nDeletions = 0; + for (PileupElement p : pileupElementTracker.unorderedIterable() ) { + if (p.isDeletion()) { + nDeletions++; + } + } + } + return nDeletions; + } + + @Override + public int getNumberOfMappingQualityZeroReads() { + if ( nMQ0Reads == UNINITIALIZED_CACHED_INT_VALUE ) { + nMQ0Reads = 0; + + for (PileupElement p : pileupElementTracker.unorderedIterable()) { + if (p.getRead().getMappingQuality() == 0) { + nMQ0Reads++; + } + } + } + + return nMQ0Reads; + } + + /** + * @return the number of physical elements in this pileup + */ + @Override + public int getNumberOfElements() { + return pileupElementTracker.size(); + } + + /** + * @return the number of abstract elements in this pileup + */ + @Override + public int depthOfCoverage() { + if (depthOfCoverage == UNINITIALIZED_CACHED_INT_VALUE) { + depthOfCoverage = pileupElementTracker.size(); + } + return depthOfCoverage; + } + + /** + * @return true if there are 0 elements in the pileup, false otherwise + 
*/ + @Override + public boolean isEmpty() { + return getNumberOfElements() == 0; + } + + + /** + * @return the location of this pileup + */ + @Override + public GenomeLoc getLocation() { + return loc; + } + + /** + * Get counts of A, C, G, T in order, which returns a int[4] vector with counts according + * to BaseUtils.simpleBaseToBaseIndex for each base. + * + * @return + */ + @Override + public int[] getBaseCounts() { + int[] counts = new int[4]; + + // TODO -- can be optimized with .unorderedIterable() + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + for (final String sample : tracker.getSamples()) { + int[] countsBySample = createNewPileup(loc, tracker.getElements(sample)).getBaseCounts(); + for (int i = 0; i < counts.length; i++) + counts[i] += countsBySample[i]; + } + } else { + for (PileupElement pile : this) { + // skip deletion sites + if (!pile.isDeletion()) { + int index = BaseUtils.simpleBaseToBaseIndex((char) pile.getBase()); + if (index != -1) + counts[index]++; + } + } + } + + return counts; + } + + @Override + public String getPileupString(Character ref) { + // In the pileup format, each line represents a genomic position, consisting of chromosome name, + // coordinate, reference base, read bases, read qualities and alignment mapping qualities. + return String.format("%s %s %c %s %s", + getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate + ref, // reference base + new String(getBases()), + getQualsString()); + } + + // -------------------------------------------------------- + // + // Convenience functions that may be slow + // + // -------------------------------------------------------- + + /** + * Returns a list of the reads in this pileup. 
Note this call costs O(n) and allocates fresh lists each time + * + * @return + */ + @Override + public List getReads() { + List reads = new ArrayList(getNumberOfElements()); + for (PileupElement pile : this) { + reads.add(pile.getRead()); + } + return reads; + } + + @Override + public int getNumberOfDeletionsAfterThisElement() { + int count = 0; + for (PileupElement p : pileupElementTracker.unorderedIterable()) { + if (p.isBeforeDeletionStart()) + count++; + } + return count; + } + + @Override + public int getNumberOfInsertionsAfterThisElement() { + int count = 0; + for (PileupElement p : pileupElementTracker.unorderedIterable()) { + if (p.isBeforeInsertion()) + count++; + } + return count; + + } + /** + * Returns a list of the offsets in this pileup. Note this call costs O(n) and allocates fresh lists each time + * + * @return + */ + @Override + public List getOffsets() { + List offsets = new ArrayList(getNumberOfElements()); + for (PileupElement pile : pileupElementTracker.unorderedIterable()) { + offsets.add(pile.getOffset()); + } + return offsets; + } + + /** + * Returns an array of the bases in this pileup. Note this call costs O(n) and allocates fresh array each time + * + * @return + */ + @Override + public byte[] getBases() { + byte[] v = new byte[getNumberOfElements()]; + int pos = 0; + for (PileupElement pile : pileupElementTracker) { + v[pos++] = pile.getBase(); + } + return v; + } + + /** + * Returns an array of the quals in this pileup. 
Note this call costs O(n) and allocates fresh array each time + * + * @return + */ + @Override + public byte[] getQuals() { + byte[] v = new byte[getNumberOfElements()]; + int pos = 0; + for (PileupElement pile : pileupElementTracker) { + v[pos++] = pile.getQual(); + } + return v; + } + + /** + * Get an array of the mapping qualities + * + * @return + */ + @Override + public int[] getMappingQuals() { + final int[] v = new int[getNumberOfElements()]; + int pos = 0; + for ( final PileupElement pile : pileupElementTracker ) { + v[pos++] = pile.getRead().getMappingQuality(); + } + return v; + } + + static String quals2String(byte[] quals) { + StringBuilder qualStr = new StringBuilder(); + for (int qual : quals) { + qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea + char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63 + qualStr.append(qualChar); + } + + return qualStr.toString(); + } + + private String getQualsString() { + return quals2String(getQuals()); + } + + /** + * Returns a new ReadBackedPileup that is sorted by start coordinate of the reads. + * + * @return + */ + @Override + public ReadBackedPileup getStartSortedPileup() { + + final TreeSet sortedElements = new TreeSet(new Comparator() { + @Override + public int compare(PileupElement element1, PileupElement element2) { + final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart(); + return difference != 0 ? 
difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); + } + }); + + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + for (PileupElement pile : perSampleElements) + sortedElements.add(pile); + } + } + else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + for (PileupElement pile : tracker) + sortedElements.add(pile); + } + + UnifiedPileupElementTracker sortedTracker = new UnifiedPileupElementTracker(); + for (PileupElement pile : sortedElements) + sortedTracker.add(pile); + + return createNewPileup(loc, sortedTracker); + } + + @Override + public FragmentCollection toFragments() { + return FragmentUtils.create(this); + } + + @Override + public ReadBackedPileup copy() { + return new ReadBackedPileupImpl(loc, pileupElementTracker.copy()); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup2/Notes b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup2/Notes similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/pileup2/Notes rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup2/Notes diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterData.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterData.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterData.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterData.java diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRArgumentSet.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/recalibration/BQSRArgumentSet.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRArgumentSet.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/recalibration/BQSRArgumentSet.java diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/recalibration/BQSRMode.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/recalibration/BQSRMode.java diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/recalibration/EventType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/recalibration/EventType.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/CapturedStreamOutput.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/CapturedStreamOutput.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/CapturedStreamOutput.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/CapturedStreamOutput.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/InputStreamSettings.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/InputStreamSettings.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/InputStreamSettings.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/InputStreamSettings.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/OutputStreamSettings.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/OutputStreamSettings.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/OutputStreamSettings.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/OutputStreamSettings.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/ProcessController.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/ProcessController.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/ProcessController.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/ProcessController.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/ProcessOutput.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/ProcessOutput.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/utils/runtime/ProcessOutput.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/ProcessOutput.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/ProcessSettings.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/ProcessSettings.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/ProcessSettings.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/ProcessSettings.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/RuntimeUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/RuntimeUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/RuntimeUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/RuntimeUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/StreamLocation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/StreamLocation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/StreamLocation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/StreamLocation.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/StreamOutput.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/StreamOutput.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/StreamOutput.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/StreamOutput.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartComparator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/AlignmentStartComparator.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartComparator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/AlignmentStartComparator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartWithNoTiesComparator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/AlignmentStartWithNoTiesComparator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartWithNoTiesComparator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/AlignmentStartWithNoTiesComparator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/AlignmentUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/AlignmentUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIterator.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIterator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java new file mode 100644 index 000000000..b8367a7df --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -0,0 +1,484 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; + +import java.io.File; +import java.util.*; + +/** + * @author aaron + * @version 1.0 + */ +public class ArtificialSAMUtils { + public static final int DEFAULT_READ_LENGTH = 50; + + /** + * create an artificial sam file + * + * @param filename the filename to write to + * @param numberOfChromosomes the number of chromosomes + * @param startingChromosome where to start counting + * @param chromosomeSize how large each chromosome is + * @param readsPerChomosome how many reads to make in each chromosome. 
They'll be aligned from position 1 to x (which is the number of reads) + */ + public static void createArtificialBamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) { + SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize); + File outFile = new File(filename); + + SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(header, true, outFile); + + for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { + for (int readNumber = 1; readNumber < readsPerChomosome; readNumber++) { + out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, DEFAULT_READ_LENGTH)); + } + } + + out.close(); + } + + /** + * create an artificial sam file + * + * @param filename the filename to write to + * @param numberOfChromosomes the number of chromosomes + * @param startingChromosome where to start counting + * @param chromosomeSize how large each chromosome is + * @param readsPerChomosome how many reads to make in each chromosome. 
They'll be aligned from position 1 to x (which is the number of reads) + */ + public static void createArtificialSamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) { + SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize); + File outFile = new File(filename); + + SAMFileWriter out = new SAMFileWriterFactory().makeSAMWriter(header, false, outFile); + + for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { + for (int readNumber = 1; readNumber <= readsPerChomosome; readNumber++) { + out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, 100)); + } + } + + out.close(); + } + + /** + * Creates an artificial sam header, matching the parameters, chromosomes which will be labeled chr1, chr2, etc + * + * @param numberOfChromosomes the number of chromosomes to create + * @param startingChromosome the starting number for the chromosome (most likely set to 1) + * @param chromosomeSize the length of each chromosome + * @return + */ + public static SAMFileHeader createArtificialSamHeader(int numberOfChromosomes, int startingChromosome, int chromosomeSize) { + SAMFileHeader header = new SAMFileHeader(); + header.setSortOrder(net.sf.samtools.SAMFileHeader.SortOrder.coordinate); + SAMSequenceDictionary dict = new SAMSequenceDictionary(); + // make up some sequence records + for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { + SAMSequenceRecord rec = new SAMSequenceRecord("chr" + (x), chromosomeSize /* size */); + rec.setSequenceLength(chromosomeSize); + dict.addSequence(rec); + } + header.setSequenceDictionary(dict); + return header; + } + + /** + * Creates an artificial sam header based on the sequence dictionary dict + * + * @return a new sam header + */ + public static SAMFileHeader createArtificialSamHeader(final SAMSequenceDictionary dict) { + 
SAMFileHeader header = new SAMFileHeader(); + header.setSortOrder(net.sf.samtools.SAMFileHeader.SortOrder.coordinate); + header.setSequenceDictionary(dict); + return header; + } + + /** + * Creates an artificial sam header with standard test parameters + * + * @return the sam header + */ + public static SAMFileHeader createArtificialSamHeader() { + return createArtificialSamHeader(1, 1, 1000000); + } + + /** + * setup a default read group for a SAMFileHeader + * + * @param header the header to set + * @param readGroupID the read group ID tag + * @param sampleName the sample name + * @return the adjusted SAMFileHeader + */ + public static SAMFileHeader createDefaultReadGroup(SAMFileHeader header, String readGroupID, String sampleName) { + SAMReadGroupRecord rec = new SAMReadGroupRecord(readGroupID); + rec.setSample(sampleName); + List readGroups = new ArrayList(); + readGroups.add(rec); + header.setReadGroups(readGroups); + return header; + } + + /** + * setup read groups for the specified read groups and sample names + * + * @param header the header to set + * @param readGroupIDs the read group ID tags + * @param sampleNames the sample names + * @return the adjusted SAMFileHeader + */ + public static SAMFileHeader createEnumeratedReadGroups(SAMFileHeader header, List readGroupIDs, List sampleNames) { + if (readGroupIDs.size() != sampleNames.size()) { + throw new ReviewedStingException("read group count and sample name count must be the same"); + } + + List readGroups = new ArrayList(); + + int x = 0; + for (; x < readGroupIDs.size(); x++) { + SAMReadGroupRecord rec = new SAMReadGroupRecord(readGroupIDs.get(x)); + rec.setSample(sampleNames.get(x)); + readGroups.add(rec); + } + header.setReadGroups(readGroups); + return header; + } + + + /** + * Create an artificial read based on the parameters. 
The cigar string will be *M, where * is the length of the read + * + * @param header the SAM header to associate the read with + * @param name the name of the read + * @param refIndex the reference index, i.e. what chromosome to associate it with + * @param alignmentStart where to start the alignment + * @param length the length of the read + * @return the artificial read + */ + public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) { + if ((refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart != SAMRecord.NO_ALIGNMENT_START) || + (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START)) + throw new ReviewedStingException("Invalid alignment start for artificial read, start = " + alignmentStart); + GATKSAMRecord record = new GATKSAMRecord(header); + record.setReadName(name); + record.setReferenceIndex(refIndex); + record.setAlignmentStart(alignmentStart); + List elements = new ArrayList(); + elements.add(new CigarElement(length, CigarOperator.characterToEnum('M'))); + record.setCigar(new Cigar(elements)); + record.setProperPairFlag(false); + + // our reads and quals are all 'A's by default + byte[] c = new byte[length]; + byte[] q = new byte[length]; + for (int x = 0; x < length; x++) + c[x] = q[x] = 'A'; + record.setReadBases(c); + record.setBaseQualities(q); + + if (refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + record.setReadUnmappedFlag(true); + } + + return record; + } + + /** + * Create an artificial read based on the parameters. The cigar string will be *M, where * is the length of the read + * + * @param header the SAM header to associate the read with + * @param name the name of the read + * @param refIndex the reference index, i.e. 
what chromosome to associate it with + * @param alignmentStart where to start the alignment + * @param bases the sequence of the read + * @param qual the qualities of the read + * @return the artificial read + */ + public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual) { + if (bases.length != qual.length) { + throw new ReviewedStingException("Passed in read string is different length then the quality array"); + } + GATKSAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases.length); + rec.setReadBases(bases); + rec.setBaseQualities(qual); + rec.setReadGroup(new GATKSAMReadGroupRecord("x")); + if (refIndex == -1) { + rec.setReadUnmappedFlag(true); + } + + return rec; + } + + /** + * Create an artificial read based on the parameters + * + * @param header the SAM header to associate the read with + * @param name the name of the read + * @param refIndex the reference index, i.e. 
what chromosome to associate it with + * @param alignmentStart where to start the alignment + * @param bases the sequence of the read + * @param qual the qualities of the read + * @param cigar the cigar string of the read + * @return the artificial read + */ + public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar) { + GATKSAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases, qual); + rec.setCigarString(cigar); + return rec; + } + + /** + * Create an artificial read with the following default parameters : + * header: + * numberOfChromosomes = 1 + * startingChromosome = 1 + * chromosomeSize = 1000000 + * read: + * name = "default_read" + * refIndex = 0 + * alignmentStart = 1 + * + * @param bases the sequence of the read + * @param qual the qualities of the read + * @param cigar the cigar string of the read + * @return the artificial read + */ + public static GATKSAMRecord createArtificialRead(byte[] bases, byte[] qual, String cigar) { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(); + return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar); + } + + public static GATKSAMRecord createArtificialRead(Cigar cigar) { + int length = cigar.getReadLength(); + byte [] base = {'A'}; + byte [] qual = {30}; + byte [] bases = Utils.arrayFromArrayWithLength(base, length); + byte [] quals = Utils.arrayFromArrayWithLength(qual, length); + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(); + return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, quals, cigar.toString()); + } + + + public final static List createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { + GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, 
readLen); + GATKSAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen); + + left.setReadPairedFlag(true); + right.setReadPairedFlag(true); + + left.setProperPairFlag(true); + right.setProperPairFlag(true); + + left.setFirstOfPairFlag(leftIsFirst); + right.setFirstOfPairFlag(!leftIsFirst); + + left.setReadNegativeStrandFlag(leftIsNegative); + left.setMateNegativeStrandFlag(!leftIsNegative); + right.setReadNegativeStrandFlag(!leftIsNegative); + right.setMateNegativeStrandFlag(leftIsNegative); + + left.setMateAlignmentStart(right.getAlignmentStart()); + right.setMateAlignmentStart(left.getAlignmentStart()); + + left.setMateReferenceIndex(0); + right.setMateReferenceIndex(0); + + int isize = rightStart + readLen - leftStart; + left.setInferredInsertSize(isize); + right.setInferredInsertSize(-isize); + + return Arrays.asList(left, right); + } + + /** + * Create a collection of identical artificial reads based on the parameters. The cigar string for each + * read will be *M, where * is the length of the read. + * + * Useful for testing things like positional downsampling where you care only about the position and + * number of reads, and not the other attributes. + * + * @param stackSize number of identical reads to create + * @param header the SAM header to associate each read with + * @param name name associated with each read + * @param refIndex the reference index, i.e. 
what chromosome to associate them with + * @param alignmentStart where to start each alignment + * @param length the length of each read + * + * @return a collection of stackSize reads all sharing the above properties + */ + public static Collection createStackOfIdenticalArtificialReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { + Collection stack = new ArrayList(stackSize); + for ( int i = 1; i <= stackSize; i++ ) { + stack.add(createArtificialRead(header, name, refIndex, alignmentStart, length)); + } + return stack; + } + + /** + * create an iterator containing the specified read piles + * + * @param startingChr the chromosome (reference ID) to start from + * @param endingChr the id to end with + * @param readCount the number of reads per chromosome + * @return StingSAMIterator representing the specified amount of fake data + */ + public static StingSAMIterator mappedReadIterator(int startingChr, int endingChr, int readCount) { + SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + + return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header); + } + + /** + * create an iterator containing the specified read piles + * + * @param startingChr the chromosome (reference ID) to start from + * @param endingChr the id to end with + * @param readCount the number of reads per chromosome + * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file + * @return StingSAMIterator representing the specified amount of fake data + */ + public static StingSAMIterator mappedAndUnmappedReadIterator(int startingChr, int endingChr, int readCount, int unmappedReadCount) { + SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + + return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 
unmappedReadCount, header); + } + + /** + * create an ArtificialSAMQueryIterator containing the specified read piles + * + * @param startingChr the chromosome (reference ID) to start from + * @param endingChr the id to end with + * @param readCount the number of reads per chromosome + * @return StingSAMIterator representing the specified amount of fake data + */ + public static ArtificialSAMQueryIterator queryReadIterator(int startingChr, int endingChr, int readCount) { + SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + + return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header); + } + + /** + * create an ArtificialSAMQueryIterator containing the specified read piles + * + * @param startingChr the chromosome (reference ID) to start from + * @param endingChr the id to end with + * @param readCount the number of reads per chromosome + * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file + * @return StingSAMIterator representing the specified amount of fake data + */ + public static StingSAMIterator queryReadIterator(int startingChr, int endingChr, int readCount, int unmappedReadCount) { + SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + + return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header); + } + + /** + * Create an iterator containing the specified reads + * + * @param reads the reads + * @return iterator for the reads + */ + public static StingSAMIterator createReadIterator(SAMRecord... 
reads) { + return createReadIterator(Arrays.asList(reads)); + } + + /** + * Create an iterator containing the specified reads + * + * @param reads the reads + * @return iterator for the reads + */ + public static StingSAMIterator createReadIterator(List reads) { + final Iterator iter = reads.iterator(); + return new StingSAMIterator() { + @Override public void close() {} + @Override public Iterator iterator() { return iter; } + @Override public boolean hasNext() { return iter.hasNext(); } + @Override public SAMRecord next() { return iter.next(); } + @Override public void remove() { iter.remove(); } + }; + } + + private final static int ranIntInclusive(Random ran, int start, int stop) { + final int range = stop - start; + return ran.nextInt(range) + start; + } + + /** + * Creates a read backed pileup containing up to pileupSize reads at refID 0 from header at loc with + * reads created that have readLen bases. Pairs are sampled from a gaussian distribution with mean insert + * size of insertSize and variation of insertSize / 10. The first read will be in the pileup, and the second + * may be, depending on where this sampled insertSize puts it. 
+ * + * @param header + * @param loc + * @param readLen + * @param insertSize + * @param pileupSize + * @return + */ + public static ReadBackedPileup createReadBackedPileup(final SAMFileHeader header, final GenomeLoc loc, final int readLen, final int insertSize, final int pileupSize) { + final Random ran = new Random(); + final boolean leftIsFirst = true; + final boolean leftIsNegative = false; + final int insertSizeVariation = insertSize / 10; + final int pos = loc.getStart(); + + final List pileupElements = new ArrayList(); + for (int i = 0; i < pileupSize / 2; i++) { + final String readName = "read" + i; + final int leftStart = ranIntInclusive(ran, 1, pos); + final int fragmentSize = (int) (ran.nextGaussian() * insertSizeVariation + insertSize); + final int rightStart = leftStart + fragmentSize - readLen; + + if (rightStart <= 0) continue; + + List pair = createPair(header, readName, readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); + final GATKSAMRecord left = pair.get(0); + final GATKSAMRecord right = pair.get(1); + + pileupElements.add(LocusIteratorByState.createPileupForReadAndOffset(left, pos - leftStart)); + + if (pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) { + pileupElements.add(LocusIteratorByState.createPileupForReadAndOffset(right, pos - rightStart)); + } + } + + Collections.sort(pileupElements); + return new ReadBackedPileupImpl(loc, pileupElements); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialStingSAMFileWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialStingSAMFileWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialStingSAMFileWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialStingSAMFileWriter.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/CigarUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/CigarUtils.java new file mode 100644 index 000000000..70ce68a5b --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/CigarUtils.java @@ -0,0 +1,272 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit 
persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.sam; + +import com.google.java.contract.Ensures; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.TextCigarCodec; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.smithwaterman.Parameters; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.smithwaterman.SmithWaterman; + +import java.util.Arrays; +import java.util.Stack; + +/** + * Created with IntelliJ IDEA. + * User: ami + * Date: 11/26/13 + * Time: 11:33 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class CigarUtils { + + /** + * Combines equal adjacent elements of a Cigar object + * + * @param rawCigar the cigar object + * @return a combined cigar object + */ + public static Cigar combineAdjacentCigarElements(Cigar rawCigar) { + Cigar combinedCigar = new Cigar(); + CigarElement lastElement = null; + int lastElementLength = 0; + for (CigarElement cigarElement : rawCigar.getCigarElements()) { + if (lastElement != null && + ((lastElement.getOperator() == cigarElement.getOperator()) || + (lastElement.getOperator() == CigarOperator.I && cigarElement.getOperator() == CigarOperator.D) || + (lastElement.getOperator() == CigarOperator.D && cigarElement.getOperator() == CigarOperator.I))) + lastElementLength += cigarElement.getLength(); + else + { + if (lastElement != null) + combinedCigar.add(new CigarElement(lastElementLength, lastElement.getOperator())); + + lastElement = cigarElement; + lastElementLength = cigarElement.getLength(); + } + } + if (lastElement != null) + combinedCigar.add(new CigarElement(lastElementLength, lastElement.getOperator())); + + return combinedCigar; + } + + public static Cigar invertCigar (Cigar cigar) { + Stack cigarStack = new Stack(); + for (CigarElement cigarElement : cigar.getCigarElements()) + cigarStack.push(cigarElement); + + Cigar invertedCigar = new Cigar(); + while (!cigarStack.isEmpty()) + invertedCigar.add(cigarStack.pop()); + + return invertedCigar; + } + + /** + * Checks whether or not the read has any cigar element that is not H or S + * + * @param read the read + * @return true if it has any M, I or D, false otherwise + */ + public static boolean readHasNonClippedBases(GATKSAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) + if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) + return true; + return false; + } + + public static Cigar cigarFromString(String cigarString) { + return 
TextCigarCodec.getSingleton().decode(cigarString); + } + + + /** + * A valid cigar object obeys the following rules: + * - No Hard/Soft clips in the middle of the read + * - No deletions in the beginning / end of the read + * - No repeated adjacent element (e.g. 1M2M -> this should be 3M) + * - No consecutive I/D elements + **/ + public static boolean isCigarValid(Cigar cigar) { + if (cigar.isValid(null, -1) == null) { // This should take care of most invalid Cigar Strings (picard's "exhaustive" implementation) + + Stack cigarElementStack = new Stack(); // Stack to invert cigar string to find ending operator + CigarOperator startingOp = null; + CigarOperator endingOp = null; + + // check if it doesn't start with deletions + boolean readHasStarted = false; // search the list of elements for the starting operator + for (CigarElement cigarElement : cigar.getCigarElements()) { + if (!readHasStarted) { + if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) { + readHasStarted = true; + startingOp = cigarElement.getOperator(); + } + } + cigarElementStack.push(cigarElement); + } + + while (!cigarElementStack.empty()) { + CigarElement cigarElement = cigarElementStack.pop(); + if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) { + endingOp = cigarElement.getOperator(); + break; + } + } + + if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.SKIPPED_REGION && endingOp != CigarOperator.SKIPPED_REGION) + return true; // we don't accept reads starting or ending in deletions (add any other constraint here) + } + + return false; + } + + public static final int countRefBasesBasedOnCigar(final GATKSAMRecord read, final int cigarStartIndex, final int cigarEndIndex){ + int result = 0; + for(int i = cigarStartIndex; i 0 ) { + return true; +// throw new IllegalStateException("SW failure ref " + paddedRef 
+ " vs. " + paddedPath + " should always start at 0, but got " + alignment.getAlignmentStart2wrt1() + " with cigar " + alignment.getCigar()); + } + + // check that we aren't getting any S operators (which would be very bad downstream) + for ( final CigarElement ce : alignment.getCigar().getCigarElements() ) { + if ( ce.getOperator() == CigarOperator.S ) + return true; + // soft clips at the end of the alignment are really insertions +// throw new IllegalStateException("SW failure ref " + paddedRef + " vs. " + paddedPath + " should never contain S operators but got cigar " + alignment.getCigar()); + } + + return false; + } + + /** + * Left align the given cigar sequentially. This is needed because AlignmentUtils doesn't accept cigars with more than one indel in them. + * This is a target of future work to incorporate and generalize into AlignmentUtils for use by others. + * @param cigar the cigar to left align + * @param refSeq the reference byte array + * @param readSeq the read byte array + * @param refIndex 0-based alignment start position on ref + * @param readIndex 0-based alignment start position on read + * @return the left-aligned cigar + */ + @Ensures({"cigar != null", "refSeq != null", "readSeq != null", "refIndex >= 0", "readIndex >= 0"}) + public static Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { + final Cigar cigarToReturn = new Cigar(); + Cigar cigarToAlign = new Cigar(); + for (int i = 0; i < cigar.numCigarElements(); i++) { + final CigarElement ce = cigar.getCigarElement(i); + if (ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I) { + cigarToAlign.add(ce); + final Cigar leftAligned = AlignmentUtils.leftAlignSingleIndel(cigarToAlign, refSeq, readSeq, refIndex, readIndex, false); + for ( final CigarElement toAdd : leftAligned.getCigarElements() ) { cigarToReturn.add(toAdd); } + refIndex += cigarToAlign.getReferenceLength(); + readIndex += 
cigarToAlign.getReadLength(); + cigarToAlign = new Cigar(); + } else { + cigarToAlign.add(ce); + } + } + if( !cigarToAlign.isEmpty() ) { + for( final CigarElement toAdd : cigarToAlign.getCigarElements() ) { + cigarToReturn.add(toAdd); + } + } + + final Cigar result = AlignmentUtils.consolidateCigar(cigarToReturn); + if( result.getReferenceLength() != cigar.getReferenceLength() ) + throw new IllegalStateException("leftAlignCigarSequentially failed to produce a valid CIGAR. Reference lengths differ. Initial cigar " + cigar + " left aligned into " + result); + return result; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java new file mode 100644 index 000000000..52e6e1c25 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -0,0 +1,627 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of 
the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.sam; + +import com.google.java.contract.Ensures; +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.NGSPlatform; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.recalibration.EventType; + +import java.util.*; + +/** + * @author ebanks, depristo + * GATKSAMRecord + * + * this class extends the samtools BAMRecord class (and SAMRecord) and caches important + * (and oft-accessed) data that's not already cached by the SAMRecord class + * + * IMPORTANT NOTE: Because ReadGroups are not set through the SAMRecord, + * if they are ever modified externally then one must also invoke the + * setReadGroup() method here to ensure that the cache is kept up-to-date. + * + * WARNING -- GATKSAMRecords cache several values (that are expensive to compute) + * that depending on the inferred insert size and alignment starts and stops of this read and its mate. + * Changing these values in any way will invalidate the cached value. However, we do not monitor those setter + * functions, so modifying a GATKSAMRecord in any way may result in stale cached values. 
+ */ +public class GATKSAMRecord extends BAMRecord { + // Base Quality Score Recalibrator specific attribute tags + public static final String BQSR_BASE_INSERTION_QUALITIES = "BI"; // base qualities for insertions + public static final String BQSR_BASE_DELETION_QUALITIES = "BD"; // base qualities for deletions + + /** + * The default quality score for an insertion or deletion, if + * none are provided for this read. + */ + public static final byte DEFAULT_INSERTION_DELETION_QUAL = (byte)45; + + // the SAMRecord data we're caching + private String mReadString = null; + private GATKSAMReadGroupRecord mReadGroup = null; + private final static int UNINITIALIZED = -1; + private int softStart = UNINITIALIZED; + private int softEnd = UNINITIALIZED; + private Integer adapterBoundary = null; + + private boolean isStrandlessRead = false; + + // because some values can be null, we don't want to duplicate effort + private boolean retrievedReadGroup = false; + + // These temporary attributes were added here to make life easier for + // certain algorithms by providing a way to label or attach arbitrary data to + // individual GATKSAMRecords. + // These attributes exist in memory only, and are never written to disk. + private Map temporaryAttributes; + + /** + * HACK TO CREATE GATKSAMRECORD WITH ONLY A HEADER FOR TESTING PURPOSES ONLY + * @param header + */ + public GATKSAMRecord(final SAMFileHeader header) { + this(new SAMRecord(header)); + } + + /** + * HACK TO CREATE GATKSAMRECORD BASED ONLY A SAMRECORD FOR TESTING PURPOSES ONLY + * @param read + */ + public GATKSAMRecord(final SAMRecord read) { + super(read.getHeader(), read.getMateReferenceIndex(), + read.getAlignmentStart(), + read.getReadName() != null ? 
(short)read.getReadNameLength() : 0, + (short)read.getMappingQuality(), + 0, + read.getCigarLength(), + read.getFlags(), + read.getReadLength(), + read.getMateReferenceIndex(), + read.getMateAlignmentStart(), + read.getInferredInsertSize(), + null); + SAMReadGroupRecord samRG = read.getReadGroup(); + clearAttributes(); + if (samRG != null) { + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG); + setReadGroup(rg); + } + } + + public GATKSAMRecord(final SAMFileHeader header, + final int referenceSequenceIndex, + final int alignmentStart, + final short readNameLength, + final short mappingQuality, + final int indexingBin, + final int cigarLen, + final int flags, + final int readLen, + final int mateReferenceSequenceIndex, + final int mateAlignmentStart, + final int insertSize, + final byte[] variableLengthBlock) { + super(header, referenceSequenceIndex, alignmentStart, readNameLength, mappingQuality, indexingBin, cigarLen, + flags, readLen, mateReferenceSequenceIndex, mateAlignmentStart, insertSize, variableLengthBlock); + } + + public static GATKSAMRecord createRandomRead(int length) { + List cigarElements = new LinkedList<>(); + cigarElements.add(new CigarElement(length, CigarOperator.M)); + Cigar cigar = new Cigar(cigarElements); + return ArtificialSAMUtils.createArtificialRead(cigar); + } + + /////////////////////////////////////////////////////////////////////////////// + // *** support for reads without meaningful strand information ***// + /////////////////////////////////////////////////////////////////////////////// + + /** + * Does this read have a meaningful strandedness value? + * + * Some advanced types of reads, such as reads coming from merged fragments, + * don't have meaningful strandedness values, as they are composites of multiple + * other reads. Strandless reads need to be handled specially by code that cares about + * stranded information, such as FS. 
+ * + * @return true if this read doesn't have meaningful strand information + */ + public boolean isStrandless() { + return isStrandlessRead; + } + + /** + * Set the strandless state of this read to isStrandless + * @param isStrandless true if this read doesn't have a meaningful strandedness value + */ + public void setIsStrandless(final boolean isStrandless) { + this.isStrandlessRead = isStrandless; + } + + @Override + public boolean getReadNegativeStrandFlag() { + return ! isStrandless() && super.getReadNegativeStrandFlag(); + } + + @Override + public void setReadNegativeStrandFlag(final boolean flag) { + if ( isStrandless() ) + throw new IllegalStateException("Cannot set the strand of a strandless read"); + super.setReadNegativeStrandFlag(flag); + } + + + /////////////////////////////////////////////////////////////////////////////// + // *** The following methods are overloaded to cache the appropriate data ***// + /////////////////////////////////////////////////////////////////////////////// + + @Override + public String getReadString() { + if ( mReadString == null ) + mReadString = super.getReadString(); + return mReadString; + } + + @Override + public void setReadString(String s) { + super.setReadString(s); + mReadString = s; + } + + /** + * Get the GATKSAMReadGroupRecord of this read + * @return a non-null GATKSAMReadGroupRecord + */ + @Override + public GATKSAMReadGroupRecord getReadGroup() { + if ( ! 
retrievedReadGroup ) { + final SAMReadGroupRecord rg = super.getReadGroup(); + + // three cases: rg may be null (no rg, rg may already be a GATKSAMReadGroupRecord, or it may be + // a regular SAMReadGroupRecord in which case we have to make it a GATKSAMReadGroupRecord + if ( rg == null ) + mReadGroup = null; + else if ( rg instanceof GATKSAMReadGroupRecord ) + mReadGroup = (GATKSAMReadGroupRecord)rg; + else + mReadGroup = new GATKSAMReadGroupRecord(rg); + + retrievedReadGroup = true; + } + return mReadGroup; + } + + public void setReadGroup( final GATKSAMReadGroupRecord readGroup ) { + mReadGroup = readGroup; + retrievedReadGroup = true; + setAttribute("RG", mReadGroup.getId()); // todo -- this should be standardized, but we don't have access to SAMTagUtils! + } + + + @Override + public int hashCode() { + return super.hashCode(); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + + if (!(o instanceof GATKSAMRecord)) return false; + + // note that we do not consider the GATKSAMRecord internal state at all + return super.equals(o); + } + + /** + * Setters and Accessors for base insertion and base deletion quality scores + */ + public void setBaseQualities( final byte[] quals, final EventType errorModel ) { + switch( errorModel ) { + case BASE_SUBSTITUTION: + setBaseQualities(quals); + break; + case BASE_INSERTION: + setAttribute( GATKSAMRecord.BQSR_BASE_INSERTION_QUALITIES, quals == null ? null : SAMUtils.phredToFastq(quals) ); + break; + case BASE_DELETION: + setAttribute( GATKSAMRecord.BQSR_BASE_DELETION_QUALITIES, quals == null ? 
null : SAMUtils.phredToFastq(quals) ); + break; + default: + throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel ); + } + } + + public byte[] getBaseQualities( final EventType errorModel ) { + switch( errorModel ) { + case BASE_SUBSTITUTION: + return getBaseQualities(); + case BASE_INSERTION: + return getBaseInsertionQualities(); + case BASE_DELETION: + return getBaseDeletionQualities(); + default: + throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel ); + } + } + + /** + * @return whether or not this read has base insertion or deletion qualities (one of the two is sufficient to return true) + */ + public boolean hasBaseIndelQualities() { + return getAttribute( BQSR_BASE_INSERTION_QUALITIES ) != null || getAttribute( BQSR_BASE_DELETION_QUALITIES ) != null; + } + + /** + * @return the base insertion quality or null if read doesn't have one + */ + public byte[] getExistingBaseInsertionQualities() { + return SAMUtils.fastqToPhred( getStringAttribute(BQSR_BASE_INSERTION_QUALITIES)); + } + + /** + * @return the base deletion quality or null if read doesn't have one + */ + public byte[] getExistingBaseDeletionQualities() { + return SAMUtils.fastqToPhred( getStringAttribute(BQSR_BASE_DELETION_QUALITIES)); + } + + /** + * Default utility to query the base insertion quality of a read. If the read doesn't have one, it creates an array of default qualities (currently Q45) + * and assigns it to the read.
+ * + * @return the base insertion quality array + */ + public byte[] getBaseInsertionQualities() { + byte [] quals = getExistingBaseInsertionQualities(); + if( quals == null ) { + quals = new byte[getBaseQualities().length]; + Arrays.fill(quals, DEFAULT_INSERTION_DELETION_QUAL); // Some day in the future when base insertion and base deletion quals exist the samtools API will + // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 + } + return quals; + } + + /** + * Default utility to query the base deletion quality of a read. If the read doesn't have one, it creates an array of default qualities (currently Q45) + * and assigns it to the read. + * + * @return the base deletion quality array + */ + public byte[] getBaseDeletionQualities() { + byte[] quals = getExistingBaseDeletionQualities(); + if( quals == null ) { + quals = new byte[getBaseQualities().length]; + Arrays.fill(quals, DEFAULT_INSERTION_DELETION_QUAL); // Some day in the future when base insertion and base deletion quals exist the samtools API will + // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 + } + return quals; + } + + /** + * Efficient caching accessor that returns the GATK NGSPlatform of this read + * @return + */ + public NGSPlatform getNGSPlatform() { + return getReadGroup().getNGSPlatform(); + } + + /////////////////////////////////////////////////////////////////////////////// + // *** GATKSAMRecord specific methods ***// + /////////////////////////////////////////////////////////////////////////////// + + /** + * Checks whether an attribute has been set for the given key. + * + * Temporary attributes provide a way to label or attach arbitrary data to + * individual GATKSAMRecords. These attributes exist in memory only, + * and are never written to disk. + * + * @param key key + * @return True if an attribute has been set for this key. 
+ */ + public boolean containsTemporaryAttribute(Object key) { + return temporaryAttributes != null && temporaryAttributes.containsKey(key); + } + + /** + * Sets the key to the given value, replacing any previous value. The previous + * value is returned. + * + * Temporary attributes provide a way to label or attach arbitrary data to + * individual GATKSAMRecords. These attributes exist in memory only, + * and are never written to disk. + * + * @param key key + * @param value value + * @return attribute + */ + public Object setTemporaryAttribute(Object key, Object value) { + if(temporaryAttributes == null) { + temporaryAttributes = new HashMap<>(); + } + return temporaryAttributes.put(key, value); + } + + /** + * Looks up the value associated with the given key. + * + * Temporary attributes provide a way to label or attach arbitrary data to + * individual GATKSAMRecords. These attributes exist in memory only, + * and are never written to disk. + * + * @param key key + * @return The value, or null. + */ + public Object getTemporaryAttribute(Object key) { + if(temporaryAttributes != null) { + return temporaryAttributes.get(key); + } + return null; + } + + /** + * Checks whether if the read has any bases. + * + * Empty reads can be dangerous as it may have no cigar strings, no read names and + * other missing attributes. + * + * @return true if the read has no bases + */ + public boolean isEmpty() { + return super.getReadBases() == null || super.getReadLength() == 0; + } + + /** + * Clears all attributes except ReadGroup of the read. + */ + public GATKSAMRecord simplify () { + GATKSAMReadGroupRecord rg = getReadGroup(); // save the read group information + byte[] insQuals = (this.getAttribute(BQSR_BASE_INSERTION_QUALITIES) == null) ? null : getBaseInsertionQualities(); + byte[] delQuals = (this.getAttribute(BQSR_BASE_DELETION_QUALITIES) == null) ? 
null : getBaseDeletionQualities(); + this.clearAttributes(); // clear all attributes from the read + this.setReadGroup(rg); // restore read group + if (insQuals != null) + this.setBaseQualities(insQuals, EventType.BASE_INSERTION); // restore base insertion if we had any + if (delQuals != null) + this.setBaseQualities(delQuals, EventType.BASE_DELETION); // restore base deletion if we had any + return this; + } + + /** + * Calculates the reference coordinate for the beginning of the read taking into account soft clips but not hard clips. + * + * Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips. + * + * @return the unclipped start of the read taking soft clips (but not hard clips) into account + */ + public int getSoftStart() { + if ( softStart == UNINITIALIZED ) { + softStart = getAlignmentStart(); + for (final CigarElement cig : getCigar().getCigarElements()) { + final CigarOperator op = cig.getOperator(); + + if (op == CigarOperator.SOFT_CLIP) + softStart -= cig.getLength(); + else if (op != CigarOperator.HARD_CLIP) + break; + } + } + return softStart; + } + + /** + * Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips. + * + * Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips. 
 + * + * @return the unclipped end of the read taking soft clips (but not hard clips) into account + */ + public int getSoftEnd() { + if ( softEnd == UNINITIALIZED ) { + boolean foundAlignedBase = false; + softEnd = getAlignmentEnd(); + final List cigs = getCigar().getCigarElements(); + for (int i = cigs.size() - 1; i >= 0; --i) { + final CigarElement cig = cigs.get(i); + final CigarOperator op = cig.getOperator(); + + if (op == CigarOperator.SOFT_CLIP) // assumes the soft clip that we found is at the end of the aligned read + softEnd += cig.getLength(); + else if (op != CigarOperator.HARD_CLIP) { + foundAlignedBase = true; + break; + } + } + if( !foundAlignedBase ) { // for example 64H14S, the soft end is actually the same as the alignment end + softEnd = getAlignmentEnd(); + } + } + + return softEnd; + } + + /** + * If the read is hard clipped, the soft start and end will change. You can set manually or just reset the cache + * so that the next call to getSoftStart/End will recalculate it lazily. + */ + public void resetSoftStartAndEnd() { + softStart = -1; + softEnd = -1; + } + + /** + * If the read is hard clipped, the soft start and end will change. You can set manually or just reset the cache + * so that the next call to getSoftStart/End will recalculate it lazily. + */ + public void resetSoftStartAndEnd(int softStart, int softEnd) { + this.softStart = softStart; + this.softEnd = softEnd; + } + + /** + * Determines the original alignment start of a previously clipped read. + * + * This is useful for reads that have been trimmed to a variant region and lost the information of its original alignment start + * + * @return the alignment start of a read before it was clipped + */ + public int getOriginalAlignmentStart() { + return getUnclippedStart(); + } + + /** + * Determines the original alignment end of a previously clipped read.
+ * + * This is useful for reads that have been trimmed to a variant region and lost the information of it's original alignment end + * + * @return the alignment end of a read before it was clipped + */ + public int getOriginalAlignmentEnd() { + return getUnclippedEnd(); + } + + /** + * Creates an empty GATKSAMRecord with the read's header, read group and mate + * information, but empty (not-null) fields: + * - Cigar String + * - Read Bases + * - Base Qualities + * + * Use this method if you want to create a new empty GATKSAMRecord based on + * another GATKSAMRecord + * + * @param read a read to copy the header from + * @return a read with no bases but safe for the GATK + */ + public static GATKSAMRecord emptyRead(GATKSAMRecord read) { + GATKSAMRecord emptyRead = new GATKSAMRecord(read.getHeader(), + read.getReferenceIndex(), + 0, + (short) 0, + (short) 0, + 0, + 0, + read.getFlags(), + 0, + read.getMateReferenceIndex(), + read.getMateAlignmentStart(), + read.getInferredInsertSize(), + null); + + emptyRead.setCigarString(""); + emptyRead.setReadBases(new byte[0]); + emptyRead.setBaseQualities(new byte[0]); + + SAMReadGroupRecord samRG = read.getReadGroup(); + emptyRead.clearAttributes(); + if (samRG != null) { + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG); + emptyRead.setReadGroup(rg); + } + + return emptyRead; + } + + /** + * Creates a new GATKSAMRecord with the source read's header, read group and mate + * information, but with the following fields set to user-supplied values: + * - Read Bases + * - Base Qualities + * - Base Insertion Qualities + * - Base Deletion Qualities + * + * Cigar string is empty (not-null) + * + * Use this method if you want to create a new GATKSAMRecord based on + * another GATKSAMRecord, but with modified bases and qualities + * + * @param read a read to copy the header from + * @param readBases an array containing the new bases you wish use in place of the originals + * @param baseQualities an array containing the new 
base qualities you wish use in place of the originals + * @param baseInsertionQualities an array containing the new base insertion qualities + * @param baseDeletionQualities an array containing the new base deletion qualities + * @return a read with modified bases and qualities, safe for the GATK + */ + public static GATKSAMRecord createQualityModifiedRead(final GATKSAMRecord read, + final byte[] readBases, + final byte[] baseQualities, + final byte[] baseInsertionQualities, + final byte[] baseDeletionQualities) { + if ( baseQualities.length != readBases.length || baseInsertionQualities.length != readBases.length || baseDeletionQualities.length != readBases.length ) + throw new IllegalArgumentException("Read bases and read quality arrays aren't the same size: Bases:" + readBases.length + + " vs Base Q's:" + baseQualities.length + + " vs Insert Q's:" + baseInsertionQualities.length + + " vs Delete Q's:" + baseDeletionQualities.length); + + final GATKSAMRecord processedRead = GATKSAMRecord.emptyRead(read); + processedRead.setReadBases(readBases); + processedRead.setBaseQualities(baseQualities, EventType.BASE_SUBSTITUTION); + processedRead.setBaseQualities(baseInsertionQualities, EventType.BASE_INSERTION); + processedRead.setBaseQualities(baseDeletionQualities, EventType.BASE_DELETION); + + return processedRead; + } + + /** + * Shallow copy of everything, except for the attribute list and the temporary attributes. + * A new list of the attributes is created for both, but the attributes themselves are copied by reference. + * This should be safe because callers should never modify a mutable value returned by any of the get() methods anyway.
+ * + * @return a shallow copy of the GATKSAMRecord + * @throws CloneNotSupportedException + */ + @Override + public Object clone() throws CloneNotSupportedException { + final GATKSAMRecord clone = (GATKSAMRecord) super.clone(); + if (temporaryAttributes != null) { + clone.temporaryAttributes = new HashMap<>(); + for (Object attribute : temporaryAttributes.keySet()) + clone.setTemporaryAttribute(attribute, temporaryAttributes.get(attribute)); + } + return clone; + } + + /** + * A caching version of ReadUtils.getAdaptorBoundary() + * + * see #ReadUtils.getAdaptorBoundary(SAMRecord) for more information about the meaning of this function + * + * WARNING -- this function caches a value depending on the inferred insert size and alignment starts + * and stops of this read and its mate. Changing these values in any way will invalidate the cached value. + * However, we do not monitor those setter functions, so modifying a GATKSAMRecord in any way may + * result in stale cached values. + * + * @return the result of calling ReadUtils.getAdaptorBoundary on this read + */ + @Ensures("result == ReadUtils.getAdaptorBoundary(this)") + public int getAdaptorBoundary() { + if ( adapterBoundary == null ) + adapterBoundary = ReadUtils.getAdaptorBoundary(this); + return adapterBoundary; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSamRecordFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSamRecordFactory.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/GATKSamRecordFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSamRecordFactory.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityReadTransformer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityReadTransformer.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityReadTransformer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityReadTransformer.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUnclippedStartWithNoTiesComparator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ReadUnclippedStartWithNoTiesComparator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ReadUnclippedStartWithNoTiesComparator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ReadUnclippedStartWithNoTiesComparator.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ReadUtils.java new file mode 100644 index 000000000..2b6654bcd --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -0,0 +1,966 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this 
permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.sam; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.*; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.File; +import java.util.*; + +/** + * A miscellaneous collection of utilities for working with SAM files, headers, etc. + * Static methods only, please. 
+ * + * @author mhanna + * @version 0.1 + */ +public class ReadUtils { + private final static Logger logger = Logger.getLogger(ReadUtils.class); + + private static final String OFFSET_OUT_OF_BOUNDS_EXCEPTION = "Offset cannot be greater than read length %d : %d"; + private static final String OFFSET_NOT_ZERO_EXCEPTION = "We ran past the end of the read and never found the offset, something went wrong!"; + + private ReadUtils() { + } + + private static final int DEFAULT_ADAPTOR_SIZE = 100; + public static final int CLIPPING_GOAL_NOT_REACHED = -1; + + /** + * A marker to tell which end of the read has been clipped + */ + public enum ClippingTail { + LEFT_TAIL, + RIGHT_TAIL + } + + /** + * A HashMap of the SAM spec read flag names + * + * Note: This is not being used right now, but can be useful in the future + */ + private static final Map readFlagNames = new HashMap(); + + static { + readFlagNames.put(0x1, "Paired"); + readFlagNames.put(0x2, "Proper"); + readFlagNames.put(0x4, "Unmapped"); + readFlagNames.put(0x8, "MateUnmapped"); + readFlagNames.put(0x10, "Forward"); + //readFlagNames.put(0x20, "MateForward"); + readFlagNames.put(0x40, "FirstOfPair"); + readFlagNames.put(0x80, "SecondOfPair"); + readFlagNames.put(0x100, "NotPrimary"); + readFlagNames.put(0x200, "NON-PF"); + readFlagNames.put(0x400, "Duplicate"); + } + + /** + * This enum represents all the different ways in which a read can overlap an interval. + * + * NO_OVERLAP_CONTIG: + * read and interval are in different contigs. + * + * NO_OVERLAP_LEFT: + * the read does not overlap the interval. + * + * |----------------| (interval) + * <----------------> (read) + * + * NO_OVERLAP_RIGHT: + * the read does not overlap the interval. 
+ * + * |----------------| (interval) + * <----------------> (read) + * + * OVERLAP_LEFT: + * the read starts before the beginning of the interval but ends inside of it + * + * |----------------| (interval) + * <----------------> (read) + * + * OVERLAP_RIGHT: + * the read starts inside the interval but ends outside of it + * + * |----------------| (interval) + * <----------------> (read) + * + * OVERLAP_LEFT_AND_RIGHT: + * the read starts before the interval and ends after the interval + * + * |-----------| (interval) + * <-------------------> (read) + * + * OVERLAP_CONTAINED: + * the read starts and ends inside the interval + * + * |----------------| (interval) + * <--------> (read) + */ + public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED} + + /** + * Creates a SAMFileWriter with the given compression level if you request a bam file. Creates a regular + * SAMFileWriter without compression otherwise. + * + * @param header + * @param presorted + * @param file + * @param compression + * @return a SAMFileWriter with the compression level if it is a bam. 
 + */ + public static SAMFileWriter createSAMFileWriterWithCompression(SAMFileHeader header, boolean presorted, String file, int compression) { + validateCompressionLevel(compression); + if (file.endsWith(".bam")) + return new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(header, presorted, new File(file), compression); + return new SAMFileWriterFactory().setCreateIndex(true).makeSAMOrBAMWriter(header, presorted, new File(file)); + } + + public static int validateCompressionLevel(final int requestedCompressionLevel) { + if ( requestedCompressionLevel < 0 || requestedCompressionLevel > 9 ) + throw new UserException.BadArgumentValue("compress", "Compression level must be 0-9 but got " + requestedCompressionLevel); + return requestedCompressionLevel; + } + + /** + * is this base inside the adaptor of the read? + * + * There are two cases to treat here: + * + * 1) Read is in the negative strand => Adaptor boundary is on the left tail + * 2) Read is in the positive strand => Adaptor boundary is on the right tail + * + * Note: We return false to all reads that are UNMAPPED or have a weird big insert size (probably due to mismapping or bigger event) + * + * @param read the read to test + * @param basePos base position in REFERENCE coordinates (not read coordinates) + * @return whether or not the base is in the adaptor + */ + public static boolean isBaseInsideAdaptor(final GATKSAMRecord read, long basePos) { + final int adaptorBoundary = read.getAdaptorBoundary(); + if (adaptorBoundary == CANNOT_COMPUTE_ADAPTOR_BOUNDARY || read.getInferredInsertSize() > DEFAULT_ADAPTOR_SIZE) + return false; + + return read.getReadNegativeStrandFlag() ? basePos <= adaptorBoundary : basePos >= adaptorBoundary; + } + + /** + * Finds the adaptor boundary around the read and returns the first base inside the adaptor that is closest to + * the read boundary.
If the read is in the positive strand, this is the first base after the end of the + * fragment (Picard calls it 'insert'), if the read is in the negative strand, this is the first base before the + * beginning of the fragment. + * + * There are two cases we need to treat here: + * + * 1) Our read is in the reverse strand : + * + * <----------------------| * + * |---------------------> + * + * in these cases, the adaptor boundary is at the mate start (minus one) + * + * 2) Our read is in the forward strand : + * + * |----------------------> * + * <----------------------| + * + * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) + * + * @param read the read being tested for the adaptor boundary + * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. + * CANNOT_COMPUTE_ADAPTOR_BOUNDARY if the read is unmapped or the mate is mapped to another contig. + */ + public static int getAdaptorBoundary(final SAMRecord read) { + if ( ! hasWellDefinedFragmentSize(read) ) { + return CANNOT_COMPUTE_ADAPTOR_BOUNDARY; + } else if ( read.getReadNegativeStrandFlag() ) { + return read.getMateAlignmentStart() - 1; // case 1 (see header) + } else { + final int insertSize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value) + return read.getAlignmentStart() + insertSize + 1; // case 2 (see header) + } + } + + public static int CANNOT_COMPUTE_ADAPTOR_BOUNDARY = Integer.MIN_VALUE; + + /** + * Can the adaptor sequence of read be reliably removed from the read based on the alignment of + * read and its mate? 
+ * + * @param read the read to check + * @return true if it can, false otherwise + */ + public static boolean hasWellDefinedFragmentSize(final SAMRecord read) { + if ( read.getInferredInsertSize() == 0 ) + // no adaptors in reads with mates in another chromosome or unmapped pairs + return false; + if ( ! read.getReadPairedFlag() ) + // only reads that are paired can be adaptor trimmed + return false; + if ( read.getReadUnmappedFlag() || read.getMateUnmappedFlag() ) + // only reads when both reads are mapped can be trimmed + return false; +// if ( ! read.getProperPairFlag() ) +// // note this flag isn't always set properly in BAMs, can will stop us from eliminating some proper pairs +// // reads that aren't part of a proper pair (i.e., have strange alignments) can't be trimmed +// return false; + if ( read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag() ) + // sanity check on getProperPairFlag to ensure that read1 and read2 aren't on the same strand + return false; + + if ( read.getReadNegativeStrandFlag() ) { + // we're on the negative strand, so our read runs right to left + return read.getAlignmentEnd() > read.getMateAlignmentStart(); + } else { + // we're on the positive strand, so our mate should be to our right (his start + insert size should be past our start) + return read.getAlignmentStart() <= read.getMateAlignmentStart() + read.getInferredInsertSize(); + } + } + + /** + * is the read a 454 read? + * + * @param read the read to test + * @return checks the read group tag PL for the default 454 tag + */ + public static boolean is454Read(GATKSAMRecord read) { + return NGSPlatform.fromRead(read) == NGSPlatform.LS454; + } + + /** + * is the read an IonTorrent read? + * + * @param read the read to test + * @return checks the read group tag PL for the default ion tag + */ + public static boolean isIonRead(GATKSAMRecord read) { + return NGSPlatform.fromRead(read) == NGSPlatform.ION_TORRENT; + } + + /** + * is the read a SOLiD read? 
+ * + * @param read the read to test + * @return checks the read group tag PL for the default SOLiD tag + */ + public static boolean isSOLiDRead(GATKSAMRecord read) { + return NGSPlatform.fromRead(read) == NGSPlatform.SOLID; + } + + /** + * is the read a SLX read? + * + * @param read the read to test + * @return checks the read group tag PL for the default SLX tag + */ + public static boolean isIlluminaRead(GATKSAMRecord read) { + return NGSPlatform.fromRead(read) == NGSPlatform.ILLUMINA; + } + + /** + * checks if the read has a platform tag in the readgroup equal to 'name'. + * Assumes that 'name' is upper-cased. + * + * @param read the read to test + * @param name the upper-cased platform name to test + * @return whether or not name == PL tag in the read group of read + */ + public static boolean isPlatformRead(GATKSAMRecord read, String name) { + + SAMReadGroupRecord readGroup = read.getReadGroup(); + if (readGroup != null) { + Object readPlatformAttr = readGroup.getAttribute("PL"); + if (readPlatformAttr != null) + return readPlatformAttr.toString().toUpperCase().contains(name); + } + return false; + } + + + /** + * Returns the collections of reads sorted in coordinate order, according to the order defined + * in the reads themselves + * + * @param reads + * @return + */ + public final static List sortReadsByCoordinate(List reads) { + final SAMRecordComparator comparer = new SAMRecordCoordinateComparator(); + Collections.sort(reads, comparer); + return reads; + } + + /** + * If a read starts in INSERTION, returns the first element length. + * + * Warning: If the read has Hard or Soft clips before the insertion this function will return 0. + * + * @param read + * @return the length of the first insertion, or 0 if there is none (see warning). 
+ */ + public final static int getFirstInsertionOffset(SAMRecord read) { + CigarElement e = read.getCigar().getCigarElement(0); + if ( e.getOperator() == CigarOperator.I ) + return e.getLength(); + else + return 0; + } + + /** + * If a read ends in INSERTION, returns the last element length. + * + * Warning: If the read has Hard or Soft clips after the insertion this function will return 0. + * + * @param read + * @return the length of the last insertion, or 0 if there is none (see warning). + */ + public final static int getLastInsertionOffset(SAMRecord read) { + CigarElement e = read.getCigar().getCigarElement(read.getCigarLength() - 1); + if ( e.getOperator() == CigarOperator.I ) + return e.getLength(); + else + return 0; + } + + /** + * Determines what is the position of the read in relation to the interval. + * Note: This function uses the UNCLIPPED ENDS of the reads for the comparison. + * @param read the read + * @param interval the interval + * @return the overlap type as described by ReadAndIntervalOverlap enum (see above) + */ + public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(GATKSAMRecord read, GenomeLoc interval) { + + int sStart = read.getSoftStart(); + int sStop = read.getSoftEnd(); + int uStart = read.getUnclippedStart(); + int uStop = read.getUnclippedEnd(); + + if ( !read.getReferenceName().equals(interval.getContig()) ) + return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG; + + else if ( uStop < interval.getStart() ) + return ReadAndIntervalOverlap.NO_OVERLAP_LEFT; + + else if ( uStart > interval.getStop() ) + return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT; + + else if ( sStop < interval.getStart() ) + return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT; + + else if ( sStart > interval.getStop() ) + return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT; + + else if ( (sStart >= interval.getStart()) && + (sStop <= interval.getStop()) ) + return ReadAndIntervalOverlap.OVERLAP_CONTAINED; + + else if ( (sStart < 
 interval.getStart()) &&
+                (sStop > interval.getStop()) )
+            return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT;
+
+        else if ( (sStart < interval.getStart()) )
+            return ReadAndIntervalOverlap.OVERLAP_LEFT;
+
+        else
+            return ReadAndIntervalOverlap.OVERLAP_RIGHT;
+    }
+
+    /**
+     * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) to take care of
+     * two corner cases:
+     *
+     * 1. If clipping the right tail (end of the read) and the requested reference coordinate falls inside
+     * a deletion, getReadCoordinateForReferenceCoordinate returns the base after the deletion. If clipping
+     * the left tail (beginning of the read) it doesn't matter, because it already returns the previous base
+     * by default.
+     *
+     * 2. If clipping the left tail (beginning of the read), the read starts with an insertion, and you're
+     * requesting the first read-based coordinate, getReadCoordinateForReferenceCoordinate will skip the
+     * leading insertion (because it has the same reference coordinate as the following base).
+     *
+     * @param read the read to clip
+     * @param refCoord the requested reference coordinate
+     * @param tail which tail of the read is being clipped (LEFT_TAIL or RIGHT_TAIL)
+     * @return the read coordinate corresponding to the requested reference coordinate for clipping. 
+ */ + @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd() || (read.getUnclippedEnd() < read.getUnclippedStart())"}) + @Ensures({"result >= 0", "result < read.getReadLength()"}) + public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord, ClippingTail tail) { + return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, tail, false); + } + + public static int getReadCoordinateForReferenceCoordinateUpToEndOfRead(GATKSAMRecord read, int refCoord, ClippingTail tail) { + final int leftmostSafeVariantPosition = Math.max(read.getSoftStart(), refCoord); + return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), leftmostSafeVariantPosition, tail, false); + } + + public static int getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final ClippingTail tail, final boolean allowGoalNotReached) { + Pair result = getReadCoordinateForReferenceCoordinate(alignmentStart, cigar, refCoord, allowGoalNotReached); + int readCoord = result.getFirst(); + + // Corner case one: clipping the right tail and falls on deletion, move to the next + // read coordinate. It is not a problem for the left tail because the default answer + // from getReadCoordinateForReferenceCoordinate is to give the previous read coordinate. + if (result.getSecond() && tail == ClippingTail.RIGHT_TAIL) + readCoord++; + + // clipping the left tail and first base is insertion, go to the next read coordinate + // with the same reference coordinate. Advance to the next cigar element, or to the + // end of the read if there is no next element. 
+ final CigarElement firstElementIsInsertion = readStartsWithInsertion(cigar); + if (readCoord == 0 && tail == ClippingTail.LEFT_TAIL && firstElementIsInsertion != null) + readCoord = Math.min(firstElementIsInsertion.getLength(), cigar.getReadLength() - 1); + + return readCoord; + } + + /** + * Returns the read coordinate corresponding to the requested reference coordinate. + * + * WARNING: if the requested reference coordinate happens to fall inside or just before a deletion (or skipped region) in the read, this function + * will return the last read base before the deletion (or skipped region). This function returns a + * Pair(int readCoord, boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion) so you can choose which readCoordinate to use when faced with + * a deletion (or skipped region). + * + * SUGGESTION: Use getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int, ClippingTail) instead to get a + * pre-processed result according to normal clipping needs. Or you can use this function and tailor the + * behavior to your needs. + * + * @param read + * @param refCoord the requested reference coordinate + * @return the read coordinate corresponding to the requested reference coordinate. (see warning!) + */ + @Requires({"refCoord >= read.getSoftStart()", "refCoord <= read.getSoftEnd()"}) + @Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"}) + //TODO since we do not have contracts any more, should we check for the requirements in the method code? 
+ public static Pair getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord) { + return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, false); + } + + public static Pair getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final boolean allowGoalNotReached) { + int readBases = 0; + int refBases = 0; + boolean fallsInsideDeletionOrSkippedRegion = false; + boolean endJustBeforeDeletionOrSkippedRegion = false; + boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion = false; + + final int goal = refCoord - alignmentStart; // The goal is to move this many reference bases + if (goal < 0) { + if (allowGoalNotReached) { + return new Pair(CLIPPING_GOAL_NOT_REACHED, false); + } else { + throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); + } + } + boolean goalReached = refBases == goal; + + Iterator cigarElementIterator = cigar.getCigarElements().iterator(); + while (!goalReached && cigarElementIterator.hasNext()) { + final CigarElement cigarElement = cigarElementIterator.next(); + int shift = 0; + + if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { + if (refBases + cigarElement.getLength() < goal) + shift = cigarElement.getLength(); + else + shift = goal - refBases; + + refBases += shift; + } + goalReached = refBases == goal; + + if (!goalReached && cigarElement.getOperator().consumesReadBases()) + readBases += cigarElement.getLength(); + + if (goalReached) { + // Is this base's reference position within this cigar element? Or did we use it all? + final boolean endsWithinCigar = shift < cigarElement.getLength(); + + // If it isn't, we need to check the next one. There should *ALWAYS* be a next one + // since we checked if the goal coordinate is within the read length, so this is just a sanity check. 
+ if (!endsWithinCigar && !cigarElementIterator.hasNext()) { + if (allowGoalNotReached) { + return new Pair(CLIPPING_GOAL_NOT_REACHED, false); + } else { + throw new ReviewedStingException(String.format("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- check read with alignment start: %s and cigar: %s", alignmentStart, cigar)); + } + } + + CigarElement nextCigarElement = null; + + // if we end inside the current cigar element, we just have to check if it is a deletion (or skipped region) + if (endsWithinCigar) + fallsInsideDeletionOrSkippedRegion = (cigarElement.getOperator() == CigarOperator.DELETION || cigarElement.getOperator() == CigarOperator.SKIPPED_REGION) ; + + // if we end outside the current cigar element, we need to check if the next element is an insertion, deletion or skipped region. + else { + nextCigarElement = cigarElementIterator.next(); + + // if it's an insertion, we need to clip the whole insertion before looking at the next element + if (nextCigarElement.getOperator() == CigarOperator.INSERTION) { + readBases += nextCigarElement.getLength(); + if (!cigarElementIterator.hasNext()) { + if (allowGoalNotReached) { + return new Pair(CLIPPING_GOAL_NOT_REACHED, false); + } else { + throw new ReviewedStingException(String.format("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- check read with alignment start: %s and cigar: %s", alignmentStart, cigar)); + } + } + + nextCigarElement = cigarElementIterator.next(); + } + + // if it's a deletion (or skipped region), we will pass the information on to be handled downstream. 
+ endJustBeforeDeletionOrSkippedRegion = (nextCigarElement.getOperator() == CigarOperator.DELETION || nextCigarElement.getOperator() == CigarOperator.SKIPPED_REGION); + } + + fallsInsideOrJustBeforeDeletionOrSkippedRegion = endJustBeforeDeletionOrSkippedRegion || fallsInsideDeletionOrSkippedRegion; + + // If we reached our goal outside a deletion (or skipped region), add the shift + if (!fallsInsideOrJustBeforeDeletionOrSkippedRegion && cigarElement.getOperator().consumesReadBases()) + readBases += shift; + + // If we reached our goal just before a deletion (or skipped region) we need + // to add the shift of the current cigar element but go back to it's last element to return the last + // base before the deletion (or skipped region) (see warning in function contracts) + else if (endJustBeforeDeletionOrSkippedRegion && cigarElement.getOperator().consumesReadBases()) + readBases += shift - 1; + + // If we reached our goal inside a deletion (or skipped region), or just between a deletion and a skipped region, + // then we must backtrack to the last base before the deletion (or skipped region) + else if (fallsInsideDeletionOrSkippedRegion || + (endJustBeforeDeletionOrSkippedRegion && nextCigarElement.getOperator().equals(CigarOperator.N)) || + (endJustBeforeDeletionOrSkippedRegion && nextCigarElement.getOperator().equals(CigarOperator.D))) + readBases--; + } + } + + if (!goalReached) { + if (allowGoalNotReached) { + return new Pair(CLIPPING_GOAL_NOT_REACHED, false); + } else { + throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Alignment " + alignmentStart + " | " + cigar); + } + } + + return new Pair(readBases, fallsInsideOrJustBeforeDeletionOrSkippedRegion); + } + + /** + * Compares two SAMRecords only the basis on alignment start. Note that + * comparisons are performed ONLY on the basis of alignment start; any + * two SAM records with the same alignment start will be considered equal. 
+ * + * Unmapped alignments will all be considered equal. + */ + + @Requires({"read1 != null", "read2 != null"}) + public static int compareSAMRecords(GATKSAMRecord read1, GATKSAMRecord read2) { + AlignmentStartComparator comp = new AlignmentStartComparator(); + return comp.compare(read1, read2); + } + + /** + * Is a base inside a read? + * + * @param read the read to evaluate + * @param referenceCoordinate the reference coordinate of the base to test + * @return true if it is inside the read, false otherwise. + */ + public static boolean isInsideRead(final GATKSAMRecord read, final int referenceCoordinate) { + return referenceCoordinate >= read.getAlignmentStart() && referenceCoordinate <= read.getAlignmentEnd(); + } + + /** + * Is this read all insertion? + * + * @param read + * @return whether or not the only element in the cigar string is an Insertion + */ + public static boolean readIsEntirelyInsertion(GATKSAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() != CigarOperator.INSERTION) + return false; + } + return true; + } + + /** + * @see #readStartsWithInsertion(net.sf.samtools.Cigar, boolean) with ignoreClipOps set to true + */ + public static CigarElement readStartsWithInsertion(final Cigar cigarForRead) { + return readStartsWithInsertion(cigarForRead, true); + } + + /** + * Checks if a read starts with an insertion. + * + * @param cigarForRead the CIGAR to evaluate + * @param ignoreSoftClipOps should we ignore S operators when evaluating whether an I operator is at the beginning? Note that H operators are always ignored. 
+ * @return the element if it's a leading insertion or null otherwise + */ + public static CigarElement readStartsWithInsertion(final Cigar cigarForRead, final boolean ignoreSoftClipOps) { + for ( final CigarElement cigarElement : cigarForRead.getCigarElements() ) { + if ( cigarElement.getOperator() == CigarOperator.INSERTION ) + return cigarElement; + + else if ( cigarElement.getOperator() != CigarOperator.HARD_CLIP && ( !ignoreSoftClipOps || cigarElement.getOperator() != CigarOperator.SOFT_CLIP) ) + break; + } + return null; + } + + /** + * Returns the coverage distribution of a list of reads within the desired region. + * + * See getCoverageDistributionOfRead for information on how the coverage is calculated. + * + * @param list the list of reads covering the region + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return an array with the coverage of each position from startLocation to stopLocation + */ + public static int [] getCoverageDistributionOfReads(List list, int startLocation, int stopLocation) { + int [] totalCoverage = new int[stopLocation - startLocation + 1]; + + for (GATKSAMRecord read : list) { + int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); + totalCoverage = MathUtils.addArrays(totalCoverage, readCoverage); + } + + return totalCoverage; + } + + /** + * Returns the coverage distribution of a single read within the desired region. 
+ * + * Note: This function counts DELETIONS as coverage (since the main purpose is to downsample + * reads for variant regions, and deletions count as variants) + * + * @param read the read to get the coverage distribution of + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return an array with the coverage of each position from startLocation to stopLocation + */ + public static int [] getCoverageDistributionOfRead(GATKSAMRecord read, int startLocation, int stopLocation) { + int [] coverage = new int[stopLocation - startLocation + 1]; + int refLocation = read.getSoftStart(); + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + switch (cigarElement.getOperator()) { + case S: + case M: + case EQ: + case N: + case X: + case D: + for (int i = 0; i < cigarElement.getLength(); i++) { + if (refLocation >= startLocation && refLocation <= stopLocation) { + coverage[refLocation - startLocation]++; + } + refLocation++; + } + break; + + case P: + case I: + case H: + break; + } + + if (refLocation > stopLocation) + break; + } + return coverage; + } + + /** + * Makes association maps for the reads and loci coverage as described below : + * + * - First: locusToReadMap -- a HashMap that describes for each locus, which reads contribute to its coverage. + * Note: Locus is in reference coordinates. + * Example: Locus => {read1, read2, ..., readN} + * + * - Second: readToLocusMap -- a HashMap that describes for each read what loci it contributes to the coverage. + * Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with value==true meaning it contributes to the coverage. + * Example: Read => {true, true, false, ... 
false} + * + * @param readList the list of reads to generate the association mappings + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return the two hashmaps described above + */ + public static Pair> , HashMap> getBothReadToLociMappings (List readList, int startLocation, int stopLocation) { + int arraySize = stopLocation - startLocation + 1; + + HashMap> locusToReadMap = new HashMap>(2*(stopLocation - startLocation + 1), 0.5f); + HashMap readToLocusMap = new HashMap(2*readList.size(), 0.5f); + + for (int i = startLocation; i <= stopLocation; i++) + locusToReadMap.put(i, new HashSet()); // Initialize the locusToRead map with empty lists + + for (GATKSAMRecord read : readList) { + readToLocusMap.put(read, new Boolean[arraySize]); // Initialize the readToLocus map with empty arrays + + int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); + + for (int i = 0; i < readCoverage.length; i++) { + int refLocation = i + startLocation; + if (readCoverage[i] > 0) { + // Update the hash for this locus + HashSet readSet = locusToReadMap.get(refLocation); + readSet.add(read); + + // Add this locus to the read hash + readToLocusMap.get(read)[refLocation - startLocation] = true; + } + else + // Update the boolean array with a 'no coverage' from this read to this locus + readToLocusMap.get(read)[refLocation-startLocation] = false; + } + } + return new Pair>, HashMap>(locusToReadMap, readToLocusMap); + } + + /** + * Create random read qualities + * + * @param length the length of the read + * @return an array with randomized base qualities between 0 and 50 + */ + public static byte[] createRandomReadQuals(int length) { + Random random = GenomeAnalysisEngine.getRandomGenerator(); + byte[] quals = new byte[length]; + for (int i = 0; i < length; i++) + quals[i] = (byte) random.nextInt(50); + return quals; + } + + /** + * Create random 
read qualities + * + * @param length the length of the read + * @param allowNs whether or not to allow N's in the read + * @return an array with randomized bases (A-N) with equal probability + */ + public static byte[] createRandomReadBases(int length, boolean allowNs) { + Random random = GenomeAnalysisEngine.getRandomGenerator(); + int numberOfBases = allowNs ? 5 : 4; + byte[] bases = new byte[length]; + for (int i = 0; i < length; i++) { + switch (random.nextInt(numberOfBases)) { + case 0: + bases[i] = 'A'; + break; + case 1: + bases[i] = 'C'; + break; + case 2: + bases[i] = 'G'; + break; + case 3: + bases[i] = 'T'; + break; + case 4: + bases[i] = 'N'; + break; + default: + throw new ReviewedStingException("Something went wrong, this is just impossible"); + } + } + return bases; + } + + public static GATKSAMRecord createRandomRead(int length) { + return createRandomRead(length, true); + } + + public static GATKSAMRecord createRandomRead(int length, boolean allowNs) { + byte[] quals = ReadUtils.createRandomReadQuals(length); + byte[] bbases = ReadUtils.createRandomReadBases(length, allowNs); + return ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M"); + } + + + public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) { + String[] sequenceRecordNames = new String[sequenceDictionary.size()]; + int sequenceRecordIndex = 0; + for (SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences()) + sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName(); + return Arrays.deepToString(sequenceRecordNames); + } + + /** + * Calculates the reference coordinate for a read coordinate + * + * @param read the read + * @param offset the base in the read (coordinate in the read) + * @return the reference coordinate correspondent to this base + */ + public static long getReferenceCoordinateForReadCoordinate(GATKSAMRecord read, int offset) { + if (offset > read.getReadLength()) + throw new 
ReviewedStingException(String.format(OFFSET_OUT_OF_BOUNDS_EXCEPTION, offset, read.getReadLength())); + + long location = read.getAlignmentStart(); + Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); + while (offset > 0 && cigarElementIterator.hasNext()) { + CigarElement cigarElement = cigarElementIterator.next(); + long move = 0; + if (cigarElement.getOperator().consumesReferenceBases()) + move = (long) Math.min(cigarElement.getLength(), offset); + location += move; + offset -= move; + } + if (offset > 0 && !cigarElementIterator.hasNext()) + throw new ReviewedStingException(OFFSET_NOT_ZERO_EXCEPTION); + + return location; + } + + /** + * Creates a map with each event in the read (cigar operator) and the read coordinate where it happened. + * + * Example: + * D -> 2, 34, 75 + * I -> 55 + * S -> 0, 101 + * H -> 101 + * + * @param read the read + * @return a map with the properties described above. See example + */ + public static Map> getCigarOperatorForAllBases (GATKSAMRecord read) { + Map> events = new HashMap>(); + + int position = 0; + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + CigarOperator op = cigarElement.getOperator(); + if (op.consumesReadBases()) { + ArrayList list = events.get(op); + if (list == null) { + list = new ArrayList(); + events.put(op, list); + } + for (int i = position; i < cigarElement.getLength(); i++) + list.add(position++); + } + else { + ArrayList list = events.get(op); + if (list == null) { + list = new ArrayList(); + events.put(op, list); + } + list.add(position); + } + } + return events; + } + + /** + * Given a read, outputs the read bases in a string format + * + * @param read the read + * @return a string representation of the read bases + */ + public static String convertReadBasesToString(GATKSAMRecord read) { + String bases = ""; + for (byte b : read.getReadBases()) { + bases += (char) b; + } + return bases.toUpperCase(); + } + + /** + * Given a read, outputs the base 
qualities in a string format + * + * @param quals the read qualities + * @return a string representation of the base qualities + */ + public static String convertReadQualToString(byte[] quals) { + String result = ""; + for (byte b : quals) { + result += (char) (33 + b); + } + return result; + } + + /** + * Given a read, outputs the base qualities in a string format + * + * @param read the read + * @return a string representation of the base qualities + */ + public static String convertReadQualToString(GATKSAMRecord read) { + return convertReadQualToString(read.getBaseQualities()); + } + + /** + * Returns the reverse complement of the read bases + * + * @param bases the read bases + * @return the reverse complement of the read bases + */ + public static String getBasesReverseComplement(byte[] bases) { + String reverse = ""; + for (int i = bases.length-1; i >=0; i--) { + reverse += (char) BaseUtils.getComplement(bases[i]); + } + return reverse; + } + + /** + * Returns the reverse complement of the read bases + * + * @param read the read + * @return the reverse complement of the read bases + */ + public static String getBasesReverseComplement(GATKSAMRecord read) { + return getBasesReverseComplement(read.getReadBases()); + } + + /** + * Calculate the maximum read length from the given list of reads. 
+ * @param reads list of reads + * @return non-negative integer + */ + @Ensures({"result >= 0"}) + public static int getMaxReadLength( final List reads ) { + if( reads == null ) { throw new IllegalArgumentException("Attempting to check a null list of reads."); } + + int maxReadLength = 0; + for( final GATKSAMRecord read : reads ) { + maxReadLength = Math.max(maxReadLength, read.getReadLength()); + } + return maxReadLength; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/SAMFileReaderBuilder.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/SAMFileReaderBuilder.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/SAMFileReaderBuilder.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/SAMFileReaderBuilder.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/SimplifyingSAMFileWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/SimplifyingSAMFileWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/SimplifyingSAMFileWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/SimplifyingSAMFileWriter.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignment.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignment.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignment.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignment.java diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/Parameters.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/Parameters.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/smithwaterman/Parameters.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/Parameters.java diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentMain.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentMain.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentMain.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentMain.java diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWParameterSet.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/SWParameterSet.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWParameterSet.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/SWParameterSet.java diff --git 
a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java diff --git a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/text/ListFileUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/text/ListFileUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/text/TextFormattingUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/text/TextFormattingUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/text/XReadLines.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/text/XReadLines.java diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadPoolMonitor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/ThreadPoolMonitor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/threading/ThreadPoolMonitor.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/ThreadPoolMonitor.java diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/package-info.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/threading/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFIndexType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVCFIndexType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFIndexType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVCFIndexType.java diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java new file mode 100644 index 000000000..fb5564ab3 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -0,0 +1,2206 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to 
the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.variant; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; +import org.broad.tribble.TribbleException; +import org.broad.tribble.util.popgen.HardyWeinbergCalculation; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFConstants; + +import java.io.Serializable; +import java.util.*; + +public class GATKVariantContextUtils { + + private static Logger logger = Logger.getLogger(GATKVariantContextUtils.class); + + public static final int DEFAULT_PLOIDY = 2; + public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. 
+ + public final static List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF"; + public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site + + public final static String MERGE_FILTER_PREFIX = "filterIn"; + public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; + public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; + public final static String MERGE_INTERSECTION = "Intersection"; + + public enum GenotypeMergeType { + /** + * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD. + */ + UNIQUIFY, + /** + * Take genotypes in priority order (see the priority argument). + */ + PRIORITIZE, + /** + * Take the genotypes in any order. + */ + UNSORTED, + /** + * Require that all samples/genotypes be unique between all inputs. + */ + REQUIRE_UNIQUE + } + + public enum FilteredRecordMergeType { + /** + * Union - leaves the record if any record is unfiltered. + */ + KEEP_IF_ANY_UNFILTERED, + /** + * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. + */ + KEEP_IF_ALL_UNFILTERED, + /** + * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset. + */ + KEEP_UNCONDITIONAL + } + + public enum MultipleAllelesMergeType { + /** + * Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record. + */ + BY_TYPE, + /** + * Merge all allele types at the same start position into the same VCF record. 
+ */ + MIX_TYPES + } + + /** + * Refactored out of the AverageAltAlleleLength annotation class + * @param vc the variant context + * @return the average length of the alt allele (a double) + */ + public static double getMeanAltAlleleLength(VariantContext vc) { + double averageLength = 1.0; + if ( ! vc.isSNP() && ! vc.isSymbolic() ) { + // adjust for the event length + int averageLengthNum = 0; + int averageLengthDenom = 0; + int refLength = vc.getReference().length(); + for ( final Allele a : vc.getAlternateAlleles() ) { + int numAllele = vc.getCalledChrCount(a); + int alleleSize; + if ( a.length() == refLength ) { + // SNP or MNP + byte[] a_bases = a.getBases(); + byte[] ref_bases = vc.getReference().getBases(); + int n_mismatch = 0; + for ( int idx = 0; idx < a_bases.length; idx++ ) { + if ( a_bases[idx] != ref_bases[idx] ) + n_mismatch++; + } + alleleSize = n_mismatch; + } + else if ( a.isSymbolic() ) { + alleleSize = 1; + } else { + alleleSize = Math.abs(refLength-a.length()); + } + averageLengthNum += alleleSize*numAllele; + averageLengthDenom += numAllele; + } + averageLength = ( (double) averageLengthNum )/averageLengthDenom; + } + + return averageLength; + } + + /** + * create a genome location, given a variant context + * @param genomeLocParser parser + * @param vc the variant context + * @return the genomeLoc + */ + public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) { + return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true); + } + + public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(VariantContext context) { + if (!context.isSNP() || !context.isBiallelic()) + throw new IllegalStateException("Requested SNP substitution type for bialleic non-SNP " + context); + return BaseUtils.SNPSubstitutionType(context.getReference().getBases()[0], context.getAlternateAllele(0).getBases()[0]); + } + + /** + * If this is a BiAllelic SNP, is it a transition? 
+ */ + public static boolean isTransition(VariantContext context) { + return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION; + } + + /** + * If this is a BiAllelic SNP, is it a transversion? + */ + public static boolean isTransversion(VariantContext context) { + return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; + } + + public static boolean isTransition(Allele ref, Allele alt) { + return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSITION; + } + + public static boolean isTransversion(Allele ref, Allele alt) { + return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSVERSION; + } + + /** + * Returns a context identical to this with the REF and ALT alleles reverse complemented. + * + * @param vc variant context + * @return new vc + */ + public static VariantContext reverseComplement(VariantContext vc) { + // create a mapping from original allele to reverse complemented allele + HashMap alleleMap = new HashMap<>(vc.getAlleles().size()); + for ( final Allele originalAllele : vc.getAlleles() ) { + Allele newAllele; + if ( originalAllele.isNoCall() ) + newAllele = originalAllele; + else + newAllele = Allele.create(BaseUtils.simpleReverseComplement(originalAllele.getBases()), originalAllele.isReference()); + alleleMap.put(originalAllele, newAllele); + } + + // create new Genotype objects + GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { + List newAlleles = new ArrayList<>(); + for ( final Allele allele : genotype.getAlleles() ) { + Allele newAllele = alleleMap.get(allele); + if ( newAllele == null ) + newAllele = Allele.NO_CALL; + newAlleles.add(newAllele); + } + newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make()); + } + + return new 
VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); + } + + /** + * Returns true iff VC is an non-complex indel where every allele represents an expansion or + * contraction of a series of identical bases in the reference. + * + * For example, suppose the ref bases are CTCTCTGA, which includes a 3x repeat of CTCTCT + * + * If VC = -/CT, then this function returns true because the CT insertion matches exactly the + * upcoming reference. + * If VC = -/CTA then this function returns false because the CTA isn't a perfect match + * + * Now consider deletions: + * + * If VC = CT/- then again the same logic applies and this returns true + * The case of CTA/- makes no sense because it doesn't actually match the reference bases. + * + * The logic of this function is pretty simple. Take all of the non-null alleles in VC. For + * each insertion allele of n bases, check if that allele matches the next n reference bases. + * For each deletion allele of n bases, check if this matches the reference bases at n - 2 n, + * as it must necessarily match the first n bases. If this test returns true for all + * alleles you are a tandem repeat, otherwise you are not. + * + * @param vc + * @param refBasesStartingAtVCWithPad not this is assumed to include the PADDED reference + * @return + */ + @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) + public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { + final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); + if ( ! vc.isIndel() ) // only indels are tandem repeats + return false; + + final Allele ref = vc.getReference(); + + for ( final Allele allele : vc.getAlternateAlleles() ) { + if ( ! 
isRepeatAllele(ref, allele, refBasesStartingAtVCWithoutPad) ) + return false; + } + + // we've passed all of the tests, so we are a repeat + return true; + } + + /** + * + * @param vc + * @param refBasesStartingAtVCWithPad + * @return + */ + @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) + public static Pair,byte[]> getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { + final boolean VERBOSE = false; + final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); + if ( ! vc.isIndel() ) // only indels are tandem repeats + return null; + + final Allele refAllele = vc.getReference(); + final byte[] refAlleleBases = Arrays.copyOfRange(refAllele.getBases(), 1, refAllele.length()); + + byte[] repeatUnit = null; + final ArrayList lengths = new ArrayList<>(); + + for ( final Allele allele : vc.getAlternateAlleles() ) { + Pair result = getNumTandemRepeatUnits(refAlleleBases, Arrays.copyOfRange(allele.getBases(), 1, allele.length()), refBasesStartingAtVCWithoutPad.getBytes()); + + final int[] repetitionCount = result.first; + // repetition count = 0 means allele is not a tandem expansion of context + if (repetitionCount[0] == 0 || repetitionCount[1] == 0) + return null; + + if (lengths.size() == 0) { + lengths.add(repetitionCount[0]); // add ref allele length only once + } + lengths.add(repetitionCount[1]); // add this alt allele's length + + repeatUnit = result.second; + if (VERBOSE) { + System.out.println("RefContext:"+refBasesStartingAtVCWithoutPad); + System.out.println("Ref:"+refAllele.toString()+" Count:" + String.valueOf(repetitionCount[0])); + System.out.println("Allele:"+allele.toString()+" Count:" + String.valueOf(repetitionCount[1])); + System.out.println("RU:"+new String(repeatUnit)); + } + } + + return new Pair, byte[]>(lengths,repeatUnit); + } + + public static Pair getNumTandemRepeatUnits(final byte[] refBases, final byte[] 
altBases, final byte[] remainingRefContext) { + /* we can't exactly apply same logic as in basesAreRepeated() to compute tandem unit and number of repeated units. + Consider case where ref =ATATAT and we have an insertion of ATAT. Natural description is (AT)3 -> (AT)2. + */ + + byte[] longB; + // find first repeat unit based on either ref or alt, whichever is longer + if (altBases.length > refBases.length) + longB = altBases; + else + longB = refBases; + + // see if non-null allele (either ref or alt, whichever is longer) can be decomposed into several identical tandem units + // for example, -*,CACA needs to first be decomposed into (CA)2 + final int repeatUnitLength = findRepeatedSubstring(longB); + final byte[] repeatUnit = Arrays.copyOf(longB, repeatUnitLength); + + final int[] repetitionCount = new int[2]; + // look for repetitions forward on the ref bases (i.e. starting at beginning of ref bases) + int repetitionsInRef = findNumberofRepetitions(repeatUnit,refBases, true); + repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef; + repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef; + + return new Pair<>(repetitionCount, repeatUnit); + + } + + /** + * Find out if a string can be represented as a tandem number of substrings. + * For example ACTACT is a 2-tandem of ACT, + * but ACTACA is not. 
+ * + * @param bases String to be tested + * @return Length of repeat unit, if string can be represented as tandem of substring (if it can't + * be represented as one, it will be just the length of the input string) + */ + public static int findRepeatedSubstring(byte[] bases) { + + int repLength; + for (repLength=1; repLength <=bases.length; repLength++) { + final byte[] candidateRepeatUnit = Arrays.copyOf(bases,repLength); + boolean allBasesMatch = true; + for (int start = repLength; start < bases.length; start += repLength ) { + // check that remaining of string is exactly equal to repeat unit + final byte[] basePiece = Arrays.copyOfRange(bases,start,start+candidateRepeatUnit.length); + if (!Arrays.equals(candidateRepeatUnit, basePiece)) { + allBasesMatch = false; + break; + } + } + if (allBasesMatch) + return repLength; + } + + return repLength; + } + + /** + * Helper routine that finds number of repetitions a string consists of. + * For example, for string ATAT and repeat unit AT, number of repetitions = 2 + * @param repeatUnit Substring + * @param testString String to test + * @oaram lookForward Look for repetitions forward (at beginning of string) or backward (at end of string) + * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's + */ + public static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString, boolean lookForward) { + int numRepeats = 0; + if (lookForward) { + // look forward on the test string + for (int start = 0; start < testString.length; start += repeatUnit.length) { + int end = start + repeatUnit.length; + byte[] unit = Arrays.copyOfRange(testString,start, end); + if(Arrays.equals(unit,repeatUnit)) + numRepeats++; + else + break; + } + return numRepeats; + } + + // look backward. 
For example, if repeatUnit = AT and testString = GATAT, number of repeat units is still 2 + // look forward on the test string + for (int start = testString.length - repeatUnit.length; start >= 0; start -= repeatUnit.length) { + int end = start + repeatUnit.length; + byte[] unit = Arrays.copyOfRange(testString,start, end); + if(Arrays.equals(unit,repeatUnit)) + numRepeats++; + else + break; + } + return numRepeats; + } + + /** + * Helper function for isTandemRepeat that checks that allele matches somewhere on the reference + * @param ref + * @param alt + * @param refBasesStartingAtVCWithoutPad + * @return + */ + protected static boolean isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad) { + if ( ! Allele.oneIsPrefixOfOther(ref, alt) ) + return false; // we require one allele be a prefix of another + + if ( ref.length() > alt.length() ) { // we are a deletion + return basesAreRepeated(ref.getBaseString(), alt.getBaseString(), refBasesStartingAtVCWithoutPad, 2); + } else { // we are an insertion + return basesAreRepeated(alt.getBaseString(), ref.getBaseString(), refBasesStartingAtVCWithoutPad, 1); + } + } + + protected static boolean basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches) { + final String potentialRepeat = l.substring(s.length()); // skip s bases + + for ( int i = 0; i < minNumberOfMatches; i++) { + final int start = i * potentialRepeat.length(); + final int end = (i+1) * potentialRepeat.length(); + if ( ref.length() < end ) + return false; // we ran out of bases to test + final String refSub = ref.substring(start, end); + if ( ! 
refSub.equals(potentialRepeat) ) + return false; // repeat didn't match, fail + } + + return true; // we passed all tests, we matched + } + + public enum GenotypeAssignmentMethod { + /** + * set all of the genotype GT values to NO_CALL + */ + SET_TO_NO_CALL, + + /** + * Use the subsetted PLs to greedily assigned genotypes + */ + USE_PLS_TO_ASSIGN, + + /** + * Try to match the original GT calls, if at all possible + * + * Suppose I have 3 alleles: A/B/C and the following samples: + * + * original_GT best_match to A/B best_match to A/C + * S1 => A/A A/A A/A + * S2 => A/B A/B A/A + * S3 => B/B B/B A/A + * S4 => B/C A/B A/C + * S5 => C/C A/A C/C + * + * Basically, all alleles not in the subset map to ref. It means that het-alt genotypes + * when split into 2 bi-allelic variants will be het in each, which is good in some cases, + * rather than the undetermined behavior when using the PLs to assign, which could result + * in hom-var or hom-ref for each, depending on the exact PL values. + */ + BEST_MATCH_TO_ORIGINAL, + + /** + * do not even bother changing the GTs + */ + DO_NOT_ASSIGN_GENOTYPES + } + + /** + * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) + * + * @param vc variant context with genotype likelihoods + * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** + * @param assignGenotypes assignment strategy for the (subsetted) PLs + * @return a new non-null GenotypesContext + */ + public static GenotypesContext subsetDiploidAlleles(final VariantContext vc, + final List allelesToUse, + final GenotypeAssignmentMethod assignGenotypes) { + if ( allelesToUse.get(0).isNonReference() ) throw new IllegalArgumentException("First allele must be the reference allele"); + if ( allelesToUse.size() == 1 ) throw new IllegalArgumentException("Cannot subset to only 1 alt allele"); + + // optimization: if no input genotypes, just exit + if 
(vc.getGenotypes().isEmpty()) return GenotypesContext.create(); + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final List likelihoodIndexesToUse = determineLikelihoodIndexesToUse(vc, allelesToUse); + + // create the new genotypes + return createGenotypesWithSubsettedLikelihoods(vc.getGenotypes(), vc, allelesToUse, likelihoodIndexesToUse, assignGenotypes); + } + + /** + * Figure out which likelihood indexes to use for a selected down set of alleles + * + * @param originalVC the original VariantContext + * @param allelesToUse the subset of alleles to use + * @return a list of PL indexes to use or null if none + */ + private static List determineLikelihoodIndexesToUse(final VariantContext originalVC, final List allelesToUse) { + + // the bitset representing the allele indexes we want to keep + final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); + + // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, + // then we can keep the PLs as is; otherwise, we determine which ones to keep + if ( MathUtils.countOccurrences(true, alleleIndexesToUse) == alleleIndexesToUse.length ) + return null; + + return getLikelihoodIndexes(originalVC, alleleIndexesToUse); + } + + /** + * Get the actual likelihoods indexes to use given the corresponding allele indexes + * + * @param originalVC the original VariantContext + * @param alleleIndexesToUse the bitset representing the alleles to use (@see #getAlleleIndexBitset) + * @return a non-null List + */ + private static List getLikelihoodIndexes(final VariantContext originalVC, final boolean[] alleleIndexesToUse) { + + final List result = new ArrayList<>(30); + + // numLikelihoods takes total # of alleles. 
Use default # of chromosomes (ploidy) = 2 + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(originalVC.getNAlleles(), DEFAULT_PLOIDY); + + for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + // consider this entry only if both of the alleles are good + if ( alleleIndexesToUse[alleles.alleleIndex1] && alleleIndexesToUse[alleles.alleleIndex2] ) + result.add(PLindex); + } + + return result; + } + + /** + * Given an original VariantContext and a list of alleles from that VC to keep, + * returns a bitset representing which allele indexes should be kept + * + * @param originalVC the original VC + * @param allelesToKeep the list of alleles to keep + * @return non-null bitset + */ + private static boolean[] getAlleleIndexBitset(final VariantContext originalVC, final List allelesToKeep) { + final int numOriginalAltAlleles = originalVC.getNAlleles() - 1; + final boolean[] alleleIndexesToKeep = new boolean[numOriginalAltAlleles + 1]; + + // the reference Allele is definitely still used + alleleIndexesToKeep[0] = true; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) { + if ( allelesToKeep.contains(originalVC.getAlternateAllele(i)) ) + alleleIndexesToKeep[i+1] = true; + } + + return alleleIndexesToKeep; + } + + /** + * Create the new GenotypesContext with the subsetted PLs + * + * @param originalGs the original GenotypesContext + * @param vc the original VariantContext + * @param allelesToUse the actual alleles to use with the new Genotypes + * @param likelihoodIndexesToUse the indexes in the PL to use given the allelesToUse (@see #determineLikelihoodIndexesToUse()) + * @param assignGenotypes assignment strategy for the (subsetted) PLs + * @return a new non-null GenotypesContext + */ + private static GenotypesContext createGenotypesWithSubsettedLikelihoods(final GenotypesContext originalGs, + final VariantContext vc, + final List 
allelesToUse, + final List likelihoodIndexesToUse, + final GenotypeAssignmentMethod assignGenotypes) { + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); + + // make sure we are seeing the expected number of likelihoods per sample + final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), 2); + + // the samples + final List sampleIndices = originalGs.getSampleNamesOrderedByName(); + + // create the new genotypes + for ( int k = 0; k < originalGs.size(); k++ ) { + final Genotype g = originalGs.get(sampleIndices.get(k)); + final GenotypeBuilder gb = new GenotypeBuilder(g); + + // create the new likelihoods array from the alleles we are allowed to use + double[] newLikelihoods; + if ( !g.hasLikelihoods() ) { + // we don't have any likelihoods, so we null out PLs and make G ./. + newLikelihoods = null; + gb.noPL(); + } else { + final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); + if ( likelihoodIndexesToUse == null ) { + newLikelihoods = originalLikelihoods; + } else if ( originalLikelihoods.length != expectedNumLikelihoods ) { + logger.warn("Wrong number of likelihoods in sample " + g.getSampleName() + " at " + vc + " got " + g.getLikelihoodsString() + " but expected " + expectedNumLikelihoods); + newLikelihoods = null; + } else { + newLikelihoods = new double[likelihoodIndexesToUse.size()]; + int newIndex = 0; + for ( final int oldIndex : likelihoodIndexesToUse ) + newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; + + // might need to re-normalize + newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); + } + + if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) + gb.noPL(); + else + gb.PL(newLikelihoods); + } + + updateGenotypeAfterSubsetting(g.getAlleles(), gb, assignGenotypes, newLikelihoods, allelesToUse); + newGTs.add(gb.make()); + } + + return newGTs; + } + + private static boolean 
likelihoodsAreUninformative(final double[] likelihoods) { + return MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL; + } + + /** + * Add the genotype call (GT) field to GenotypeBuilder using the requested algorithm assignmentMethod + * + * @param originalGT the original genotype calls, cannot be null + * @param gb the builder where we should put our newly called alleles, cannot be null + * @param assignmentMethod the method to use to do the assignment, cannot be null + * @param newLikelihoods a vector of likelihoods to use if the method requires PLs, should be log10 likelihoods, cannot be null + * @param allelesToUse the alleles we are using for our subsetting + */ + public static void updateGenotypeAfterSubsetting(final List originalGT, + final GenotypeBuilder gb, + final GenotypeAssignmentMethod assignmentMethod, + final double[] newLikelihoods, + final List allelesToUse) { + switch ( assignmentMethod ) { + case DO_NOT_ASSIGN_GENOTYPES: + break; + case SET_TO_NO_CALL: + gb.alleles(NO_CALL_ALLELES); + gb.noGQ(); + break; + case USE_PLS_TO_ASSIGN: + if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) { + // if there is no mass on the (new) likelihoods, then just no-call the sample + gb.alleles(NO_CALL_ALLELES); + gb.noGQ(); + } else { + // find the genotype with maximum likelihoods + final int PLindex = MathUtils.maxElementIndex(newLikelihoods); + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + gb.alleles(Arrays.asList(allelesToUse.get(alleles.alleleIndex1), allelesToUse.get(alleles.alleleIndex2))); + gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); + } + break; + case BEST_MATCH_TO_ORIGINAL: + final List best = new LinkedList<>(); + final Allele ref = allelesToUse.get(0); // WARNING -- should be checked in input argument + for ( final Allele originalAllele : originalGT ) { + best.add(allelesToUse.contains(originalAllele) ? 
originalAllele : ref); + } + gb.noGQ(); + gb.noPL(); + gb.alleles(best); + break; + } + } + + /** + * Subset the samples in VC to reference only information with ref call alleles + * + * Preserves DP if present + * + * @param vc the variant context to subset down to + * @param ploidy ploidy to use if a genotype doesn't have any alleles + * @return a GenotypesContext + */ + public static GenotypesContext subsetToRefOnly(final VariantContext vc, final int ploidy) { + if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); + if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be >= 1 but got " + ploidy); + + // the genotypes with PLs + final GenotypesContext oldGTs = vc.getGenotypes(); + + // optimization: if no input genotypes, just exit + if (oldGTs.isEmpty()) return oldGTs; + + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(oldGTs.size()); + + final Allele ref = vc.getReference(); + final List diploidRefAlleles = Arrays.asList(ref, ref); + + // create the new genotypes + for ( final Genotype g : vc.getGenotypes() ) { + final int gPloidy = g.getPloidy() == 0 ? ploidy : g.getPloidy(); + final List refAlleles = gPloidy == 2 ? 
diploidRefAlleles : Collections.nCopies(gPloidy, ref); + final GenotypeBuilder gb = new GenotypeBuilder(g.getSampleName(), refAlleles); + if ( g.hasDP() ) gb.DP(g.getDP()); + if ( g.hasGQ() ) gb.GQ(g.getGQ()); + newGTs.add(gb.make()); + } + + return newGTs; + } + + /** + * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs + * + * @param vc variant context with genotype likelihoods + * @return genotypes context + */ + public static GenotypesContext assignDiploidGenotypes(final VariantContext vc) { + return subsetDiploidAlleles(vc, vc.getAlleles(), GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); + } + + /** + * Split variant context into its biallelic components if there are more than 2 alleles + * + * For VC has A/B/C alleles, returns A/B and A/C contexts. + * Genotypes are all no-calls now (it's not possible to fix them easily) + * Alleles are right trimmed to satisfy VCF conventions + * + * If vc is biallelic or non-variant it is just returned + * + * Chromosome counts are updated (but they are by definition 0) + * + * @param vc a potentially multi-allelic variant context + * @return a list of bi-allelic (or monomorphic) variant context + */ + public static List splitVariantContextToBiallelics(final VariantContext vc) { + return splitVariantContextToBiallelics(vc, false, GenotypeAssignmentMethod.SET_TO_NO_CALL); + } + + /** + * Split variant context into its biallelic components if there are more than 2 alleles + * + * For VC has A/B/C alleles, returns A/B and A/C contexts. 
+ * Genotypes are all no-calls now (it's not possible to fix them easily) + * Alleles are right trimmed to satisfy VCF conventions + * + * If vc is biallelic or non-variant it is just returned + * + * Chromosome counts are updated (but they are by definition 0) + * + * @param vc a potentially multi-allelic variant context + * @param trimLeft if true, we will also left trim alleles, potentially moving the resulting vcs forward on the genome + * @return a list of bi-allelic (or monomorphic) variant context + */ + public static List splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod) { + if ( ! vc.isVariant() || vc.isBiallelic() ) + // non variant or biallelics already satisfy the contract + return Collections.singletonList(vc); + else { + final List biallelics = new LinkedList<>(); + + for ( final Allele alt : vc.getAlternateAlleles() ) { + VariantContextBuilder builder = new VariantContextBuilder(vc); + final List alleles = Arrays.asList(vc.getReference(), alt); + builder.alleles(alleles); + builder.genotypes(subsetDiploidAlleles(vc, alleles, genotypeAssignmentMethod)); + VariantContextUtils.calculateChromosomeCounts(builder, true); + final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true); + biallelics.add(trimmed); + } + + return biallelics; + } + } + + public static Genotype removePLsAndAD(final Genotype g) { + return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g; + } + + //TODO consider refactor variant-context merging code so that we share as much as possible between + //TODO simpleMerge and referenceConfidenceMerge + //TODO likely using a separate helper class or hierarchy. + /** + * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. 
+ * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with + * the sample name + * + * @param unsortedVCs collection of unsorted VCs + * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs + * @param filteredRecordMergeType merge type for filtered records + * @param genotypeMergeOptions merge option for genotypes + * @param annotateOrigin should we annotate the set it came from? + * @param printMessages should we print messages? + * @param setKey the key name of the set + * @param filteredAreUncalled are filtered records uncalled? + * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? + * @return new VariantContext representing the merge of unsortedVCs + */ + public static VariantContext simpleMerge(final Collection unsortedVCs, + final List priorityListOfVCs, + final FilteredRecordMergeType filteredRecordMergeType, + final GenotypeMergeType genotypeMergeOptions, + final boolean annotateOrigin, + final boolean printMessages, + final String setKey, + final boolean filteredAreUncalled, + final boolean mergeInfoWithMaxAC ) { + int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); + return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC); + } + + /** + * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. + * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with + * the sample name. + * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use + * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge. 
+ * + * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/ + * + * @param unsortedVCs collection of unsorted VCs + * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs + * @param filteredRecordMergeType merge type for filtered records + * @param genotypeMergeOptions merge option for genotypes + * @param annotateOrigin should we annotate the set it came from? + * @param printMessages should we print messages? + * @param setKey the key name of the set + * @param filteredAreUncalled are filtered records uncalled? + * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? + * @return new VariantContext representing the merge of unsortedVCs + */ + public static VariantContext simpleMerge(final Collection unsortedVCs, + final List priorityListOfVCs, + final int originalNumOfVCs, + final FilteredRecordMergeType filteredRecordMergeType, + final GenotypeMergeType genotypeMergeOptions, + final boolean annotateOrigin, + final boolean printMessages, + final String setKey, + final boolean filteredAreUncalled, + final boolean mergeInfoWithMaxAC ) { + if ( unsortedVCs == null || unsortedVCs.size() == 0 ) + return null; + + if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size()) + throw new IllegalArgumentException("the number of the original VariantContexts must be the same as the number of VariantContexts in the priority list"); + + if ( annotateOrigin && priorityListOfVCs == null && originalNumOfVCs == 0) + throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts or the number of original VariantContexts"); + + final List preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); + // Make sure all variant contexts are padded with reference base in case of indels if necessary + List VCs = new ArrayList<>(); + + for 
(final VariantContext vc : preFilteredVCs) { + if ( ! filteredAreUncalled || vc.isNotFiltered() ) + VCs.add(vc); + } + + if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled + return null; + + // establish the baseline info from the first VC + final VariantContext first = VCs.get(0); + final String name = first.getSource(); + final Allele refAllele = determineReferenceAllele(VCs); + + final Set alleles = new LinkedHashSet<>(); + final Set filters = new HashSet<>(); + final Map attributes = new LinkedHashMap<>(); + final Set inconsistentAttributes = new HashSet<>(); + final Set variantSources = new HashSet<>(); // contains the set of sources we found in our set of VCs that are variant + final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id + + VariantContext longestVC = first; + int depth = 0; + int maxAC = -1; + final Map attributesWithMaxAC = new LinkedHashMap<>(); + double log10PError = CommonInfo.NO_LOG10_PERROR; + boolean anyVCHadFiltersApplied = false; + VariantContext vcWithMaxAC = null; + GenotypesContext genotypes = GenotypesContext.create(); + + // counting the number of filtered and variant VCs + int nFiltered = 0; + + boolean remapped = false; + + // cycle through and add info from the other VCs, making sure the loc/reference matches + for ( final VariantContext vc : VCs ) { + if ( longestVC.getStart() != vc.getStart() ) + throw new IllegalStateException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); + + if ( VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC) ) + longestVC = vc; // get the longest location + + nFiltered += vc.isFiltered() ? 
1 : 0; + if ( vc.isVariant() ) variantSources.add(vc.getSource()); + + AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); + remapped = remapped || alleleMapping.needsRemapping(); + + alleles.addAll(alleleMapping.values()); + + mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); + + // We always take the QUAL of the first VC with a non-MISSING qual for the combined value + if ( log10PError == CommonInfo.NO_LOG10_PERROR ) + log10PError = vc.getLog10PError(); + + filters.addAll(vc.getFilters()); + anyVCHadFiltersApplied |= vc.filtersWereApplied(); + + // + // add attributes + // + // special case DP (add it up) and ID (just preserve it) + // + if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) + depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); + if ( vc.hasID() ) rsIDs.add(vc.getID()); + if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { + String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); + // lets see if the string contains a "," separator + if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { + final List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); + for (final String alleleCount : alleleCountArray) { + final int ac = Integer.valueOf(alleleCount.trim()); + if (ac > maxAC) { + maxAC = ac; + vcWithMaxAC = vc; + } + } + } else { + final int ac = Integer.valueOf(rawAlleleCounts); + if (ac > maxAC) { + maxAC = ac; + vcWithMaxAC = vc; + } + } + } + + for (final Map.Entry p : vc.getAttributes().entrySet()) { + final String key = p.getKey(); + final Object value = p.getValue(); + // only output annotations that have the same value in every input VC + // if we don't like the key already, don't go anywhere + if ( ! 
inconsistentAttributes.contains(key) ) { + final boolean alreadyFound = attributes.containsKey(key); + final Object boundValue = attributes.get(key); + final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); + + if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { + // we found the value but we're inconsistent, put it in the exclude list + inconsistentAttributes.add(key); + attributes.remove(key); + } else if ( ! alreadyFound || boundIsMissingValue ) { // no value + attributes.put(key, value); + } + } + } + } + + // if we have more alternate alleles in the merged VC than in one or more of the + // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD + for ( final VariantContext vc : VCs ) { + if (vc.getAlleles().size() == 1) + continue; + if ( hasPLIncompatibleAlleles(alleles, vc.getAlleles())) { + if ( ! genotypes.isEmpty() ) { + logger.debug(String.format("Stripping PLs at %s:%d-%d due to incompatible alleles merged=%s vs. 
single=%s", + vc.getChr(), vc.getStart(), vc.getEnd(), alleles, vc.getAlleles())); + } + genotypes = stripPLsAndAD(genotypes); + // this will remove stale AC,AF attributed from vc + VariantContextUtils.calculateChromosomeCounts(vc, attributes, true); + break; + } + } + + // take the VC with the maxAC and pull the attributes into a modifiable map + if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { + attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); + } + + // if at least one record was unfiltered and we want a union, clear all of the filters + if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL ) + filters.clear(); + + + if ( annotateOrigin ) { // we care about where the call came from + String setValue; + if ( nFiltered == 0 && variantSources.size() == originalNumOfVCs ) // nothing was unfiltered + setValue = MERGE_INTERSECTION; + else if ( nFiltered == VCs.size() ) // everything was filtered out + setValue = MERGE_FILTER_IN_ALL; + else if ( variantSources.isEmpty() ) // everyone was reference + setValue = MERGE_REF_IN_ALL; + else { + final LinkedHashSet s = new LinkedHashSet<>(); + for ( final VariantContext vc : VCs ) + if ( vc.isVariant() ) + s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); + setValue = Utils.join("-", s); + } + + if ( setKey != null ) { + attributes.put(setKey, setValue); + if( mergeInfoWithMaxAC && vcWithMaxAC != null ) { + attributesWithMaxAC.put(setKey, setValue); + } + } + } + + if ( depth > 0 ) + attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); + + final String ID = rsIDs.isEmpty() ? 
VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); + + final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); + builder.loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd()); + builder.alleles(alleles); + builder.genotypes(genotypes); + builder.log10PError(log10PError); + if ( anyVCHadFiltersApplied ) { + builder.filters(filters.isEmpty() ? filters : new TreeSet<>(filters)); + } + builder.attributes(new TreeMap<>(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); + + // Trim the padded bases of all alleles if necessary + final VariantContext merged = builder.make(); + if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); + return merged; + } + + private static Comparable combineAnnotationValues( final List array ) { + return MathUtils.median(array); // right now we take the median but other options could be explored + } + + /** + * Merges VariantContexts from gVCFs into a single hybrid. + * Assumes that none of the input records are filtered. + * + * @param VCs collection of unsorted genomic VCs + * @param loc the current location + * @param refBase the reference allele to use if all contexts in the VC are spanning (i.e. don't start at the location in loc); if null, we'll return null in this case + * @param removeNonRefSymbolicAllele if true, remove the allele from the merged VC + * @return new VariantContext representing the merge of all VCs or null if it not relevant + */ + public static VariantContext referenceConfidenceMerge(final List VCs, final GenomeLoc loc, final Byte refBase, final boolean removeNonRefSymbolicAllele) { + // this can happen if e.g. 
you are using a dbSNP file that spans a region with no gVCFs + if ( VCs == null || VCs.size() == 0 ) + return null; + + // establish the baseline info (sometimes from the first VC) + final VariantContext first = VCs.get(0); + final String name = first.getSource(); + + // ref allele + final Allele refAllele = determineReferenceAlleleGivenReferenceBase(VCs, loc, refBase); + if ( refAllele == null ) + return null; + + // FinalAlleleSet contains the alleles of the new resulting VC. + // Using linked set in order to guaranteed an stable order: + final LinkedHashSet finalAlleleSet = new LinkedHashSet<>(10); + // Reference goes first: + finalAlleleSet.add(refAllele); + + final Map attributes = new LinkedHashMap<>(); + final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id + int depth = 0; + final Map> annotationMap = new LinkedHashMap<>(); + final GenotypesContext genotypes = GenotypesContext.create(); + + final int variantContextCount = VCs.size(); + // In this list we hold the mapping of each variant context alleles. + final List>> vcAndNewAllelePairs = new ArrayList<>(variantContextCount); + // cycle through and add info from the other VCs + for ( final VariantContext vc : VCs ) { + + // if this context doesn't start at the current location then it must be a spanning event (deletion or ref block) + final boolean isSpanningEvent = loc.getStart() != vc.getStart(); + + vcAndNewAllelePairs.add(new Pair<>(vc,isSpanningEvent ? replaceWithNoCalls(vc.getAlleles()) + : remapAlleles(vc.getAlleles(), refAllele, finalAlleleSet))); + } + + // Add to the end if at all required in in the output. 
+ if (!removeNonRefSymbolicAllele) finalAlleleSet.add(NON_REF_SYMBOLIC_ALLELE); + + final List allelesList = new ArrayList<>(finalAlleleSet); + + for ( final Pair> pair : vcAndNewAllelePairs ) { + final VariantContext vc = pair.getFirst(); + final List remappedAlleles = pair.getSecond(); + + mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList); + + // special case DP (add it up) for all events + if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) + depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); + else if ( vc.getNSamples() == 1 && vc.getGenotype(0).hasExtendedAttribute("MIN_DP") ) // handle the gVCF case from the HaplotypeCaller + depth += vc.getGenotype(0).getAttributeAsInt("MIN_DP", 0); + + if ( loc.getStart() != vc.getStart() ) + continue; + + // special case ID (just preserve it) + if ( vc.hasID() ) rsIDs.add(vc.getID()); + + // add attributes + addReferenceConfidenceAttributes(vc.getAttributes(), annotationMap); + } + + // when combining annotations use the median value from all input VCs which had annotations provided + for ( final Map.Entry> p : annotationMap.entrySet() ) { + if ( ! p.getValue().isEmpty() ) { + attributes.put(p.getKey(), combineAnnotationValues(p.getValue())); + } + } + + if ( depth > 0 ) + attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); + + // remove stale AC and AF based attributes + removeStaleAttributesAfterMerge(attributes); + + final String ID = rsIDs.isEmpty() ? 
VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); + + final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(allelesList) + .chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(allelesList, loc.getStart(), loc.getStart()) + .genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to regenotype later + + return builder.make(); + } + + /** + * Determines the ref allele given the provided reference base at this position + * + * @param VCs collection of unsorted genomic VCs + * @param loc the current location + * @param refBase the reference allele to use if all contexts in the VC are spanning + * @return new Allele or null if no reference allele/base is available + */ + private static Allele determineReferenceAlleleGivenReferenceBase(final List VCs, final GenomeLoc loc, final Byte refBase) { + final Allele refAllele = determineReferenceAllele(VCs, loc); + if ( refAllele == null ) + return ( refBase == null ? 
null : Allele.create(refBase, true) ); + return refAllele; + } + + /** + * Remove the stale attributes from the merged set + * + * @param attributes the attribute map + */ + private static void removeStaleAttributesAfterMerge(final Map attributes) { + attributes.remove(VCFConstants.ALLELE_COUNT_KEY); + attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY); + attributes.remove(VCFConstants.ALLELE_NUMBER_KEY); + attributes.remove(VCFConstants.MLE_ALLELE_COUNT_KEY); + attributes.remove(VCFConstants.MLE_ALLELE_FREQUENCY_KEY); + attributes.remove(VCFConstants.END_KEY); + } + /** + * Adds attributes to the global map from the new context in a sophisticated manner + * + * @param myAttributes attributes to add from + * @param annotationMap map of annotations for combining later + */ + private static void addReferenceConfidenceAttributes(final Map myAttributes, + final Map> annotationMap) { + for ( final Map.Entry p : myAttributes.entrySet() ) { + final String key = p.getKey(); + final Object value = p.getValue(); + + // add the annotation values to a list for combining later + List values = annotationMap.get(key); + if( values == null ) { + values = new ArrayList<>(); + annotationMap.put(key, values); + } + try { + final String stringValue = value.toString(); + // Branch to avoid unintentional, implicit type conversions that occur with the ? operator. + if (stringValue.contains(".")) + values.add(Double.parseDouble(stringValue)); + else + values.add(Integer.parseInt(stringValue)); + } catch (final NumberFormatException e) { + // nothing to do + } + } + } + + private static boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { + final Iterator it1 = alleleSet1.iterator(); + final Iterator it2 = alleleSet2.iterator(); + + while ( it1.hasNext() && it2.hasNext() ) { + final Allele a1 = it1.next(); + final Allele a2 = it2.next(); + if ( ! a1.equals(a2) ) + return true; + } + + // by this point, at least one of the iterators is empty. 
All of the elements + // we've compared are equal up until this point. But it's possible that the + // sets aren't the same size, which is indicated by the test below. If they + // are of the same size, though, the sets are compatible + return it1.hasNext() || it2.hasNext(); + } + + //TODO as part of a larger refactoring effort remapAlleles can be merged with createAlleleMapping. + /** + * This method does a couple of things: + *

+ * <ul>
+ *     <li>remaps the vc alleles considering the differences between the final reference allele and its own reference,</li>
+ *     <li>collects alternative alleles present in variant context and add them to the {@code finalAlleles} set.</li>
+ * </ul>
+ * + * @param vcAlleles the variant context allele list. + * @param refAllele final reference allele. + * @param finalAlleles where to add the final set of non-ref called alleles. + * @return never {@code null} + */ + private static List remapAlleles(final List vcAlleles, final Allele refAllele, final LinkedHashSet finalAlleles) { + final Allele vcRef = vcAlleles.get(0); + if (!vcRef.isReference()) throw new IllegalStateException("the first allele of the vc allele list must be reference"); + final byte[] refBases = refAllele.getBases(); + final int extraBaseCount = refBases.length - vcRef.getBases().length; + if (extraBaseCount < 0) throw new IllegalStateException("the wrong reference was selected"); + final List result = new ArrayList<>(vcAlleles.size()); + + for (final Allele a : vcAlleles) { + if (a.isReference()) { + result.add(refAllele); + } else if (a.isSymbolic()) { + result.add(a); + // we always skip when adding to finalAlleles this is done outside if applies. + if (!a.equals(NON_REF_SYMBOLIC_ALLELE)) + finalAlleles.add(a); + } else if (a.isCalled()) { + final Allele newAllele; + if (extraBaseCount > 0) { + final byte[] oldBases = a.getBases(); + final byte[] newBases = Arrays.copyOf(oldBases,oldBases.length + extraBaseCount); + System.arraycopy(refBases,refBases.length - extraBaseCount,newBases,oldBases.length,extraBaseCount); + newAllele = Allele.create(newBases,false); + } else + newAllele = a; + result.add(newAllele); + finalAlleles.add(newAllele); + } else { // NO_CALL and strange miscellanea + result.add(a); + } + } + return result; + } + + public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) { + final GenotypesContext newGs = GenotypesContext.create(genotypes.size()); + + for ( final Genotype g : genotypes ) { + newGs.add(removePLsAndAD(g)); + } + + return newGs; + } + + /** + * Updates the PLs and AD of the Genotypes in the newly selected VariantContext to reflect the fact that some alleles + * from the original VariantContext 
are no longer present. + * + * @param selectedVC the selected (new) VariantContext + * @param originalVC the original VariantContext + * @return a new non-null GenotypesContext + */ + public static GenotypesContext updatePLsAndAD(final VariantContext selectedVC, final VariantContext originalVC) { + final int numNewAlleles = selectedVC.getAlleles().size(); + final int numOriginalAlleles = originalVC.getAlleles().size(); + + // if we have more alternate alleles in the selected VC than in the original VC, then something is wrong + if ( numNewAlleles > numOriginalAlleles ) + throw new IllegalArgumentException("Attempting to fix PLs and AD from what appears to be a *combined* VCF and not a selected one"); + + final GenotypesContext oldGs = selectedVC.getGenotypes(); + + // if we have the same number of alternate alleles in the selected VC as in the original VC, then we don't need to fix anything + if ( numNewAlleles == numOriginalAlleles ) + return oldGs; + + final GenotypesContext newGs = fixPLsFromSubsettedAlleles(oldGs, originalVC, selectedVC.getAlleles()); + + return fixADFromSubsettedAlleles(newGs, originalVC, selectedVC.getAlleles()); + } + + /** + * Fix the PLs for the GenotypesContext of a VariantContext that has been subset + * + * @param originalGs the original GenotypesContext + * @param originalVC the original VariantContext + * @param allelesToUse the new (sub)set of alleles to use + * @return a new non-null GenotypesContext + */ + static private GenotypesContext fixPLsFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final List likelihoodIndexesToUse = determineLikelihoodIndexesToUse(originalVC, allelesToUse); + + // create the new genotypes + return createGenotypesWithSubsettedLikelihoods(originalGs, originalVC, allelesToUse, likelihoodIndexesToUse, 
GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES); + } + + /** + * Fix the AD for the GenotypesContext of a VariantContext that has been subset + * + * @param originalGs the original GenotypesContext + * @param originalVC the original VariantContext + * @param allelesToUse the new (sub)set of alleles to use + * @return a new non-null GenotypesContext + */ + static private GenotypesContext fixADFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { + + // the bitset representing the allele indexes we want to keep + final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); + + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); + + // the samples + final List sampleIndices = originalGs.getSampleNamesOrderedByName(); + + // create the new genotypes + for ( int k = 0; k < originalGs.size(); k++ ) { + final Genotype g = originalGs.get(sampleIndices.get(k)); + newGTs.add(fixAD(g, alleleIndexesToUse, allelesToUse.size())); + } + + return newGTs; + } + + /** + * Fix the AD for the given Genotype + * + * @param genotype the original Genotype + * @param alleleIndexesToUse a bitset describing whether or not to keep a given index + * @param nAllelesToUse how many alleles we are keeping + * @return a non-null Genotype + */ + private static Genotype fixAD(final Genotype genotype, final boolean[] alleleIndexesToUse, final int nAllelesToUse) { + // if it ain't broke don't fix it + if ( !genotype.hasAD() ) + return genotype; + + final GenotypeBuilder builder = new GenotypeBuilder(genotype); + + final int[] oldAD = genotype.getAD(); + if ( oldAD.length != alleleIndexesToUse.length ) { + builder.noAD(); + } else { + final int[] newAD = new int[nAllelesToUse]; + int currentIndex = 0; + for ( int i = 0; i < oldAD.length; i++ ) { + if ( alleleIndexesToUse[i] ) + newAD[currentIndex++] = oldAD[i]; + } + builder.AD(newAD); + } + return 
builder.make(); + } + + static private Allele determineReferenceAllele(final List VCs) { + return determineReferenceAllele(VCs, null); + } + + /** + * Determines the common reference allele + * + * @param VCs the list of VariantContexts + * @param loc if not null, ignore records that do not begin at this start location + * @return possibly null Allele + */ + static private Allele determineReferenceAllele(final List VCs, final GenomeLoc loc) { + Allele ref = null; + + for ( final VariantContext vc : VCs ) { + if ( contextMatchesLoc(vc, loc) ) { + final Allele myRef = vc.getReference(); + if ( ref == null || ref.length() < myRef.length() ) + ref = myRef; + else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) + throw new TribbleException(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef)); + } + } + + return ref; + } + + public static boolean contextMatchesLoc(final VariantContext vc, final GenomeLoc loc) { + return loc == null || loc.getStart() == vc.getStart(); + } + + static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc, final Set allAlleles) { + if ( refAllele.equals(vc.getReference()) ) + return new AlleleMapper(vc); + else { + final Map map = createAlleleMapping(refAllele, vc, allAlleles); + map.put(vc.getReference(), refAllele); + return new AlleleMapper(map); + } + } + + /** + * Create an allele mapping for the given context where its reference allele must (potentially) be extended to the given allele + * + * The refAllele is the longest reference allele seen at this start site. 
+ * So imagine it is: + * refAllele: ACGTGA + * myRef: ACGT + * myAlt: A + * + * We need to remap all of the alleles in vc to include the extra GA so that + * myRef => refAllele and myAlt => AGA + * + * @param refAllele the new (extended) reference allele + * @param oneVC the Variant Context to extend + * @param currentAlleles the list of alleles already created + * @return a non-null mapping of original alleles to new (extended) ones + */ + private static Map createAlleleMapping(final Allele refAllele, + final VariantContext oneVC, + final Collection currentAlleles) { + final Allele myRef = oneVC.getReference(); + if ( refAllele.length() <= myRef.length() ) throw new IllegalStateException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); + + final byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); + + final Map map = new HashMap<>(); + for ( final Allele a : oneVC.getAlternateAlleles() ) { + if ( isUsableAlternateAllele(a) ) { + Allele extended = Allele.extend(a, extraBases); + for ( final Allele b : currentAlleles ) + if ( extended.equals(b) ) + extended = b; + map.put(a, extended); + } + } + + return map; + } + + static private boolean isUsableAlternateAllele(final Allele allele) { + return ! 
(allele.isReference() || allele.isSymbolic() ); + } + + public static List sortVariantContextsByPriority(Collection unsortedVCs, List priorityListOfVCs, GenotypeMergeType mergeOption ) { + if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null ) + throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); + + if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED ) + return new ArrayList<>(unsortedVCs); + else { + ArrayList sorted = new ArrayList<>(unsortedVCs); + Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); + return sorted; + } + } + + private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniquifySamples) { + //TODO: should we add a check for cases when the genotypeMergeOption is REQUIRE_UNIQUE + for ( final Genotype g : oneVC.getGenotypes() ) { + final String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniquifySamples); + if ( ! mergedGenotypes.containsSample(name) ) { + // only add if the name is new + Genotype newG = g; + + if ( uniquifySamples || alleleMapping.needsRemapping() ) { + final List alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles(); + newG = new GenotypeBuilder(g).name(name).alleles(alleles).make(); + } + + mergedGenotypes.add(newG); + } + } + } + + /** + * Replaces any alleles in the list with NO CALLS, except for the generic ALT allele + * + * @param alleles list of alleles to replace + * @return non-null list of alleles + */ + private static List replaceWithNoCalls(final List alleles) { + if ( alleles == null ) throw new IllegalArgumentException("list of alleles cannot be null"); + + final List result = new ArrayList<>(alleles.size()); + for ( final Allele allele : alleles ) + result.add(allele.equals(NON_REF_SYMBOLIC_ALLELE) ? 
allele : Allele.NO_CALL); + return result; + } + + /** + * Merge into the context a new genotype represented by the given VariantContext for the provided list of target alleles. + * This method assumes that none of the alleles in the VC overlaps with any of the alleles in the set. + * + * @param mergedGenotypes the genotypes context to add to + * @param VC the Variant Context for the sample + * @param remappedAlleles the list of remapped alleles for the sample + * @param targetAlleles the list of target alleles + */ + private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes, + final VariantContext VC, + final List remappedAlleles, + final List targetAlleles) { + for ( final Genotype g : VC.getGenotypes() ) { + // only add if the name is new + final String name = g.getSampleName(); + if ( !mergedGenotypes.containsSample(name) ) { + + if ( !g.hasPL() ) { + if ( g.isNoCall() ) { + mergedGenotypes.add(g); + continue; + } + throw new UserException("cannot merge genotypes from samples without PLs; sample " + g.getSampleName() + " does not have likelihoods at position " + VC.getChr() + ":" + VC.getStart()); + } + + // we need to modify it even if it already contains all of the alleles because we need to purge the PLs out anyways + final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart()); + final int[] PLs = generatePLs(g, indexesOfRelevantAlleles); + final int[] AD = g.hasAD() ? generateAD(g.getAD(), indexesOfRelevantAlleles) : null; + + final Genotype newG = new GenotypeBuilder(g).name(name).alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).PL(PLs).AD(AD).noGQ().make(); + mergedGenotypes.add(newG); + } + } + } + + /** + * Determines the allele mapping from myAlleles to the targetAlleles, substituting the generic "" as appropriate. + * If the myAlleles set does not contain "" as an allele, it throws an exception. 
+ * + * @param remappedAlleles the list of alleles to evaluate + * @param targetAlleles the target list of alleles + * @param position position to use for error messages + * @return non-null array of ints representing indexes + */ + protected static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles, final int position) { + + if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty"); + if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty"); + + if ( !remappedAlleles.contains(NON_REF_SYMBOLIC_ALLELE) ) + throw new UserException("The list of input alleles must contain " + NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records"); + final int indexOfGenericAlt = remappedAlleles.indexOf(NON_REF_SYMBOLIC_ALLELE); + + final int[] indexMapping = new int[targetAlleles.size()]; + + // the reference alleles always match up (even if they don't appear to) + indexMapping[0] = 0; + + // create the index mapping, using the allele whenever such a mapping doesn't exist + for ( int i = 1; i < targetAlleles.size(); i++ ) { + final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i)); + indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfGenericAlt: indexOfRemappedAllele; + } + + return indexMapping; + } + + /** + * Generates new PLs given the set of indexes of the Genotype's current alleles from the original PLs. + * Throws an exception if the Genotype does not contain PLs. 
+ * + * @param genotype the genotype from which to grab PLs + * @param indexesOfRelevantAlleles the indexes of the original alleles corresponding to the new alleles + * @return non-null array of new PLs + */ + protected static int[] generatePLs(final Genotype genotype, final int[] indexesOfRelevantAlleles) { + if ( !genotype.hasPL() ) + throw new IllegalArgumentException("Cannot generate new PLs from a genotype without PLs"); + + final int[] originalPLs = genotype.getPL(); + + // assume diploid + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(indexesOfRelevantAlleles.length, 2); + final int[] newPLs = new int[numLikelihoods]; + + for ( int i = 0; i < indexesOfRelevantAlleles.length; i++ ) { + for ( int j = i; j < indexesOfRelevantAlleles.length; j++ ) { + final int originalPLindex = calculatePLindexFromUnorderedIndexes(indexesOfRelevantAlleles[i], indexesOfRelevantAlleles[j]); + if ( originalPLindex >= originalPLs.length ) + throw new IllegalStateException("The original PLs do not have enough values; accessing index " + originalPLindex + " but size is " + originalPLs.length); + + final int newPLindex = GenotypeLikelihoods.calculatePLindex(i, j); + newPLs[newPLindex] = originalPLs[originalPLindex]; + } + } + + return newPLs; + } + + /** + * Generates a new AD array by adding zeros for missing alleles given the set of indexes of the Genotype's current + * alleles from the original AD. 
+ * + * @param originalAD the original AD to extend + * @param indexesOfRelevantAlleles the indexes of the original alleles corresponding to the new alleles + * @return non-null array of new AD values + */ + protected static int[] generateAD(final int[] originalAD, final int[] indexesOfRelevantAlleles) { + if ( originalAD == null || indexesOfRelevantAlleles == null ) throw new IllegalArgumentException("The list of input AD values and alleles must not be null"); + + final int numADs = indexesOfRelevantAlleles.length; + if ( numADs == originalAD.length ) + return originalAD; + + final int[] newAD = new int[numADs]; + + for ( int i = 0; i < numADs; i++ ) { + final int oldIndex = indexesOfRelevantAlleles[i]; + if ( oldIndex >= originalAD.length ) + newAD[i] = 0; + else + newAD[i] = originalAD[oldIndex]; + } + + return newAD; + } + + /** + * This is just a safe wrapper around GenotypeLikelihoods.calculatePLindex() + * + * @param originalIndex1 the index of the first allele + * @param originalIndex2 the index of the second allele + * @return the PL index + */ + protected static int calculatePLindexFromUnorderedIndexes(final int originalIndex1, final int originalIndex2) { + // we need to make sure they are ordered correctly + return ( originalIndex2 < originalIndex1 ) ? GenotypeLikelihoods.calculatePLindex(originalIndex2, originalIndex1) : GenotypeLikelihoods.calculatePLindex(originalIndex1, originalIndex2); + } + + public static String mergedSampleName(String trackName, String sampleName, boolean uniquify ) { + return uniquify ? sampleName + "." 
+ trackName : sampleName; + } + + /** + * Trim the alleles in inputVC from the reverse direction + * + * @param inputVC a non-null input VC whose alleles might need a haircut + * @return a non-null VariantContext (may be == to inputVC) with alleles trimmed up + */ + public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { + return trimAlleles(inputVC, false, true); + } + + /** + * Trim the alleles in inputVC from the forward direction + * + * @param inputVC a non-null input VC whose alleles might need a haircut + * @return a non-null VariantContext (may be == to inputVC) with alleles trimmed up + */ + public static VariantContext forwardTrimAlleles( final VariantContext inputVC ) { + return trimAlleles(inputVC, true, false); + } + + /** + * Trim the alleles in inputVC forward and reverse, as requested + * + * @param inputVC a non-null input VC whose alleles might need a haircut + * @param trimForward should we trim up the alleles from the forward direction? + * @param trimReverse should we trim up the alleles from the reverse direction? + * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles + */ + @Ensures("result != null") + public static VariantContext trimAlleles(final VariantContext inputVC, final boolean trimForward, final boolean trimReverse) { + if ( inputVC == null ) throw new IllegalArgumentException("inputVC cannot be null"); + + if ( inputVC.getNAlleles() <= 1 || inputVC.isSNP() ) + return inputVC; + + // see whether we need to trim common reference base from all alleles + final int revTrim = trimReverse ? computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes()) : 0; + final VariantContext revTrimVC = trimAlleles(inputVC, -1, revTrim); + final int fwdTrim = trimForward ? 
computeForwardClipping(revTrimVC.getAlleles()) : -1; + final VariantContext vc= trimAlleles(revTrimVC, fwdTrim, 0); + return vc; + } + + /** + * Trim up alleles in inputVC, cutting out all bases up to fwdTrimEnd inclusive and + * the last revTrim bases from the end + * + * @param inputVC a non-null input VC + * @param fwdTrimEnd bases up to this index (can be -1) will be removed from the start of all alleles + * @param revTrim the last revTrim bases of each allele will be clipped off as well + * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles + */ + @Requires({"inputVC != null"}) + @Ensures("result != null") + protected static VariantContext trimAlleles(final VariantContext inputVC, + final int fwdTrimEnd, + final int revTrim) { + if( fwdTrimEnd == -1 && revTrim == 0 ) // nothing to do, so just return inputVC unmodified + return inputVC; + + final List alleles = new LinkedList<>(); + final Map originalToTrimmedAlleleMap = new HashMap<>(); + + for (final Allele a : inputVC.getAlleles()) { + if (a.isSymbolic()) { + alleles.add(a); + originalToTrimmedAlleleMap.put(a, a); + } else { + // get bases for current allele and create a new one with trimmed bases + final byte[] newBases = Arrays.copyOfRange(a.getBases(), fwdTrimEnd+1, a.length()-revTrim); + final Allele trimmedAllele = Allele.create(newBases, a.isReference()); + alleles.add(trimmedAllele); + originalToTrimmedAlleleMap.put(a, trimmedAllele); + } + } + + // now we can recreate new genotypes with trimmed alleles + final AlleleMapper alleleMapper = new AlleleMapper(originalToTrimmedAlleleMap); + final GenotypesContext genotypes = updateGenotypesWithMappedAlleles(inputVC.getGenotypes(), alleleMapper); + + final int start = inputVC.getStart() + (fwdTrimEnd + 1); + final VariantContextBuilder builder = new VariantContextBuilder(inputVC); + builder.start(start); + builder.stop(start + alleles.get(0).length() - 1); + builder.alleles(alleles); + builder.genotypes(genotypes); + return 
builder.make(); + } + + @Requires("originalGenotypes != null && alleleMapper != null") + protected static GenotypesContext updateGenotypesWithMappedAlleles(final GenotypesContext originalGenotypes, final AlleleMapper alleleMapper) { + final GenotypesContext updatedGenotypes = GenotypesContext.create(originalGenotypes.size()); + + for ( final Genotype genotype : originalGenotypes ) { + final List updatedAlleles = alleleMapper.remap(genotype.getAlleles()); + updatedGenotypes.add(new GenotypeBuilder(genotype).alleles(updatedAlleles).make()); + } + + return updatedGenotypes; + } + + public static int computeReverseClipping(final List unclippedAlleles, final byte[] ref) { + int clipping = 0; + boolean stillClipping = true; + + while ( stillClipping ) { + for ( final Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) + continue; + + // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong + // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). + if ( a.length() - clipping == 0 ) + return clipping - 1; + + if ( a.length() - clipping <= 0 || a.length() == 0 ) { + stillClipping = false; + } + else if ( ref.length == clipping ) { + return -1; + } + else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) { + stillClipping = false; + } + } + if ( stillClipping ) + clipping++; + } + + return clipping; + } + + /** + * Clip out any unnecessary bases off the front of the alleles + * + * The VCF spec represents alleles as block substitutions, replacing AC with A for a + * 1 bp deletion of the C. However, it's possible that we'd end up with alleles that + * contain extra bases on the left, such as GAC/GA to represent the same 1 bp deletion. + * This routine finds an offset among all alleles that can be safely trimmed + * off the left of each allele and still represent the same block substitution. 
+ * + * A/C => A/C + * AC/A => AC/A + * ACC/AC => CC/C + * AGT/CAT => AGT/CAT + * /C => /C + * + * @param unclippedAlleles a non-null list of alleles that we want to clip + * @return the offset into the alleles where we can safely clip, inclusive, or + * -1 if no clipping is tolerated. So, if the result is 0, then we can remove + * the first base of every allele. If the result is 1, we can remove the + * second base. + */ + public static int computeForwardClipping(final List unclippedAlleles) { + // cannot clip unless there's at least 1 alt allele + if ( unclippedAlleles.size() <= 1 ) + return -1; + + // we cannot forward clip any set of alleles containing a symbolic allele + int minAlleleLength = Integer.MAX_VALUE; + for ( final Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) + return -1; + minAlleleLength = Math.min(minAlleleLength, a.length()); + } + + final byte[] firstAlleleBases = unclippedAlleles.get(0).getBases(); + int indexOflastSharedBase = -1; + + // the -1 to the stop is that we can never clip off the right most base + for ( int i = 0; i < minAlleleLength - 1; i++) { + final byte base = firstAlleleBases[i]; + + for ( final Allele allele : unclippedAlleles ) { + if ( allele.getBases()[i] != base ) + return indexOflastSharedBase; + } + + indexOflastSharedBase = i; + } + + return indexOflastSharedBase; + } + + public static double computeHardyWeinbergPvalue(VariantContext vc) { + if ( vc.getCalledChrCount() == 0 ) + return 0.0; + return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount()); + } + + public static boolean requiresPaddingBase(final List alleles) { + + // see whether one of the alleles would be null if trimmed through + + for ( final String allele : alleles ) { + if ( allele.isEmpty() ) + return true; + } + + int clipping = 0; + Character currentBase = null; + + while ( true ) { + for ( final String allele : alleles ) { + if ( allele.length() - clipping == 0 ) + return true; + + char myBase = 
allele.charAt(clipping); + if ( currentBase == null ) + currentBase = myBase; + else if ( currentBase != myBase ) + return false; + } + + clipping++; + currentBase = null; + } + } + + private final static Map subsetAttributes(final CommonInfo igc, final Collection keysToPreserve) { + Map attributes = new HashMap<>(keysToPreserve.size()); + for ( final String key : keysToPreserve ) { + if ( igc.hasAttribute(key) ) + attributes.put(key, igc.getAttribute(key)); + } + return attributes; + } + + /** + * @deprecated use variant context builder version instead + * @param vc the variant context + * @param keysToPreserve the keys to preserve + * @return a pruned version of the original variant context + */ + @Deprecated + public static VariantContext pruneVariantContext(final VariantContext vc, Collection keysToPreserve ) { + return pruneVariantContext(new VariantContextBuilder(vc), keysToPreserve).make(); + } + + public static VariantContextBuilder pruneVariantContext(final VariantContextBuilder builder, Collection keysToPreserve ) { + final VariantContext vc = builder.make(); + if ( keysToPreserve == null ) keysToPreserve = Collections.emptyList(); + + // VC info + final Map attributes = subsetAttributes(vc.getCommonInfo(), keysToPreserve); + + // Genotypes + final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype g : vc.getGenotypes() ) { + final GenotypeBuilder gb = new GenotypeBuilder(g); + // remove AD, DP, PL, and all extended attributes, keeping just GT and GQ + gb.noAD().noDP().noPL().noAttributes(); + genotypes.add(gb.make()); + } + + return builder.genotypes(genotypes).attributes(attributes); + } + + public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { + // if all alleles of vc1 are a contained in alleles of vc2, return true + if (!vc1.getReference().equals(vc2.getReference())) + return false; + + for (final Allele a :vc1.getAlternateAlleles()) { + if (!vc2.getAlternateAlleles().contains(a)) 
+ return false; + } + + return true; + } + + public static Map> separateVariantContextsByType( final Collection VCs ) { + if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null."); } + + final HashMap> mappedVCs = new HashMap<>(); + for ( final VariantContext vc : VCs ) { + VariantContext.Type vcType = vc.getType(); + if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { + if( vc.getAlternateAlleles().size() > 1 ) { throw new IllegalStateException("Reference records should not have more than one alternate allele"); } + vcType = VariantContext.Type.NO_VARIATION; + } + + // look at previous variant contexts of different type. If: + // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list + // b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) + // c) neither: do nothing, just add vc to its own list + boolean addtoOwnList = true; + for (final VariantContext.Type type : VariantContext.Type.values()) { + if (type.equals(vcType)) + continue; + + if (!mappedVCs.containsKey(type)) + continue; + + List vcList = mappedVCs.get(type); + for (int k=0; k < vcList.size(); k++) { + VariantContext otherVC = vcList.get(k); + if (allelesAreSubset(otherVC,vc)) { + // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list + vcList.remove(k); + // avoid having empty lists + if (vcList.size() == 0) + mappedVCs.remove(type); + if ( !mappedVCs.containsKey(vcType) ) + mappedVCs.put(vcType, new ArrayList()); + mappedVCs.get(vcType).add(otherVC); + break; + } + else if (allelesAreSubset(vc,otherVC)) { + // vc has a type different than otherVC and its alleles are a subset of VC: add vc to otherVC's type list and don't add to its own + mappedVCs.get(type).add(vc); + addtoOwnList = false; + break; + } + } + } + if (addtoOwnList) { + if ( 
!mappedVCs.containsKey(vcType) ) + mappedVCs.put(vcType, new ArrayList()); + mappedVCs.get(vcType).add(vc); + } + } + + return mappedVCs; + } + + public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set allowedAttributes) { + if ( allowedAttributes == null ) + return vc; + + final GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { + final Map attrs = new HashMap<>(); + for ( final Map.Entry attr : genotype.getExtendedAttributes().entrySet() ) { + if ( allowedAttributes.contains(attr.getKey()) ) + attrs.put(attr.getKey(), attr.getValue()); + } + newGenotypes.add(new GenotypeBuilder(genotype).attributes(attrs).make()); + } + + return new VariantContextBuilder(vc).genotypes(newGenotypes).make(); + } + + protected static class AlleleMapper { + private VariantContext vc = null; + private Map map = null; + public AlleleMapper(VariantContext vc) { this.vc = vc; } + public AlleleMapper(Map map) { this.map = map; } + public boolean needsRemapping() { return this.map != null; } + public Collection values() { return map != null ? map.values() : vc.getAlleles(); } + public Allele remap(Allele a) { return map != null && map.containsKey(a) ? 
map.get(a) : a; } + + public List remap(List as) { + List newAs = new ArrayList<>(); + for ( final Allele a : as ) { + //System.out.printf(" Remapping %s => %s%n", a, remap(a)); + newAs.add(remap(a)); + } + return newAs; + } + + /** + * @return the list of unique values + */ + public List getUniqueMappedAlleles() { + if ( map == null ) + return Collections.emptyList(); + return new ArrayList<>(new HashSet<>(map.values())); + } + } + + private static class CompareByPriority implements Comparator, Serializable { + List priorityListOfVCs; + public CompareByPriority(List priorityListOfVCs) { + this.priorityListOfVCs = priorityListOfVCs; + } + + private int getIndex(VariantContext vc) { + int i = priorityListOfVCs.indexOf(vc.getSource()); + if ( i == -1 ) throw new IllegalArgumentException("Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getSource()); + return i; + } + + public int compare(VariantContext vc1, VariantContext vc2) { + return Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2)); + } + } + + /** + * For testing purposes only. Create a site-only VariantContext at contig:start containing alleles + * + * @param name the name of the VC + * @param contig the contig for the VC + * @param start the start of the VC + * @param alleleStrings a non-null, non-empty list of strings for the alleles. The first will be the ref allele, and others the + * alt. 
Will compute the stop of the VC from the length of the reference allele + * @return a non-null VariantContext + */ + public static VariantContext makeFromAlleles(final String name, final String contig, final int start, final List alleleStrings) { + if ( alleleStrings == null || alleleStrings.isEmpty() ) + throw new IllegalArgumentException("alleleStrings must be non-empty, non-null list"); + + final List alleles = new LinkedList<>(); + final int length = alleleStrings.get(0).length(); + + boolean first = true; + for ( final String alleleString : alleleStrings ) { + alleles.add(Allele.create(alleleString, first)); + first = false; + } + return new VariantContextBuilder(name, contig, start, start+length-1, alleles).make(); + } + + /** + * Splits the alleles for the provided variant context into its primitive parts. + * Requires that the input VC be bi-allelic, so calling methods should first call splitVariantContextToBiallelics() if needed. + * Currently works only for MNPs. + * + * @param vc the non-null VC to split + * @return a non-empty list of VCs split into primitive parts or the original VC otherwise + */ + public static List splitIntoPrimitiveAlleles(final VariantContext vc) { + if ( vc == null ) + throw new IllegalArgumentException("Trying to break a null Variant Context into primitive parts"); + + if ( !vc.isBiallelic() ) + throw new IllegalArgumentException("Trying to break a multi-allelic Variant Context into primitive parts"); + + // currently only works for MNPs + if ( !vc.isMNP() ) + return Arrays.asList(vc); + + final byte[] ref = vc.getReference().getBases(); + final byte[] alt = vc.getAlternateAllele(0).getBases(); + + if ( ref.length != alt.length ) + throw new IllegalStateException("ref and alt alleles for MNP have different lengths"); + + final List result = new ArrayList<>(ref.length); + + for ( int i = 0; i < ref.length; i++ ) { + + // if the ref and alt bases are different at a given position, create a new SNP record (otherwise do nothing) + 
if ( ref[i] != alt[i] ) { + + // create the ref and alt SNP alleles + final Allele newRefAllele = Allele.create(ref[i], true); + final Allele newAltAllele = Allele.create(alt[i], false); + + // create a new VariantContext with the new SNP alleles + final VariantContextBuilder newVC = new VariantContextBuilder(vc).start(vc.getStart() + i).stop(vc.getStart() + i).alleles(Arrays.asList(newRefAllele, newAltAllele)); + + // create new genotypes with updated alleles + final Map alleleMap = new HashMap<>(); + alleleMap.put(vc.getReference(), newRefAllele); + alleleMap.put(vc.getAlternateAllele(0), newAltAllele); + final GenotypesContext newGenotypes = updateGenotypesWithMappedAlleles(vc.getGenotypes(), new AlleleMapper(alleleMap)); + + result.add(newVC.genotypes(newGenotypes).make()); + } + } + + if ( result.isEmpty() ) + result.add(vc); + + return result; + } + + /** + * Are vc1 and 2 equal including their position and alleles? + * @param vc1 non-null VariantContext + * @param vc2 non-null VariantContext + * @return true if vc1 and vc2 are equal, false otherwise + */ + public static boolean equalSites(final VariantContext vc1, final VariantContext vc2) { + if ( vc1 == null ) throw new IllegalArgumentException("vc1 cannot be null"); + if ( vc2 == null ) throw new IllegalArgumentException("vc2 cannot be null"); + + if ( vc1.getStart() != vc2.getStart() ) return false; + if ( vc1.getEnd() != vc2.getEnd() ) return false; + if ( ! vc1.getChr().equals(vc2.getChr())) return false; + if ( ! vc1.getAlleles().equals(vc2.getAlleles()) ) return false; + return true; + } + + /** + * Returns the absolute 0-based index of an allele. + * + *

+ * If the allele is equal to the reference, the result is 0, if it equal to the first alternative the result is 1 + * and so forth. + *

+ * Therefore if you want the 0-based index within the alternative alleles you need to do the following: + * + *

+ * You can indicate whether the Java object reference comparator {@code ==} can be safely used by setting {@code useEquals} to {@code false}.
+ *
+ * @param vc the target variant context.
+ * @param allele the target allele.
+ * @param ignoreRefState whether the reference state of the allele is important at all. Has no effect if {@code useEquals} is {@code false}.
+ * @param considerRefAllele whether the reference allele should be considered. You should set it to {@code false} if you are only interested in alternative alleles.
+ * @param useEquals whether equal method should be used in the search: {@link Allele#equals(Allele,boolean)}.
+ *
+ * @throws IllegalArgumentException if {@code allele} is {@code null}.
+ * @return {@code -1} if there is no such allele that satisfy those criteria, a value between 0 and {@link VariantContext#getNAlleles()} {@code -1} otherwise.
+ */
+ public static int indexOfAllele(final VariantContext vc, final Allele allele, final boolean ignoreRefState, final boolean considerRefAllele, final boolean useEquals) {
+ if (allele == null) throw new IllegalArgumentException();
+ return useEquals ? indexOfEqualAllele(vc,allele,ignoreRefState,considerRefAllele) : indexOfSameAllele(vc,allele,considerRefAllele);
+ }
+
+ /**
+ * Returns the relative 0-based index of an alternative allele.
+ *

+ * If the query allele is the same as the first alternative allele, the result is 0,
+ * if it is equal to the second, 1, and so forth.
+ *
+ *
+ *

+ * Notice that the ref-status of the query {@code allele} is ignored.
+ *
+ * @param vc the target variant context.
+ * @param allele the query allele.
+ * @param useEquals whether equal method should be used in the search: {@link Allele#equals(Allele,boolean)}.
+ *
+ * @throws IllegalArgumentException if {@code allele} is {@code null}.
+ *
+ * @return {@code -1} if there is no such allele that satisfy those criteria, a value between 0 and the number
+ * of alternative alleles - 1.
+ */
+ public static int indexOfAltAllele(final VariantContext vc, final Allele allele, final boolean useEquals) {
+ final int absoluteIndex = indexOfAllele(vc,allele,true,false,useEquals);
+ return absoluteIndex == -1 ? -1 : absoluteIndex - 1;
+ }
+
+ // Implements index search using equals.
+ private static int indexOfEqualAllele(final VariantContext vc, final Allele allele, final boolean ignoreRefState,
+ final boolean considerRefAllele) {
+ int i = 0;
+ for (final Allele a : vc.getAlleles())
+ if (a.equals(allele,ignoreRefState))
+ return i == 0 ? (considerRefAllele ? 0 : -1) : i;
+ else
+ i++;
+ return -1;
+ }
+
+ // Implements index search using ==.
+ private static int indexOfSameAllele(final VariantContext vc, final Allele allele, final boolean considerRefAllele) {
+ int i = 0;
+
+ for (final Allele a : vc.getAlleles())
+ if (a == allele)
+ return i == 0 ? (considerRefAllele ?
0 : -1) : i; + else + i++; + + return -1; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/wiggle/WiggleHeader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/wiggle/WiggleHeader.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/wiggle/WiggleHeader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/wiggle/WiggleHeader.java diff --git a/public/java/src/org/broadinstitute/sting/utils/wiggle/WiggleWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/wiggle/WiggleWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/wiggle/WiggleWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/wiggle/WiggleWriter.java diff --git a/public/keys/GATK_public.key b/public/gatk-framework/src/main/resources/GATK_public.key similarity index 100% rename from public/keys/GATK_public.key rename to public/gatk-framework/src/main/resources/GATK_public.key diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_access.key b/public/gatk-framework/src/main/resources/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_access.key similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_access.key rename to public/gatk-framework/src/main/resources/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_access.key diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_secret.key b/public/gatk-framework/src/main/resources/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_secret.key similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_secret.key rename to public/gatk-framework/src/main/resources/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_secret.key diff --git 
a/public/R/scripts/org/broadinstitute/sting/gatk/walkers/variantrecalibration/plot_Tranches.R b/public/gatk-framework/src/main/resources/org/broadinstitute/sting/gatk/walkers/variantrecalibration/plot_Tranches.R similarity index 100% rename from public/R/scripts/org/broadinstitute/sting/gatk/walkers/variantrecalibration/plot_Tranches.R rename to public/gatk-framework/src/main/resources/org/broadinstitute/sting/gatk/walkers/variantrecalibration/plot_Tranches.R diff --git a/public/R/scripts/org/broadinstitute/sting/utils/recalibration/BQSR.R b/public/gatk-framework/src/main/resources/org/broadinstitute/sting/utils/recalibration/BQSR.R similarity index 100% rename from public/R/scripts/org/broadinstitute/sting/utils/recalibration/BQSR.R rename to public/gatk-framework/src/main/resources/org/broadinstitute/sting/utils/recalibration/BQSR.R diff --git a/public/java/test/net/sf/samtools/GATKBAMFileSpanUnitTest.java b/public/gatk-framework/src/test/java/net/sf/samtools/GATKBAMFileSpanUnitTest.java similarity index 100% rename from public/java/test/net/sf/samtools/GATKBAMFileSpanUnitTest.java rename to public/gatk-framework/src/test/java/net/sf/samtools/GATKBAMFileSpanUnitTest.java diff --git a/public/java/test/net/sf/samtools/GATKChunkUnitTest.java b/public/gatk-framework/src/test/java/net/sf/samtools/GATKChunkUnitTest.java similarity index 100% rename from public/java/test/net/sf/samtools/GATKChunkUnitTest.java rename to public/gatk-framework/src/test/java/net/sf/samtools/GATKChunkUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/BaseTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/BaseTest.java new file mode 100644 index 000000000..e8aed7d50 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/BaseTest.java @@ -0,0 +1,527 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and 
associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting; + +import org.apache.log4j.AppenderSkeleton; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.PatternLayout; +import org.apache.log4j.spi.LoggingEvent; +import org.broad.tribble.readers.LineIterator; +import org.broad.tribble.readers.PositionalBufferedStream; +import org.broadinstitute.sting.commandline.CommandLineUtils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.crypt.CryptUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.io.IOUtils; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.bcf2.BCF2Codec; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.broadinstitute.variant.vcf.VCFConstants; +import 
org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.variant.vcf.VCFHeaderLine; +import org.testng.Assert; +import org.testng.Reporter; +import org.testng.SkipException; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +/** + * + * User: aaron + * Date: Apr 14, 2009 + * Time: 10:24:30 AM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date Apr 14, 2009 + *

+ * Class BaseTest + *

+ * This is the base test class for all of our test cases. All test cases should extend from this + * class; it sets up the logger, and resolves the location of directories that we rely on. + */ +@SuppressWarnings("unchecked") +public abstract class BaseTest { + /** our log, which we want to capture anything from org.broadinstitute.sting */ + public static final Logger logger = CommandLineUtils.getStingLogger(); + + public static final String hg18Reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"; + public static final String hg19Reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"; + public static final String b36KGReference = "/humgen/1kg/reference/human_b36_both.fasta"; + //public static final String b37KGReference = "/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta"; + public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta"; + public static final String b37KGReferenceWithDecoy = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37_decoy.fasta"; + public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/"; + public static final String validationDataLocation = GATKDataLocation + "Validation_Data/"; + public static final String evaluationDataLocation = GATKDataLocation + "Evaluation_Data/"; + public static final String comparisonDataLocation = GATKDataLocation + "Comparisons/"; + public static final String annotationDataLocation = GATKDataLocation + "Annotations/"; + + public static final String b37GoodBAM = validationDataLocation + "/CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + public static final String b37GoodNA12878BAM = validationDataLocation + "/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; + public static final String b37_NA12878_OMNI = validationDataLocation + "/NA12878.omni.vcf"; + + public static final String dbsnpDataLocation = GATKDataLocation; + public static final String b36dbSNP129 = dbsnpDataLocation + 
"dbsnp_129_b36.vcf"; + public static final String b37dbSNP129 = dbsnpDataLocation + "dbsnp_129_b37.vcf"; + public static final String b37dbSNP132 = dbsnpDataLocation + "dbsnp_132_b37.vcf"; + public static final String hg18dbSNP132 = dbsnpDataLocation + "dbsnp_132.hg18.vcf"; + + public static final String hapmapDataLocation = comparisonDataLocation + "Validated/HapMap/3.3/"; + public static final String b37hapmapGenotypes = hapmapDataLocation + "genotypes_r27_nr.b37_fwd.vcf"; + + public static final String intervalsLocation = "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/"; + public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"; + public static final String hg19Chr20Intervals = GATKDataLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"; + + public static final boolean REQUIRE_NETWORK_CONNECTION = false; + private static final String networkTempDirRoot = "/broad/hptmp/"; + private static final boolean networkTempDirRootExists = new File(networkTempDirRoot).exists(); + private static final File networkTempDirFile; + + private static final String privateTestDirRelative = "private/testdata/"; + public static final String privateTestDir = new File(privateTestDirRelative).getAbsolutePath() + "/"; + protected static final String privateTestDirRoot = privateTestDir.replace(privateTestDirRelative, ""); + + private static final String publicTestDirRelative = "public/testdata/"; + public static final String publicTestDir = new File(publicTestDirRelative).getAbsolutePath() + "/"; + protected static final String publicTestDirRoot = publicTestDir.replace(publicTestDirRelative, ""); + + public static final String keysDataLocation = validationDataLocation + "keys/"; + public static final String gatkKeyFile = CryptUtils.GATK_USER_KEY_DIRECTORY + "gsamembers_broadinstitute.org.key"; + + 
public static final String exampleFASTA = publicTestDir + "exampleFASTA.fasta"; + + public final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam"; + public final static String NA12878_WEx = privateTestDir + "CEUTrio.HiSeq.WEx.b37_decoy.NA12878.20_10_11mb.bam"; + + public static final boolean pipelineTestRunModeIsSet = System.getProperty("pipeline.run", "").equals("run"); + + /** before the class starts up */ + static { + // setup a basic log configuration + CommandLineUtils.configureConsoleLogging(); + + // setup our log layout + PatternLayout layout = new PatternLayout(); + layout.setConversionPattern("TEST %C{1}.%M - %d{HH:mm:ss,SSS} - %m%n"); + + // now set the layout of all the loggers to our layout + CommandLineUtils.setLayout(logger, layout); + + // Set the Root logger to only output warnings. + logger.setLevel(Level.WARN); + + if (networkTempDirRootExists) { + networkTempDirFile = IOUtils.tempDir("temp.", ".dir", new File(networkTempDirRoot + System.getProperty("user.name"))); + networkTempDirFile.deleteOnExit(); + } else { + networkTempDirFile = null; + } + + + if ( REQUIRE_NETWORK_CONNECTION ) { + // find our file sources + if (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) { + logger.fatal("We can't locate the reference directories. Aborting!"); + throw new RuntimeException("BaseTest setup failed: unable to locate the reference directories"); + } + } + } + + /** + * Simple generic utility class to creating TestNG data providers: + * + * 1: inherit this class, as in + * + * private class SummarizeDifferenceTest extends TestDataProvider { + * public SummarizeDifferenceTest() { + * super(SummarizeDifferenceTest.class); + * } + * ... + * } + * + * Provide a reference to your class to the TestDataProvider constructor. + * + * 2: Create instances of your subclass. 
Return from it the call to getTests, providing + * the class type of your test + * + * + * {@literal @}DataProvider(name = "summaries") + * public Object[][] createSummaries() { + * new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2"); + * new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1"); + * return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class); + * } + * + * + * This class magically tracks created objects of this type. + */ + public static class TestDataProvider { + private static final Map> tests = new HashMap<>(); + protected String name; + + /** + * Create a new TestDataProvider instance bound to the class variable C + */ + public TestDataProvider(Class c, String name) { + if ( ! tests.containsKey(c) ) + tests.put(c, new ArrayList<>()); + tests.get(c).add(this); + this.name = name; + } + + public TestDataProvider(Class c) { + this(c, ""); + } + + public void setName(final String name) { + this.name = name; + } + + /** + * Return all of the data providers in the form expected by TestNG of type class C + * @param c + * @return + */ + public static Object[][] getTests(Class c) { + List params2 = new ArrayList(); + for ( Object x : tests.get(c) ) params2.add(new Object[]{x}); + return params2.toArray(new Object[][]{}); + } + + @Override + public String toString() { + return "TestDataProvider("+name+")"; + } + } + + /** + * test if the file exists + * + * @param file name as a string + * @return true if it exists + */ + public static boolean fileExist(String file) { + File temp = new File(file); + return temp.exists(); + } + + /** + * this appender looks for a specific message in the log4j stream. + * It can be used to verify that a specific message was generated to the logging system. 
+ */ + public static class ValidationAppender extends AppenderSkeleton { + + private boolean foundString = false; + private String targetString = ""; + + public ValidationAppender(String target) { + targetString = target; + } + + @Override + protected void append(LoggingEvent loggingEvent) { + if (loggingEvent.getMessage().equals(targetString)) + foundString = true; + } + + public void close() { + // do nothing + } + + public boolean requiresLayout() { + return false; + } + + public boolean foundString() { + return foundString; + } + } + + /** + * Creates a temp file that will be deleted on exit after tests are complete. + * @param name Prefix of the file. + * @param extension Extension to concat to the end of the file. + * @return A file in the temporary directory starting with name, ending with extension, which will be deleted after the program exits. + */ + public static File createTempFile(String name, String extension) { + try { + File file = File.createTempFile(name, extension); + file.deleteOnExit(); + return file; + } catch (IOException ex) { + throw new ReviewedStingException("Cannot create temp file: " + ex.getMessage(), ex); + } + } + + /** + * Creates a temp file that will be deleted on exit after tests are complete. + * @param name Name of the file. + * @return A file in the network temporary directory with name, which will be deleted after the program exits. + * @throws SkipException when the network is not available. 
+ */ + public static File tryCreateNetworkTempFile(String name) { + if (!networkTempDirRootExists) + throw new SkipException("Network temporary directory does not exist: " + networkTempDirRoot); + File file = new File(networkTempDirFile, name); + file.deleteOnExit(); + return file; + } + + /** + * Log this message so that it shows up inline during output as well as in html reports + * + * @param message + */ + public static void log(final String message) { + Reporter.log(message, true); + } + + private static final double DEFAULT_FLOAT_TOLERANCE = 1e-1; + + public static final void assertEqualsDoubleSmart(final Object actual, final Double expected) { + Assert.assertTrue(actual instanceof Double, "Not a double"); + assertEqualsDoubleSmart((double)(Double)actual, (double)expected); + } + + public static final void assertEqualsDoubleSmart(final Object actual, final Double expected, final double tolerance) { + Assert.assertTrue(actual instanceof Double, "Not a double"); + assertEqualsDoubleSmart((double)(Double)actual, (double)expected, tolerance); + } + + public static final void assertEqualsDoubleSmart(final double actual, final double expected) { + assertEqualsDoubleSmart(actual, expected, DEFAULT_FLOAT_TOLERANCE); + } + + public static final void assertEqualsSet(final Set actual, final Set expected, final String info) { + final Set actualSet = new HashSet(actual); + final Set expectedSet = new HashSet(expected); + Assert.assertTrue(actualSet.equals(expectedSet), info); // note this is necessary due to testng bug for set comps + } + + public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) { + assertEqualsDoubleSmart(actual, expected, tolerance, null); + } + + public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance, final String message) { + if ( Double.isNaN(expected) ) // NaN == NaN => false unfortunately + Assert.assertTrue(Double.isNaN(actual), "expected 
is nan, actual is not"); + else if ( Double.isInfinite(expected) ) // NaN == NaN => false unfortunately + Assert.assertTrue(Double.isInfinite(actual), "expected is infinite, actual is not"); + else { + final double delta = Math.abs(actual - expected); + final double ratio = Math.abs(actual / expected - 1.0); + Assert.assertTrue(delta < tolerance || ratio < tolerance, "expected = " + expected + " actual = " + actual + + " not within tolerance " + tolerance + + (message == null ? "" : "message: " + message)); + } + } + + public static void assertVariantContextsAreEqual( final VariantContext actual, final VariantContext expected ) { + Assert.assertNotNull(actual, "VariantContext expected not null"); + Assert.assertEquals(actual.getChr(), expected.getChr(), "chr"); + Assert.assertEquals(actual.getStart(), expected.getStart(), "start"); + Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end"); + Assert.assertEquals(actual.getID(), expected.getID(), "id"); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual); + + assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); + Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied"); + Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "isFiltered"); + assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters"); + assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual()); + + Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes"); + if ( expected.hasGenotypes() ) { + assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set"); + Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names"); + final Set samples = expected.getSampleNames(); + for ( final String sample : samples ) { + assertGenotypesAreEqual(actual.getGenotype(sample), 
expected.getGenotype(sample)); + } + } + } + + public static void assertVariantContextStreamsAreEqual(final Iterable actual, final Iterable expected) { + final Iterator actualIT = actual.iterator(); + final Iterator expectedIT = expected.iterator(); + + while ( expectedIT.hasNext() ) { + final VariantContext expectedVC = expectedIT.next(); + if ( expectedVC == null ) + continue; + + VariantContext actualVC; + do { + Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual"); + actualVC = actualIT.next(); + } while ( actualIT.hasNext() && actualVC == null ); + + if ( actualVC == null ) + Assert.fail("Too few records in actual"); + + assertVariantContextsAreEqual(actualVC, expectedVC); + } + Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual"); + } + + + public static void assertGenotypesAreEqual(final Genotype actual, final Genotype expected) { + Assert.assertEquals(actual.getSampleName(), expected.getSampleName(), "Genotype names"); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "Genotype alleles"); + Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString(), "Genotype string"); + Assert.assertEquals(actual.getType(), expected.getType(), "Genotype type"); + + // filters are the same + Assert.assertEquals(actual.getFilters(), expected.getFilters(), "Genotype fields"); + Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "Genotype isFiltered"); + + // inline attributes + Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp"); + Assert.assertTrue(Arrays.equals(actual.getAD(), expected.getAD())); + Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq"); + Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL"); + Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD"); + Assert.assertEquals(actual.hasGQ(), expected.hasGQ(), "Genotype hasGQ"); + Assert.assertEquals(actual.hasDP(), expected.hasDP(), "Genotype hasDP"); + + 
Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods(), "Genotype haslikelihoods"); + Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString(), "Genotype getlikelihoodsString"); + Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods(), "Genotype getLikelihoods"); + Assert.assertTrue(Arrays.equals(actual.getPL(), expected.getPL())); + + Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual(), "Genotype phredScaledQual"); + assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes()); + Assert.assertEquals(actual.isPhased(), expected.isPhased(), "Genotype isPhased"); + Assert.assertEquals(actual.getPloidy(), expected.getPloidy(), "Genotype getPloidy"); + } + + public static void assertVCFHeadersAreEqual(final VCFHeader actual, final VCFHeader expected) { + Assert.assertEquals(actual.getMetaDataInSortedOrder().size(), expected.getMetaDataInSortedOrder().size(), "No VCF header lines"); + + // for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted? 
+ //Assert.assertEquals(actual.getMetaDataInInputOrder(), expected.getMetaDataInInputOrder()); + final List actualLines = new ArrayList(actual.getMetaDataInSortedOrder()); + final List expectedLines = new ArrayList(expected.getMetaDataInSortedOrder()); + for ( int i = 0; i < actualLines.size(); i++ ) { + Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines"); + } + } + + public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException { + final Pair> vcfData = GATKVCFUtils.readAllVCs(vcfFile, new VCFCodec()); + final Pair> bcfData = GATKVCFUtils.readAllVCs(bcfFile, new BCF2Codec()); + assertVCFHeadersAreEqual(bcfData.getFirst(), vcfData.getFirst()); + assertVariantContextStreamsAreEqual(bcfData.getSecond(), vcfData.getSecond()); + } + + private static void assertAttributeEquals(final String key, final Object actual, final Object expected) { + if ( expected instanceof Double ) { + // must be very tolerant because doubles are being rounded to 2 sig figs + assertEqualsDoubleSmart(actual, (Double) expected, 1e-2); + } else + Assert.assertEquals(actual, expected, "Attribute " + key); + } + + private static void assertAttributesEquals(final Map actual, Map expected) { + final Set expectedKeys = new HashSet(expected.keySet()); + + for ( final Map.Entry act : actual.entrySet() ) { + final Object actualValue = act.getValue(); + if ( expected.containsKey(act.getKey()) && expected.get(act.getKey()) != null ) { + final Object expectedValue = expected.get(act.getKey()); + if ( expectedValue instanceof List ) { + final List expectedList = (List)expectedValue; + Assert.assertTrue(actualValue instanceof List, act.getKey() + " should be a list but isn't"); + final List actualList = (List)actualValue; + Assert.assertEquals(actualList.size(), expectedList.size(), act.getKey() + " size"); + for ( int i = 0; i < expectedList.size(); i++ ) + assertAttributeEquals(act.getKey(), actualList.get(i), expectedList.get(i)); 
+ } else + assertAttributeEquals(act.getKey(), actualValue, expectedValue); + } else { + // it's ok to have a binding in x -> null that's absent in y + Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other"); + } + expectedKeys.remove(act.getKey()); + } + + // now expectedKeys contains only the keys found in expected but not in actual, + // and they must all be null + for ( final String missingExpected : expectedKeys ) { + final Object value = expected.get(missingExpected); + Assert.assertTrue(isMissing(value), "Attribute " + missingExpected + " missing in one but not in other" ); + } + } + + private static final boolean isMissing(final Object value) { + if ( value == null ) return true; + else if ( value.equals(VCFConstants.MISSING_VALUE_v4) ) return true; + else if ( value instanceof List ) { + // handles the case where all elements are null or the list is empty + for ( final Object elt : (List)value) + if ( elt != null ) + return false; + return true; + } else + return false; + } + + /** + * Checks whether two double array contain the same values or not. + * @param actual actual produced array. + * @param expected expected array. + * @param tolerance maximum difference between double value to be consider equivalent. 
+ */ + protected static void assertEqualsDoubleArray(final double[] actual, final double[] expected, final double tolerance) { + if (expected == null) + Assert.assertNull(actual); + else { + Assert.assertNotNull(actual); + Assert.assertEquals(actual.length,expected.length,"array length"); + } + for (int i = 0; i < actual.length; i++) + Assert.assertEquals(actual[i],expected[i],tolerance,"array position " + i); + } +} diff --git a/public/java/test/org/broadinstitute/sting/ExampleToCopyUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/ExampleToCopyUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/ExampleToCopyUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/ExampleToCopyUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/MD5DB.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/MD5DB.java new file mode 100644 index 000000000..4c0f8b11a --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/MD5DB.java @@ -0,0 +1,313 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting; + +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.diffengine.DiffEngine; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.*; +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/18/11 + * Time: 9:10 AM + * + * Utilities for manipulating the MD5 database of previous results + */ +public class MD5DB { + public static final Logger logger = Logger.getLogger(MD5DB.class); + + /** + * Subdirectory under the ant build directory where we store integration test md5 results + */ + private static final int MAX_RECORDS_TO_READ = 1000000; + private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = -1; + public static final String LOCAL_MD5_DB_DIR = "integrationtests"; + public static final String GLOBAL_MD5_DB_DIR = "/humgen/gsa-hpprojects/GATK/data/integrationtests"; + + // tracking and emitting a data file of original and new md5s + private final File MD5MismatchesFile; + private final PrintStream md5MismatchStream; + + public MD5DB() { + this(new File(MD5DB.LOCAL_MD5_DB_DIR + "/md5mismatches.txt")); + } + + public MD5DB(final File MD5MismatchesFile) { + this.MD5MismatchesFile = MD5MismatchesFile; + + ensureMd5DbDirectory(); + + logger.debug("Creating md5 mismatch db at " + MD5MismatchesFile); + try { + md5MismatchStream = new PrintStream(new FileOutputStream(MD5MismatchesFile)); + md5MismatchStream.printf("%s\t%s\t%s%n", "expected", "observed", "test"); + } catch ( FileNotFoundException e ) { + throw new ReviewedStingException("Failed to open md5 mismatch file", e); + } + + }
+ + public void close() { + if ( md5MismatchStream != null ) { + logger.debug("Closeing md5 mismatch db at " + MD5MismatchesFile); + md5MismatchStream.close(); + } + } + + // ---------------------------------------------------------------------- + // + // MD5 DB stuff + // + // ---------------------------------------------------------------------- + + /** + * Create the MD5 file directories if necessary + */ + private void ensureMd5DbDirectory() { + File dir = new File(LOCAL_MD5_DB_DIR); + if ( ! dir.exists() ) { + System.out.printf("##### Creating MD5 db %s%n", LOCAL_MD5_DB_DIR); + if ( ! dir.mkdir() ) { + // Need to check AGAIN whether the dir exists, because we might be doing multi-process parallelism + // within the same working directory, and another GATK instance may have come along and created the + // directory between the calls to exists() and mkdir() above. + if ( ! dir.exists() ) { + throw new ReviewedStingException("Infrastructure failure: failed to create md5 directory " + LOCAL_MD5_DB_DIR); + } + } + } + } + + /** + * Returns the path to an already existing file with the md5 contents, or valueIfNotFound + * if no such file exists in the db. + * + * @param md5 + * @param valueIfNotFound + * @return + */ + public String getMD5FilePath(final String md5, final String valueIfNotFound) { + // we prefer the global db to the local DB, so match it first + for ( String dir : Arrays.asList(GLOBAL_MD5_DB_DIR, LOCAL_MD5_DB_DIR)) { + File f = getFileForMD5(md5, dir); + if ( f.exists() && f.canRead() ) + return f.getAbsolutePath(); + } + + return valueIfNotFound; + } + + /** + * Utility function that given a file's md5 value and the path to the md5 db, + * returns the canonical name of the file. 
For example, if md5 is XXX and db is YYY, + * this will return YYY/XXX.integrationtest + * + * @param md5 + * @param dbPath + * @return + */ + private File getFileForMD5(final String md5, final String dbPath) { + final String basename = String.format("%s.integrationtest", md5); + return new File(dbPath + "/" + basename); + } + + /** + * Copies the results file with md5 value to its canonical file name and db places + * + * @param md5 + * @param resultsFile + */ + private void updateMD5Db(final String md5, final File resultsFile) { + copyFileToDB(getFileForMD5(md5, LOCAL_MD5_DB_DIR), resultsFile); + copyFileToDB(getFileForMD5(md5, GLOBAL_MD5_DB_DIR), resultsFile); + } + + /** + * Low-level utility routine that copies resultsFile to dbFile + * @param dbFile + * @param resultsFile + */ + private void copyFileToDB(File dbFile, final File resultsFile) { + if ( ! dbFile.exists() ) { + // the file isn't already in the db, copy it over + System.out.printf("##### Updating MD5 file: %s%n", dbFile.getPath()); + try { + FileUtils.copyFile(resultsFile, dbFile); + } catch ( IOException e ) { + System.out.printf("##### Skipping update, cannot write file %s%n", dbFile); + } + } else { + //System.out.printf("##### MD5 file is up to date: %s%n", dbFile.getPath()); + } + } + + /** + * Returns the byte[] of the entire contents of file, for md5 calculations + * @param file + * @return + * @throws IOException + */ + private static byte[] getBytesFromFile(File file) throws IOException { + InputStream is = new FileInputStream(file); + + // Get the size of the file + long length = file.length(); + + if (length > Integer.MAX_VALUE) { + // File is too large + } + + // Create the byte array to hold the data + byte[] bytes = new byte[(int) length]; + + // Read in the bytes + int offset = 0; + int numRead = 0; + while (offset < bytes.length + && (numRead = is.read(bytes, offset, bytes.length - offset)) >= 0) { + offset += numRead; + } + + // Ensure all the bytes have been read in + if (offset < 
bytes.length) { + throw new IOException("Could not completely read file " + file.getName()); + } + + // Close the input stream and return bytes + is.close(); + return bytes; + } + + public static class MD5Match { + public final String actualMD5, expectedMD5; + public final String failMessage; + public final String diffEngineOutput; + public final boolean failed; + + public MD5Match(final String actualMD5, final String expectedMD5, final String failMessage, final String diffEngineOutput, final boolean failed) { + this.actualMD5 = actualMD5; + this.expectedMD5 = expectedMD5; + this.failMessage = failMessage; + this.diffEngineOutput = diffEngineOutput; + this.failed = failed; + } + } + + /** + * Tests a file MD5 against an expected value, returning an MD5Match object containing a description of the + * match or mismatch. In case of a mismatch, outputs a description of the mismatch to various log files/streams. + * + * NOTE: This function WILL NOT throw an exception if the MD5s are different. + * + * @param testName Name of the test. + * @param testClassName Name of the class that contains the test. + * @param resultsFile File to MD5. + * @param expectedMD5 Expected MD5 value. + * @param parameterize If true or if expectedMD5 is an empty string, will print out the calculated MD5 instead of error text. + * @return an MD5Match object containing a description of the match/mismatch. 
Will have its "failed" field set + * to true if there was a mismatch (unless we're using the "parameterize" argument) + */ + public MD5Match testFileMD5(final String testName, final String testClassName, final File resultsFile, final String expectedMD5, final boolean parameterize) { + final String actualMD5 = calculateFileMD5(resultsFile); + String diffEngineOutput = ""; + String failMessage = ""; + boolean failed = false; + + // copy md5 to integrationtests + updateMD5Db(actualMD5, resultsFile); + + if (parameterize || expectedMD5.equals("")) { + BaseTest.log(String.format("PARAMETERIZATION: file %s has md5 = %s", resultsFile, actualMD5)); + } else if ( ! expectedMD5.equals(actualMD5) ) { + failed = true; + failMessage = String.format("%s:%s has mismatching MD5s: expected=%s observed=%s", testClassName, testName, expectedMD5, actualMD5); + diffEngineOutput = logMD5MismatchAndGetDiffEngineOutput(testName, testClassName, expectedMD5, actualMD5); + } + + return new MD5Match(actualMD5, expectedMD5, failMessage, diffEngineOutput, failed); + } + + /** + * Calculates the MD5 for the specified file and returns it as a String + * + * @param file file whose MD5 to calculate + * @return file's MD5 in String form + * @throws RuntimeException if the file could not be read + */ + public String calculateFileMD5( final File file ) { + try { + return Utils.calcMD5(getBytesFromFile(file)); + } + catch ( Exception e ) { + throw new RuntimeException("Failed to read bytes from file: " + file + " for MD5 calculation", e); + } + } + + /** + * Logs a description (including diff engine output) of the MD5 mismatch between the expectedMD5 + * and actualMD5 to a combination of BaseTest.log(), the md5MismatchStream, and stdout, then returns + * the diff engine output. 
+ * + * @param testName name of the test that generated the mismatch + * @param testClassName name of the class containing the test that generated the mismatch + * @param expectedMD5 the MD5 we were expecting from this test + * @param actualMD5 the MD5 we actually calculated from the test output + * @return the diff engine output produced while logging the description of the mismatch + */ + private String logMD5MismatchAndGetDiffEngineOutput(final String testName, final String testClassName, final String expectedMD5, final String actualMD5) { + System.out.printf("##### Test %s:%s is going to fail #####%n", testClassName, testName); + String pathToExpectedMD5File = getMD5FilePath(expectedMD5, "[No DB file found]"); + String pathToFileMD5File = getMD5FilePath(actualMD5, "[No DB file found]"); + BaseTest.log(String.format("expected %s", expectedMD5)); + BaseTest.log(String.format("calculated %s", actualMD5)); + BaseTest.log(String.format("diff %s %s", pathToExpectedMD5File, pathToFileMD5File)); + + md5MismatchStream.printf("%s\t%s\t%s%n", expectedMD5, actualMD5, testName); + md5MismatchStream.flush(); + + // inline differences + String diffEngineOutput = ""; + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final PrintStream ps = new PrintStream(baos); + DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0, MAX_RAW_DIFFS_TO_SUMMARIZE, false); + boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), MAX_RECORDS_TO_READ, params); + if ( success ) { + diffEngineOutput = baos.toString(); + BaseTest.log(diffEngineOutput); + System.out.printf("Note that the above list is not comprehensive. At most 20 lines of output, and 10 specific differences will be listed. 
Please use -T DiffObjects -R public/testdata/exampleFASTA.fasta -m %s -t %s to explore the differences more freely%n", + pathToExpectedMD5File, pathToFileMD5File); + } + ps.close(); + + return diffEngineOutput; + } +} diff --git a/public/java/test/org/broadinstitute/sting/MD5Mismatch.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/MD5Mismatch.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/MD5Mismatch.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/MD5Mismatch.java diff --git a/public/java/test/org/broadinstitute/sting/StingTextReporter.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/StingTextReporter.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/StingTextReporter.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/StingTextReporter.java diff --git a/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/TestNGTestTransformer.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/TestNGTestTransformer.java diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/WalkerTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/WalkerTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/WalkerTest.java diff --git a/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSiteUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ArgumentMatchSiteUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSiteUnitTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ArgumentMatchSiteUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSourceUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ArgumentMatchSourceUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSourceUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ArgumentMatchSourceUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/commandline/ArgumentTypeDescriptorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/commandline/ArgumentTypeDescriptorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java new file mode 100644 index 000000000..29ba95963 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java @@ -0,0 +1,1092 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of 
this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.commandline; + +import org.apache.commons.io.FileUtils; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.testng.Assert; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.EnumSet; +/** + * Test suite for the parsing engine. 
+ */ +public class ParsingEngineUnitTest extends BaseTest { + /** we absolutely cannot have this file existing, or we'll fail the UnitTest */ + private final static String NON_EXISTANT_FILENAME_VCF = "this_file_should_not_exist_on_disk_123456789.vcf"; + private ParsingEngine parsingEngine; + + @BeforeMethod + public void setUp() { + parsingEngine = new ParsingEngine(null); + RodBinding.resetNameCounter(); + } + + private class InputFileArgProvider { + @Argument(fullName="input_file",doc="input file",shortName="I") + public String inputFile; + } + + @Test + public void shortNameArgumentTest() { + final String[] commandLine = new String[] {"-I","na12878.bam"}; + + parsingEngine.addArgumentSource( InputFileArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + InputFileArgProvider argProvider = new InputFileArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.inputFile,"na12878.bam","Argument is not correctly initialized"); + } + + @Test + public void multiCharShortNameArgumentTest() { + final String[] commandLine = new String[] {"-out","out.txt"}; + + parsingEngine.addArgumentSource( MultiCharShortNameArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + MultiCharShortNameArgProvider argProvider = new MultiCharShortNameArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.outputFile,"out.txt","Argument is not correctly initialized"); + } + + + private class MultiCharShortNameArgProvider { + @Argument(shortName="out", doc="output file") + public String outputFile; + } + + @Test + public void longNameArgumentTest() { + final String[] commandLine = new String[] {"--input_file", "na12878.bam"}; + + parsingEngine.addArgumentSource( InputFileArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + InputFileArgProvider argProvider = new InputFileArgProvider(); + 
parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.inputFile,"na12878.bam","Argument is not correctly initialized"); + } + + @Test + public void extraWhitespaceTest() { + final String[] commandLine = new String[] {" --input_file ", "na12878.bam"}; + + parsingEngine.addArgumentSource( InputFileArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + InputFileArgProvider argProvider = new InputFileArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.inputFile,"na12878.bam","Argument is not correctly initialized"); + } + + @Test + public void primitiveArgumentTest() { + final String[] commandLine = new String[] {"--foo", "5"}; + + parsingEngine.addArgumentSource( PrimitiveArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + PrimitiveArgProvider argProvider = new PrimitiveArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.foo, 5, "Argument is not correctly initialized"); + } + + @Test(expectedExceptions=MissingArgumentValueException.class) + public void primitiveArgumentNoValueTest() { + final String[] commandLine = new String[] {"--foo"}; + + parsingEngine.addArgumentSource( PrimitiveArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + PrimitiveArgProvider argProvider = new PrimitiveArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.foo, 5, "Argument is not correctly initialized"); + } + + private class PrimitiveArgProvider { + @Argument(doc="simple integer") + int foo; + } + + @Test + public void flagTest() { + final String[] commandLine = new String[] {"--all_loci"}; + + parsingEngine.addArgumentSource( AllLociArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + AllLociArgProvider argProvider = new AllLociArgProvider(); 
+ parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertTrue(argProvider.allLoci,"Argument is not correctly initialized"); + } + + private class AllLociArgProvider { + @Argument(fullName="all_loci",shortName="A", doc="all loci") + public boolean allLoci = false; + } + + @Test + public void arrayTest() { + final String[] commandLine = new String[] {"-I", "foo.txt", "--input_file", "bar.txt"}; + + parsingEngine.addArgumentSource( MultiValueArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + MultiValueArgProvider argProvider = new MultiValueArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.inputFile.length, 2, "Argument array is of incorrect length"); + Assert.assertEquals(argProvider.inputFile[0],"foo.txt","1st filename is incorrect"); + Assert.assertEquals(argProvider.inputFile[1],"bar.txt","2nd filename is incorrect"); + } + + private class MultiValueArgProvider { + @Argument(fullName="input_file",shortName="I", doc="input file") + public String[] inputFile; + } + + @Test + public void enumTest() { + final String[] commandLine = new String[] { "--test_enum", "TWO" }; + + parsingEngine.addArgumentSource( EnumArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + EnumArgProvider argProvider = new EnumArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.testEnum, TestEnum.TWO, "Enum value is not correct"); + } + + @Test + public void enumMixedCaseTest() { + final String[] commandLine = new String[] { "--test_enum", "oNe" }; + + parsingEngine.addArgumentSource( EnumArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + EnumArgProvider argProvider = new EnumArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.testEnum, TestEnum.ONE, "Enum value is not correct"); + } + + @Test + 
public void enumDefaultTest() { + final String[] commandLine = new String[] {}; + + parsingEngine.addArgumentSource( EnumArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + EnumArgProvider argProvider = new EnumArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.testEnum, TestEnum.THREE, "Enum value is not correct"); + } + + public enum TestEnum { ONE, TWO, THREE } + + private class EnumArgProvider { + @Argument(fullName="test_enum",shortName="ti",doc="test enum",required=false) + public TestEnum testEnum = TestEnum.THREE; + } + + @Test + public void typedCollectionTest() { + final String[] commandLine = new String[] { "-N","2","-N","4","-N","6","-N","8","-N","10" }; + + parsingEngine.addArgumentSource( IntegerListArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + IntegerListArgProvider argProvider = new IntegerListArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertNotNull(argProvider.integers, "Argument array is null"); + Assert.assertEquals(argProvider.integers.size(), 5, "Argument array is of incorrect length"); + Assert.assertEquals(argProvider.integers.get(0).intValue(), 2, "1st integer is incorrect"); + Assert.assertEquals(argProvider.integers.get(1).intValue(), 4, "2nd integer is incorrect"); + Assert.assertEquals(argProvider.integers.get(2).intValue(), 6, "3rd integer is incorrect"); + Assert.assertEquals(argProvider.integers.get(3).intValue(), 8, "4th integer is incorrect"); + Assert.assertEquals(argProvider.integers.get(4).intValue(), 10, "5th integer is incorrect"); + } + + private class IntegerListArgProvider { + @Argument(fullName="integer_list",shortName="N",doc="integer list") + public List integers; + } + + @Test + public void untypedCollectionTest() { + final String[] commandLine = new String[] { "-N","2","-N","4","-N","6","-N","8","-N","10" }; + + parsingEngine.addArgumentSource( 
UntypedListArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + UntypedListArgProvider argProvider = new UntypedListArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertNotNull(argProvider.integers, "Argument array is null"); + Assert.assertEquals(argProvider.integers.size(), 5, "Argument array is of incorrect length"); + Assert.assertEquals(argProvider.integers.get(0), "2", "1st integer is incorrect"); + Assert.assertEquals(argProvider.integers.get(1), "4", "2nd integer is incorrect"); + Assert.assertEquals(argProvider.integers.get(2), "6", "3rd integer is incorrect"); + Assert.assertEquals(argProvider.integers.get(3), "8", "4th integer is incorrect"); + Assert.assertEquals(argProvider.integers.get(4), "10", "5th integer is incorrect"); + } + + private class UntypedListArgProvider { + @Argument(fullName="untyped_list",shortName="N", doc="untyped list") + public List integers; + } + + @Test(expectedExceptions=MissingArgumentException.class) + public void requiredArgTest() { + final String[] commandLine = new String[0]; + + parsingEngine.addArgumentSource( RequiredArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + } + + private class RequiredArgProvider { + @Argument(required=true,doc="value") + public Integer value; + } + + @Test + public void defaultValueTest() { + // First try getting the default. + String[] commandLine = new String[0]; + + parsingEngine.addArgumentSource( DefaultValueArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + DefaultValueArgProvider argProvider = new DefaultValueArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.value.intValue(), 42, "Default value is not correctly initialized"); + + // Then try to override it. 
+ commandLine = new String[] { "--value", "27" }; + + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.value.intValue(), 27, "Default value is not correctly initialized"); + } + + private class DefaultValueArgProvider { + @Argument(doc="value",required=false) + public Integer value = 42; + } + + @Test + public void disableValidationOfRequiredArgTest() { + final String[] commandLine = new String[0]; + + parsingEngine.addArgumentSource( RequiredArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate( EnumSet.of(ParsingEngine.ValidationType.MissingRequiredArgument) ); + + RequiredArgProvider argProvider = new RequiredArgProvider(); + parsingEngine.loadArgumentsIntoObject(argProvider ); + + Assert.assertNull(argProvider.value, "Value should have remain unset"); + } + + @Test + public void unrequiredArgTest() { + final String[] commandLine = new String[0]; + + parsingEngine.addArgumentSource( UnrequiredArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + UnrequiredArgProvider argProvider = new UnrequiredArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertNull(argProvider.value, "Value was unrequired and unspecified; contents should be null"); + } + + private class UnrequiredArgProvider { + @Argument(required=false,doc="unrequired value") + public Integer value; + } + + @Test(expectedExceptions=InvalidArgumentException.class) + public void invalidArgTest() { + final String[] commandLine = new String[] { "--foo" }; + + parsingEngine.addArgumentSource( UnrequiredArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + } + + @Test(expectedExceptions= ReviewedStingException.class) + public void duplicateLongNameTest() { + parsingEngine.addArgumentSource( DuplicateLongNameProvider.class ); + } + + private class DuplicateLongNameProvider 
{ + @Argument(fullName="myarg",doc="my arg") + public Integer foo; + + @Argument(fullName="myarg", doc="my arg") + public Integer bar; + } + + @Test(expectedExceptions= ReviewedStingException.class) + public void duplicateShortNameTest() { + parsingEngine.addArgumentSource( DuplicateShortNameProvider.class ); + } + + + private class DuplicateShortNameProvider { + @Argument(shortName="myarg", doc="my arg") + public Integer foo; + + @Argument(shortName="myarg", doc="my arg") + public Integer bar; + } + + @Test(expectedExceptions=UnmatchedArgumentException.class) + public void missingArgumentNameTest() { + final String[] commandLine = new String[] {"foo.txt"}; + + parsingEngine.addArgumentSource( NoArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + } + + private class NoArgProvider { + + } + + @Test(expectedExceptions=UnmatchedArgumentException.class) + public void extraValueTest() { + final String[] commandLine = new String[] {"-I", "foo.txt", "bar.txt"}; + + parsingEngine.addArgumentSource( InputFileArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + } + + @Test(expectedExceptions=MissingArgumentException.class) + public void multipleInvalidArgTest() { + final String[] commandLine = new String[] {"-N1", "-N2", "-N3"}; + + parsingEngine.addArgumentSource( RequiredArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + } + + @Test(expectedExceptions=TooManyValuesForArgumentException.class) + public void invalidArgCountTest() { + final String[] commandLine = new String[] {"--value","1","--value","2","--value","3"}; + + parsingEngine.addArgumentSource( RequiredArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + } + + @Test + public void packageProtectedArgTest() { + final String[] commandLine = new String[] {"--foo", "1"}; + + parsingEngine.addArgumentSource( PackageProtectedArgProvider.class ); + parsingEngine.parse( 
commandLine ); + parsingEngine.validate(); + + PackageProtectedArgProvider argProvider = new PackageProtectedArgProvider(); + parsingEngine.loadArgumentsIntoObject(argProvider); + + Assert.assertEquals(argProvider.foo.intValue(), 1, "Argument is not correctly initialized"); + } + + private class PackageProtectedArgProvider { + @Argument(doc="foo") + Integer foo; + } + + @Test + public void derivedArgTest() { + final String[] commandLine = new String[] {"--bar", "5"}; + + parsingEngine.addArgumentSource( DerivedArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + DerivedArgProvider argProvider = new DerivedArgProvider(); + parsingEngine.loadArgumentsIntoObject(argProvider); + + Assert.assertEquals(argProvider.bar.intValue(), 5, "Argument is not correctly initialized"); + } + + private class DerivedArgProvider extends BaseArgProvider { + } + + private class BaseArgProvider { + @Argument(doc="bar") + public Integer bar; + } + + @Test + public void correctDefaultArgNameTest() { + parsingEngine.addArgumentSource( CamelCaseArgProvider.class ); + + DefinitionMatcher matcher = ArgumentDefinitions.FullNameDefinitionMatcher; + ArgumentDefinition definition = parsingEngine.argumentDefinitions.findArgumentDefinition("myarg", matcher); + + Assert.assertNotNull(definition, "Invalid default argument name assigned"); + } + + @SuppressWarnings("unused") + private class CamelCaseArgProvider { + @Argument(doc="my arg") + Integer myArg; + } + + @Test(expectedExceptions=UnmatchedArgumentException.class) + public void booleanWithParameterTest() { + final String[] commandLine = new String[] {"--mybool", "true"}; + + parsingEngine.addArgumentSource( BooleanArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + } + + @SuppressWarnings("unused") + private class BooleanArgProvider { + @Argument(doc="my bool") + boolean myBool; + } + + @Test + public void validParseForAnalysisTypeTest() { + final String[] commandLine = 
new String[] {"--analysis_type", "Pileup" }; + + parsingEngine.addArgumentSource( AnalysisTypeArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate( EnumSet.of(ParsingEngine.ValidationType.MissingRequiredArgument) ); + + AnalysisTypeArgProvider argProvider = new AnalysisTypeArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.Analysis_Name,"Pileup","Argument is not correctly initialized"); + } + + private class AnalysisTypeArgProvider { + @Argument(fullName="analysis_type", shortName="T", doc="Type of analysis to run") + public String Analysis_Name = null; + } + + @Test(expectedExceptions=TooManyValuesForArgumentException.class) + public void invalidParseForAnalysisTypeTest() { + final String[] commandLine = new String[] {"--analysis_type", "Pileup", "-T", "CountReads" }; + + parsingEngine.addArgumentSource( AnalysisTypeArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate( EnumSet.of(ParsingEngine.ValidationType.MissingRequiredArgument) ); + } + + @Test(expectedExceptions=ArgumentsAreMutuallyExclusiveException.class) + public void mutuallyExclusiveArgumentsTest() { + // Passing only foo should work fine... + String[] commandLine = new String[] {"--foo","5"}; + + parsingEngine.addArgumentSource( MutuallyExclusiveArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + MutuallyExclusiveArgProvider argProvider = new MutuallyExclusiveArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.foo.intValue(), 5, "Argument is not correctly initialized"); + + // But when foo and bar come together, danger! 
+ commandLine = new String[] {"--foo","5","--bar","6"}; + + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + } + + @SuppressWarnings("unused") + private class MutuallyExclusiveArgProvider { + @Argument(doc="foo",exclusiveOf="bar") + Integer foo; + + @Argument(doc="bar",required=false) + Integer bar; + } + + @Test(expectedExceptions=InvalidArgumentValueException.class) + public void argumentValidationTest() { + // Passing only foo should work fine... + String[] commandLine = new String[] {"--value","521"}; + + parsingEngine.addArgumentSource( ValidatingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + ValidatingArgProvider argProvider = new ValidatingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.value.intValue(), 521, "Argument is not correctly initialized"); + + // Try some invalid arguments + commandLine = new String[] {"--value","foo"}; + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + } + + private class ValidatingArgProvider { + @Argument(doc="value",validation="\\d+") + Integer value; + } + + @Test + public void argumentCollectionTest() { + String[] commandLine = new String[] { "--value", "5" }; + + parsingEngine.addArgumentSource( ArgumentCollectionProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + ArgumentCollectionProvider argProvider = new ArgumentCollectionProvider(); + parsingEngine.loadArgumentsIntoObject(argProvider); + + Assert.assertEquals(argProvider.rap.value.intValue(), 5, "Argument is not correctly initialized"); + } + + private class ArgumentCollectionProvider { + @ArgumentCollection + RequiredArgProvider rap = new RequiredArgProvider(); + } + + @Test(expectedExceptions= ReviewedStingException.class) + public void multipleArgumentCollectionTest() { + parsingEngine.addArgumentSource( MultipleArgumentCollectionProvider.class ); + } + + @SuppressWarnings("unused") + private 
class MultipleArgumentCollectionProvider { + @ArgumentCollection + RequiredArgProvider rap1 = new RequiredArgProvider(); + @ArgumentCollection + RequiredArgProvider rap2 = new RequiredArgProvider(); + } + + // -------------------------------------------------------------------------------- + // + // Tests of the RodBinding system + // + // -------------------------------------------------------------------------------- + + private class SingleRodBindingArgProvider { + @Input(fullName="binding", shortName="V", required=true) + public RodBinding binding; + } + + @Test + public void basicRodBindingArgumentTest() { + final String[] commandLine = new String[] {"-V:vcf",NON_EXISTANT_FILENAME_VCF}; + + parsingEngine.addArgumentSource( SingleRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + SingleRodBindingArgProvider argProvider = new SingleRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getType(), Feature.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.isBound(), true, "Bound() isn't returning its expected value"); + Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); + } + + private class ShortNameOnlyRodBindingArgProvider { + @Input(shortName="short", required=false) + public RodBinding binding; // = RodBinding.makeUnbound(Feature.class); + } + + @Test + public void shortNameOnlyRodBindingArgumentTest() { + final String[] commandLine = new String[] {"-short:vcf",NON_EXISTANT_FILENAME_VCF}; + + parsingEngine.addArgumentSource( ShortNameOnlyRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + 
parsingEngine.validate(); + + ShortNameOnlyRodBindingArgProvider argProvider = new ShortNameOnlyRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getType(), Feature.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.isBound(), true, "Bound() isn't returning its expected value"); + Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); + } + + private class OptionalRodBindingArgProvider { + @Input(fullName="binding", shortName="V", required=false) + public RodBinding binding; + + @Input(fullName="bindingNull", shortName="VN", required=false) + public RodBinding bindingNull = null; + } + + @Test + public void optionalRodBindingArgumentTest() { + final String[] commandLine = new String[] {}; + + parsingEngine.addArgumentSource( OptionalRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + OptionalRodBindingArgProvider argProvider = new OptionalRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertNotNull(argProvider.binding, "Default value not applied corrected to RodBinding"); + Assert.assertEquals(argProvider.binding.getName(), RodBinding.UNBOUND_VARIABLE_NAME, "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), RodBinding.UNBOUND_SOURCE, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getType(), Feature.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.isBound(), false, "Bound() isn't returning its expected value"); + 
Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 0, "Tags aren't correctly set"); + + Assert.assertNotNull(argProvider.bindingNull, "Default value not applied corrected to RodBinding"); + Assert.assertEquals(argProvider.bindingNull.getName(), RodBinding.UNBOUND_VARIABLE_NAME, "Name isn't set properly"); + Assert.assertEquals(argProvider.bindingNull.getSource(), RodBinding.UNBOUND_SOURCE, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.bindingNull.getType(), VariantContext.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.bindingNull.isBound(), false, "Bound() isn't returning its expected value"); + Assert.assertEquals(argProvider.bindingNull.getTags().getPositionalTags().size(), 0, "Tags aren't correctly set"); + } + + @Test(expectedExceptions = UserException.class) + public void rodBindingArgumentTestMissingType() { + final String[] commandLine = new String[] {"-V",NON_EXISTANT_FILENAME_VCF}; + + parsingEngine.addArgumentSource( SingleRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + SingleRodBindingArgProvider argProvider = new SingleRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject(argProvider); + } + + @Test(expectedExceptions = UserException.class) + public void rodBindingArgumentTestTooManyTags() { + final String[] commandLine = new String[] {"-V:x,y,z",NON_EXISTANT_FILENAME_VCF}; + + parsingEngine.addArgumentSource( SingleRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + SingleRodBindingArgProvider argProvider = new SingleRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject(argProvider); + } + + private class VariantContextRodBindingArgProvider { + @Input(fullName = "binding", shortName="V") + public RodBinding binding; + } + + @Test + public void variantContextBindingArgumentTest() { + final String[] commandLine = new String[] 
{"-V:vcf",NON_EXISTANT_FILENAME_VCF}; + + parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); + } + + private class ListRodBindingArgProvider { + @Input(fullName = "binding", shortName="V", required=false) + public List> bindings; + } + + @Test + public void listRodBindingArgumentTest() { + final String[] commandLine = new String[] {"-V:vcf",NON_EXISTANT_FILENAME_VCF}; + + parsingEngine.addArgumentSource( ListRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + ListRodBindingArgProvider argProvider = new ListRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.bindings.size(), 1, "Unexpected number of bindings"); + RodBinding binding = argProvider.bindings.get(0); + Assert.assertEquals(binding.getName(), "binding", "Name isn't set properly"); + Assert.assertEquals(binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(binding.getType(), Feature.class, "Type isn't set to its expected value"); + Assert.assertEquals(binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); + } + + @Test + public void listRodBindingArgumentTest2Args() { + final String[] commandLine = new String[] 
{"-V:vcf",NON_EXISTANT_FILENAME_VCF, "-V:vcf", "bar.vcf"}; + + parsingEngine.addArgumentSource( ListRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + ListRodBindingArgProvider argProvider = new ListRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.bindings.size(), 2, "Unexpected number of bindings"); + + RodBinding binding = argProvider.bindings.get(0); + Assert.assertEquals(binding.getName(), "binding", "Name isn't set properly"); + Assert.assertEquals(binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(binding.getType(), Feature.class, "Type isn't set to its expected value"); + Assert.assertEquals(binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); + + RodBinding binding2 = argProvider.bindings.get(1); + Assert.assertEquals(binding2.getName(), "binding2", "Name isn't set properly"); + Assert.assertEquals(binding2.getSource(), "bar.vcf", "Source isn't set to its expected value"); + Assert.assertEquals(binding2.getType(), Feature.class, "Type isn't set to its expected value"); + Assert.assertEquals(binding2.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); + } + + @Test + public void listRodBindingArgumentTest0Args() { + final String[] commandLine = new String[] {}; + + parsingEngine.addArgumentSource( ListRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + ListRodBindingArgProvider argProvider = new ListRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertNull(argProvider.bindings, "Bindings were not null"); + } + + @Test + public void listRodBindingArgumentTestExplicitlyNamed() { + final String[] commandLine = new String[] {"-V:foo,vcf",NON_EXISTANT_FILENAME_VCF, "-V:foo,vcf", "bar.vcf"}; + + parsingEngine.addArgumentSource( 
ListRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + ListRodBindingArgProvider argProvider = new ListRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.bindings.size(), 2, "Unexpected number of bindings"); + Assert.assertEquals(argProvider.bindings.get(0).getName(), "foo", "Name isn't set properly"); + Assert.assertEquals(argProvider.bindings.get(1).getName(), "foo2", "Name isn't set properly"); + } + + private final static String HISEQ_VCF = privateTestDir + "HiSeq.10000.vcf"; + private final static String TRANCHES_FILE = privateTestDir + "tranches.6.txt"; + + @Test + public void variantContextBindingTestDynamicTyping1() { + final String[] commandLine = new String[] {"-V", HISEQ_VCF}; + + parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), HISEQ_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 0, "Tags aren't correctly set"); + } + + @Test + public void variantContextBindingTestDynamicTypingNameAsSingleArgument() { + final String[] commandLine = new String[] {"-V:name", HISEQ_VCF}; + + parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( 
argProvider ); + + Assert.assertEquals(argProvider.binding.getName(), "name", "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), HISEQ_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); + } + + @Test() + public void variantContextBindingTestDynamicTypingTwoTagsPassing() { + final String[] commandLine = new String[] {"-V:name,vcf", HISEQ_VCF}; + + parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.binding.getName(), "name", "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), HISEQ_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 2, "Tags aren't correctly set"); + } + + @Test() + public void variantContextBindingTestDynamicTypingTwoTagsCausingTypeFailure() { + final String[] commandLine = new String[] {"-V:name,beagle", HISEQ_VCF}; + + parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject(argProvider); + + Assert.assertEquals(argProvider.binding.getName(), "name", "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), HISEQ_VCF, "Source isn't set to its expected 
value"); + Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getTribbleType(), "beagle", "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 2, "Tags aren't correctly set"); + } + + @Test(expectedExceptions = UserException.class) + public void variantContextBindingTestDynamicTypingUnknownTribbleType() { + final String[] commandLine = new String[] {"-V", TRANCHES_FILE}; + + parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + } + + @Test + public void argumentListTest() throws IOException { + File argsFile = BaseTest.createTempFile("args.", ".list"); + try { + FileUtils.write(argsFile, "-I na12878.bam"); + final String[] commandLine = new String[] {"-args", argsFile.getPath()}; + parsingEngine.addArgumentSource(InputFileArgProvider.class); + parsingEngine.parse(commandLine); + parsingEngine.validate(); + + InputFileArgProvider argProvider = new InputFileArgProvider(); + parsingEngine.loadArgumentsIntoObject(argProvider); + + Assert.assertEquals(argProvider.inputFile, "na12878.bam", "Argument is not correctly initialized"); + } finally { + FileUtils.deleteQuietly(argsFile); + } + } + + private class NumericRangeArgProvider { + @Argument(fullName = "intWithHardMinAndMax", minValue = 5, maxValue = 10) + public int intWithHardMinAndMax; + + @Argument(fullName = "intWithHardMin", minValue = 5) + public int intWithHardMin; + + @Argument(fullName = "intWithHardMax", maxValue = 10) + public int intWithHardMax; + + @Argument(fullName = "intWithSoftMinAndMax", minRecommendedValue = 5, maxRecommendedValue = 10) + public int intWithSoftMinAndMax; + + 
@Argument(fullName = "intWithSoftMin", minRecommendedValue = 5) + public int intWithSoftMin; + + @Argument(fullName = "intWithSoftMax", maxRecommendedValue = 10) + public int intWithSoftMax; + + @Argument(fullName = "intWithHardAndSoftMinAndMax", minValue = 5, minRecommendedValue = 7, maxValue = 10, maxRecommendedValue = 9) + public int intWithHardAndSoftMinAndMax; + + @Argument(fullName = "intWithHardAndSoftMin", minValue = 5, minRecommendedValue = 7) + public int intWithHardAndSoftMin; + + @Argument(fullName = "intWithHardAndSoftMax", maxValue = 10, maxRecommendedValue = 8) + public int intWithHardAndSoftMax; + + @Argument(fullName = "intWithHardMinAndMaxDefaultOutsideRange", minValue = 5, maxValue = 10) + public int intWithHardMinAndMaxDefaultOutsideRange = -1; + + @Argument(fullName = "integerWithHardMinAndMax", minValue = 5, maxValue = 10) + public Integer integerWithHardMinAndMax; + + @Argument(fullName = "byteWithHardMinAndMax", minValue = 5, maxValue = 10) + public byte byteWithHardMinAndMax; + + @Argument(fullName = "byteWithHardMin", minValue = 5) + public byte byteWithHardMin; + + @Argument(fullName = "byteWithHardMax", maxValue = 10) + public byte byteWithHardMax; + + @Argument(fullName = "doubleWithHardMinAndMax", minValue = 5.5, maxValue = 10.0) + public double doubleWithHardMinAndMax; + + @Argument(fullName = "doubleWithHardMin", minValue = 5.5) + public double doubleWithHardMin; + + @Argument(fullName = "doubleWithHardMax", maxValue = 10.0) + public double doubleWithHardMax; + } + + @DataProvider(name = "NumericRangeConstraintViolationDataProvider") + public Object[][] numericRangeConstraintViolationDataProvider() { + return new Object[][] { + { new String[]{"--intWithHardMinAndMax", "11"} }, + { new String[]{"--intWithHardMinAndMax", "4"} }, + { new String[]{"--intWithHardMin", "4"} }, + { new String[]{"--intWithHardMax", "11"} }, + { new String[]{"--intWithHardAndSoftMinAndMax", "11"} }, + { new String[]{"--intWithHardAndSoftMinAndMax", "4"} }, + 
{ new String[]{"--intWithHardAndSoftMin", "4"} }, + { new String[]{"--intWithHardAndSoftMax", "11"} }, + { new String[]{"--intWithHardMinAndMaxDefaultOutsideRange", "11"} }, + { new String[]{"--intWithHardMinAndMaxDefaultOutsideRange", "4"} }, + { new String[]{"--integerWithHardMinAndMax", "11"} }, + { new String[]{"--integerWithHardMinAndMax", "4"} }, + { new String[]{"--byteWithHardMinAndMax", "11"} }, + { new String[]{"--byteWithHardMinAndMax", "4"} }, + { new String[]{"--byteWithHardMin", "4"} }, + { new String[]{"--byteWithHardMax", "11"} }, + { new String[]{"--doubleWithHardMinAndMax", "5.4"} }, + { new String[]{"--doubleWithHardMinAndMax", "10.1"} }, + { new String[]{"--doubleWithHardMin", "5.4"} }, + { new String[]{"--doubleWithHardMax", "10.1"} } + }; + } + + @Test(dataProvider = "NumericRangeConstraintViolationDataProvider", + expectedExceptions = ArgumentValueOutOfRangeException.class) + public void testNumericRangeWithConstraintViolation( final String[] commandLine ) { + runNumericArgumentRangeTest(commandLine); + } + + @DataProvider(name = "NumericRangeWithoutConstraintViolationDataProvider") + public Object[][] numericRangeWithoutConstraintViolationDataProvider() { + return new Object[][] { + { new String[]{"--intWithHardMinAndMax", "10"} }, + { new String[]{"--intWithHardMinAndMax", "5"} }, + { new String[]{"--intWithHardMinAndMax", "7"} }, + { new String[]{"--intWithHardMin", "11"} }, + { new String[]{"--intWithHardMax", "4"} }, + { new String[]{"--intWithSoftMinAndMax", "11"} }, + { new String[]{"--intWithSoftMinAndMax", "4"} }, + { new String[]{"--intWithSoftMin", "4"} }, + { new String[]{"--intWithSoftMax", "11"} }, + { new String[]{"--intWithHardAndSoftMinAndMax", "5"} }, + { new String[]{"--intWithHardAndSoftMinAndMax", "7"} }, + { new String[]{"--intWithHardAndSoftMinAndMax", "8"} }, + { new String[]{"--intWithHardAndSoftMinAndMax", "9"} }, + { new String[]{"--intWithHardAndSoftMinAndMax", "10"} }, + { new String[]{"--intWithHardAndSoftMin", 
"5"} }, + { new String[]{"--intWithHardAndSoftMin", "6"} }, + { new String[]{"--intWithHardAndSoftMin", "7"} }, + { new String[]{"--intWithHardAndSoftMax", "10"} }, + { new String[]{"--intWithHardAndSoftMax", "9"} }, + { new String[]{"--intWithHardAndSoftMax", "8"} }, + { new String[]{"--intWithHardMinAndMaxDefaultOutsideRange", "10"} }, + { new String[]{"--intWithHardMinAndMaxDefaultOutsideRange", "5"} }, + { new String[]{"--intWithHardMinAndMaxDefaultOutsideRange", "7"} }, + { new String[]{"--integerWithHardMinAndMax", "10"} }, + { new String[]{"--integerWithHardMinAndMax", "5"} }, + { new String[]{"--byteWithHardMinAndMax", "10"} }, + { new String[]{"--byteWithHardMinAndMax", "5"} }, + { new String[]{"--byteWithHardMinAndMax", "7"} }, + { new String[]{"--byteWithHardMin", "5"} }, + { new String[]{"--byteWithHardMax", "10"} }, + { new String[]{"--doubleWithHardMinAndMax", "5.5"} }, + { new String[]{"--doubleWithHardMinAndMax", "10.0"} }, + { new String[]{"--doubleWithHardMinAndMax", "7.5"} }, + { new String[]{"--doubleWithHardMin", "5.5"} }, + { new String[]{"--doubleWithHardMin", "15.5"} }, + { new String[]{"--doubleWithHardMax", "10.0"} }, + { new String[]{"--doubleWithHardMax", "7.5"} } + }; + } + + @Test(dataProvider = "NumericRangeWithoutConstraintViolationDataProvider") + public void testNumericRangeWithoutConstraintViolation( final String[] commandLine ) { + // These tests succeed if no exception is thrown, since no constraints have been violated + runNumericArgumentRangeTest(commandLine); + } + + private void runNumericArgumentRangeTest( final String[] commandLine ) { + parsingEngine.addArgumentSource(NumericRangeArgProvider.class); + parsingEngine.parse(commandLine); + + NumericRangeArgProvider argProvider = new NumericRangeArgProvider(); + parsingEngine.loadArgumentsIntoObject(argProvider); + } +} diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/RodBindingCollectionUnitTest.java 
b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/RodBindingCollectionUnitTest.java new file mode 100644 index 000000000..853c51543 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/RodBindingCollectionUnitTest.java @@ -0,0 +1,148 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.commandline; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.Collection; + +public class RodBindingCollectionUnitTest extends BaseTest { + + private ParsingEngine parsingEngine; + private Tags mytags; + + private static final String defaultTagString = "VCF"; + private static final String testVCFFileName = privateTestDir + "empty.vcf"; + private static final String testListFileName = privateTestDir + "oneVCF.list"; + + @BeforeMethod + public void setUp() { + parsingEngine = new ParsingEngine(null); + RodBinding.resetNameCounter(); + mytags = new Tags(); + mytags.addPositionalTag(defaultTagString); + } + + private class RodBindingCollectionArgProvider { + @Argument(fullName="input",doc="input",shortName="V") + public RodBindingCollection input; + } + + @Test + public void testStandardVCF() { + final String[] commandLine = new String[] {"-V", testVCFFileName}; + + parsingEngine.addArgumentSource( RodBindingCollectionArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + final RodBindingCollectionArgProvider argProvider = new RodBindingCollectionArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.input.getRodBindings().iterator().next().getSource(), testVCFFileName, "Argument is not correctly initialized"); + } + + @Test + public void testList() { + final String[] commandLine = new String[] {"-V", testListFileName}; + + parsingEngine.addArgumentSource(RodBindingCollectionArgProvider.class); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + final RodBindingCollectionArgProvider argProvider = 
new RodBindingCollectionArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.input.getRodBindings().iterator().next().getSource(), "private/testdata/empty.vcf", "Argument is not correctly initialized"); + } + + @Test + public void testDefaultTagsInFile() throws IOException { + + final File testFile = File.createTempFile("RodBindingCollectionUnitTest.defaultTags", ".list"); + testFile.deleteOnExit(); + final FileWriter writer = new FileWriter(testFile); + writer.write(testVCFFileName, 0, testVCFFileName.length()); + writer.close(); + + ArgumentTypeDescriptor.getRodBindingsCollection(testFile, parsingEngine, VariantContext.class, "foo", mytags, "input"); + + final Collection bindings = parsingEngine.getRodBindings(); + Assert.assertNotNull(bindings); + Assert.assertEquals(bindings.size(), 1); + + final RodBinding binding = bindings.iterator().next(); + Assert.assertEquals(parsingEngine.getTags(binding), mytags); + } + + @Test(expectedExceptions = UserException.BadArgumentValue.class) + public void testDuplicateEntriesInFile() throws IOException { + + final File testFile = File.createTempFile("RodBindingCollectionUnitTest.variantListWithDuplicates", ".list"); + testFile.deleteOnExit(); + final FileWriter writer = new FileWriter(testFile); + writer.write(testVCFFileName + "\n"); + writer.write(testVCFFileName + "\n"); + writer.close(); + + ArgumentTypeDescriptor.getRodBindingsCollection(testFile, parsingEngine, VariantContext.class, "foo", mytags, "input"); + } + + @Test(expectedExceptions = UserException.BadArgumentValue.class) + public void testValidateEmptyFile() throws IOException { + final File testFile = File.createTempFile("RodBindingCollectionUnitTest.emptyVCFList", ".list"); + testFile.deleteOnExit(); + + ArgumentTypeDescriptor.getRodBindingsCollection(testFile, parsingEngine, VariantContext.class, "foo", mytags, "input"); + } + + @Test + public void testOverrideTagsInFile() throws IOException { + final 
File testFile = File.createTempFile("RodBindingCollectionUnitTest.overrideTags", ".list"); + testFile.deleteOnExit(); + final FileWriter writer = new FileWriter(testFile); + final String textToWrite = "foo " + testVCFFileName; + writer.write(textToWrite, 0, textToWrite.length()); + writer.close(); + + ArgumentTypeDescriptor.getRodBindingsCollection(testFile, parsingEngine, VariantContext.class, "foo", mytags, "input"); + + final Collection bindings = parsingEngine.getRodBindings(); + Assert.assertNotNull(bindings); + Assert.assertEquals(bindings.size(), 1); + + final RodBinding binding = bindings.iterator().next(); + Assert.assertNotEquals(parsingEngine.getTags(binding), mytags); + } +} diff --git a/public/java/test/org/broadinstitute/sting/commandline/RodBindingUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/RodBindingUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/commandline/RodBindingUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/RodBindingUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/CommandLineGATKUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/CommandLineGATKUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/CommandLineGATKUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/CommandLineGATKUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java diff --git 
a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java new file mode 100644 index 000000000..b10043340 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java @@ -0,0 +1,267 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.ArgumentException; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.qc.CountReads; +import org.broadinstitute.sting.gatk.walkers.readutils.PrintReads; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.*; + +/** + * Tests selected functionality in the GenomeAnalysisEngine class + */ +public class GenomeAnalysisEngineUnitTest extends BaseTest { + + @Test(expectedExceptions=UserException.class) + public void testEmptySamFileListHandling() throws Exception { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + testEngine.setWalker(new CountReads()); //generalizable to any walker requiring reads + + //supply command line args so validateSuppliedReads() knows whether reads were passed in + GATKArgumentCollection testArgs = new GATKArgumentCollection(); + testArgs.samFiles.add("empty.list"); + testEngine.setArguments(testArgs); + + //represents the empty list of samFiles read in from empty.list by CommandLineExecutable + Collection samFiles = new ArrayList(); + + testEngine.setSAMFileIDs(samFiles); + testEngine.validateSuppliedReads(); + } + + 
@Test(expectedExceptions=UserException.class) + public void testDuplicateSamFileHandlingSingleDuplicate() throws Exception { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + Collection samFiles = new ArrayList(); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); + + testEngine.setSAMFileIDs(samFiles); + testEngine.checkForDuplicateSamFiles(); + } + + @Test(expectedExceptions=UserException.class) + public void testDuplicateSamFileHandlingMultipleDuplicates() throws Exception { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + Collection samFiles = new ArrayList(); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags())); + + testEngine.setSAMFileIDs(samFiles); + testEngine.checkForDuplicateSamFiles(); + } + + @Test(expectedExceptions=UserException.class) + public void testDuplicateSamFileHandlingAbsoluteVsRelativePath() { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + final File relativePathToBAMFile = new File("public/testdata/exampleBAM.bam"); + final File absolutePathToBAMFile = new File(relativePathToBAMFile.getAbsolutePath()); + Collection samFiles = new ArrayList(); + samFiles.add(new SAMReaderID(relativePathToBAMFile, new Tags())); + samFiles.add(new SAMReaderID(absolutePathToBAMFile, new Tags())); + + testEngine.setSAMFileIDs(samFiles); + testEngine.checkForDuplicateSamFiles(); + } + + @Test + public void testEmptyIntervalSetHandling() throws Exception { + GenomeLocParser genomeLocParser = new GenomeLocParser(ArtificialSAMUtils.createArtificialSamHeader(1, 1, 
1000).getSequenceDictionary()); + + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + testEngine.setWalker(new PrintReads()); + testEngine.setIntervals(new GenomeLocSortedSet(genomeLocParser)); + + testEngine.validateSuppliedIntervals(); + } + + @Test + public void testLoadWellFormedSampleRenameMapFile() throws IOException { + final File mapFile = createTestSampleRenameMapFile(Arrays.asList("/foo/bar/first.bam newSample1", + "/foo/bar/second.bam newSample2", + "/foo/bar2/third.bam newSample3")); + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + final Map renameMap = engine.loadSampleRenameMap(mapFile); + + Assert.assertEquals(renameMap.size(), 3, "Sample rename map was wrong size after loading from file"); + + final Iterator expectedResultsIterator = Arrays.asList("/foo/bar/first.bam", "newSample1", "/foo/bar/second.bam", "newSample2", "/foo/bar2/third.bam", "newSample3").iterator(); + while ( expectedResultsIterator.hasNext() ) { + final String expectedKey = expectedResultsIterator.next(); + final String expectedValue = expectedResultsIterator.next(); + + Assert.assertNotNull(renameMap.get(new SAMReaderID(expectedKey, new Tags())), String.format("Entry for %s not found in sample rename map", expectedKey)); + Assert.assertEquals(renameMap.get(new SAMReaderID(expectedKey, new Tags())), expectedValue, "Wrong value in sample rename map for " + expectedKey); + } + } + + @DataProvider(name = "MalformedSampleRenameMapFileDataProvider") + public Object[][] generateMalformedSampleRenameMapFiles() throws IOException { + final List tests = new ArrayList(); + + tests.add(new Object[]{"testLoadSampleRenameMapFileNonExistentFile", + new File("/foo/bar/nonexistent")}); + tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine1", + createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine2", + createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam 
newSample extraField"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileNonAbsoluteBamPath", + createTestSampleRenameMapFile(Arrays.asList("relative/path/to/foo.bam newSample"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileDuplicateBamPath", + createTestSampleRenameMapFile(Arrays.asList("/path/to/dupe.bam newSample1", + "/path/to/dupe.bam newSample2"))}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MalformedSampleRenameMapFileDataProvider", expectedExceptions = UserException.class) + public void testLoadMalformedSampleRenameMapFile( final String testName, final File mapFile ) { + logger.info("Executing test " + testName); + + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + final Map renameMap = engine.loadSampleRenameMap(mapFile); + } + + private File createTestSampleRenameMapFile( final List contents ) throws IOException { + final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp"); + final PrintWriter writer = new PrintWriter(mapFile); + + for ( final String line : contents ) { + writer.println(line); + } + writer.close(); + + return mapFile; + } + + /////////////////////////////////////////////////// + // Test the ReadTransformer ordering enforcement // + /////////////////////////////////////////////////// + + public static class TestReadTransformer extends ReadTransformer { + + private OrderingConstraint orderingConstraint = OrderingConstraint.DO_NOT_CARE; + private boolean enabled; + + protected TestReadTransformer(final OrderingConstraint orderingConstraint) { + this.orderingConstraint = orderingConstraint; + enabled = true; + } + + // need this because PackageUtils will pick up this class as a possible ReadTransformer + protected TestReadTransformer() { + enabled = false; + } + + @Override + public OrderingConstraint getOrderingConstraint() { return orderingConstraint; } + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { 
return ApplicationTime.HANDLED_IN_WALKER; } + + @Override + public boolean enabled() { return enabled; } + + @Override + public GATKSAMRecord apply(final GATKSAMRecord read) { return read; } + + } + + @DataProvider(name = "ReadTransformerData") + public Object[][] makeReadTransformerData() { + List tests = new ArrayList(); + + for ( final ReadTransformer.OrderingConstraint orderingConstraint1 : ReadTransformer.OrderingConstraint.values() ) { + for ( final ReadTransformer.OrderingConstraint orderingConstraint2 : ReadTransformer.OrderingConstraint.values() ) { + for ( final ReadTransformer.OrderingConstraint orderingConstraint3 : ReadTransformer.OrderingConstraint.values() ) { + tests.add(new Object[]{orderingConstraint1, orderingConstraint2, orderingConstraint3}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ReadTransformerData") + public void testReadTransformer(final ReadTransformer.OrderingConstraint oc1, final ReadTransformer.OrderingConstraint oc2, final ReadTransformer.OrderingConstraint oc3) { + + final GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + final List readTransformers = new ArrayList(3); + readTransformers.add(new TestReadTransformer(oc1)); + readTransformers.add(new TestReadTransformer(oc2)); + readTransformers.add(new TestReadTransformer(oc3)); + + final boolean shouldThrowException = numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_FIRST, oc1, oc2, oc3) > 1 || + numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_LAST, oc1, oc2, oc3) > 1; + + try { + testEngine.setReadTransformers(readTransformers); + + Assert.assertFalse(shouldThrowException); + Assert.assertEquals(testEngine.getReadTransformers().size(), 3); + + Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_FIRST); + Assert.assertTrue(testEngine.getReadTransformers().get(2).getOrderingConstraint() != 
ReadTransformer.OrderingConstraint.MUST_BE_FIRST); + Assert.assertTrue(testEngine.getReadTransformers().get(0).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); + Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); + } catch (UserException.IncompatibleReadFiltersException e) { + Assert.assertTrue(shouldThrowException); + } + } + + private int numWithConstraint(final ReadTransformer.OrderingConstraint target, final ReadTransformer.OrderingConstraint... constraints ) { + int count = 0; + for ( final ReadTransformer.OrderingConstraint constraint : constraints ) { + if ( constraint == target ) + count++; + } + return count; + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/WalkerManagerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/WalkerManagerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/WalkerManagerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/WalkerManagerUnitTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/AllLocusViewUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/AllLocusViewUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/AllLocusViewUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/AllLocusViewUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusViewUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusViewUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusViewUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusViewUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedViewUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedViewUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedViewUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedViewUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceViewUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceViewUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceViewUnitTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceViewUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceViewUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceViewUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceViewUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceViewUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProviderUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProviderUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProviderUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProviderUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/FilePointerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/FilePointerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/FilePointerUnitTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/FilePointerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/PicardBaselineBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/PicardBaselineBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/PicardBaselineBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/PicardBaselineBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadProcessingBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/ReadProcessingBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadProcessingBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/ReadProcessingBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java new file mode 100644 index 000000000..0c53de307 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -0,0 +1,230 @@ +/* +* Copyright (c) 2012 The 
Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.*; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.testng.Assert.*; + +/** + *

+ * Class SAMDataSourceUnitTest + *

+ * The test of the SAMBAM simple data source. + */ +public class SAMDataSourceUnitTest extends BaseTest { + + // TODO: These legacy tests should really be replaced with a more comprehensive suite of tests for SAMDataSource + + private List readers; + private IndexedFastaSequenceFile seq; + private GenomeLocParser genomeLocParser; + + /** + * This function does the setup of our parser, before each method call. + *

+ * Called before every test case method. + */ + @BeforeMethod + public void doForEachTest() throws FileNotFoundException { + readers = new ArrayList(); + + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference)); + genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); + } + + /** + * Tears down the test fixture after each call. + *

+ * Called after every test case method. + */ + @AfterMethod + public void undoForEachTest() { + seq = null; + readers.clear(); + } + + + /** Test out that we can shard the file and iterate over every read */ + @Test + public void testLinearBreakIterateAll() { + logger.warn("Executing testLinearBreakIterateAll"); + + // setup the data + readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); + + // the sharding strat. + SAMDataSource data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + SAMFileReader.ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false); + + Iterable strat = data.createShardIteratorOverMappedReads(new LocusShardBalancer()); + int count = 0; + + try { + for (Shard sh : strat) { + int readCount = 0; + count++; + + GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1); + logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig()); + logger.debug("count = " + count); + StingSAMIterator datum = data.seek(sh); + + // for the first couple of shards make sure we can see the reads + if (count < 5) { + for (SAMRecord r : datum) { + } + readCount++; + } + datum.close(); + + // if we're over 100 shards, break out + if (count > 100) { + break; + } + } + } + catch (UserException.CouldNotReadInputFile e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. 
+ fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); + } + } + + /** Test that we clear program records when requested */ + @Test + public void testRemoveProgramRecords() { + logger.warn("Executing testRemoveProgramRecords"); + + // setup the data + readers.add(new SAMReaderID(new File(b37GoodBAM),new Tags())); + + // use defaults + SAMDataSource data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + SAMFileReader.ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false); + + List defaultProgramRecords = data.getHeader().getProgramRecords(); + assertTrue(defaultProgramRecords.size() != 0, "testRemoveProgramRecords: No program records found when using default constructor"); + + boolean removeProgramRecords = false; + data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + SAMFileReader.ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + Collections.emptyList(), + false, + (byte) -1, + removeProgramRecords, + false, + null); + + List dontRemoveProgramRecords = data.getHeader().getProgramRecords(); + assertEquals(dontRemoveProgramRecords, defaultProgramRecords, "testRemoveProgramRecords: default program records differ from removeProgramRecords = false"); + + removeProgramRecords = true; + data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + SAMFileReader.ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + Collections.emptyList(), + false, + (byte) -1, + removeProgramRecords, + false, + null); + + List doRemoveProgramRecords = data.getHeader().getProgramRecords(); + assertTrue(doRemoveProgramRecords.isEmpty(), "testRemoveProgramRecords: program records not cleared when removeProgramRecords = true"); + } + + @Test(expectedExceptions = UserException.class) + public void 
testFailOnReducedReads() { + readers.add(new SAMReaderID(new File(privateTestDir + "old.reduced.bam"), new Tags())); + + SAMDataSource data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + SAMFileReader.ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderIDUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderIDUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderIDUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderIDUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SeekableBufferedStreamUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SeekableBufferedStreamUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SeekableBufferedStreamUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SeekableBufferedStreamUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/TheoreticalMinimaBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/TheoreticalMinimaBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/TheoreticalMinimaBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/TheoreticalMinimaBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceIntegrationTest.java 
b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java new file mode 100644 index 000000000..c587d5e08 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java @@ -0,0 +1,219 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, 
to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMFileHeader; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + + +/** + * Basic unit test for AlleleBiasedDownsamplingUtils + */ +public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest { + + + @Test + public void testSmartDownsampling() { + + final int[] idealHetAlleleCounts = new int[]{0, 50, 0, 50}; + final int[] idealHomAlleleCounts = new int[]{0, 100, 0, 0}; + + // no contamination, no removal + testOneCase(0, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // hom sample, het contaminant, different alleles + testOneCase(5, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 5, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 0, 5, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // hom sample, hom contaminant, different alleles + testOneCase(10, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 10, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 0, 10, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // het sample, het contaminant, different alleles + testOneCase(5, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // het 
sample, hom contaminant, different alleles + testOneCase(10, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 10, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // hom sample, het contaminant, overlapping alleles + final int[] enhancedHomAlleleCounts = new int[]{0, 105, 0, 0}; + testOneCase(5, 5, 0, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + testOneCase(0, 5, 5, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + testOneCase(0, 5, 0, 5, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + + // hom sample, hom contaminant, overlapping alleles + testOneCase(0, 10, 0, 0, 0.1, 100, idealHomAlleleCounts, new int[]{0, 110, 0, 0}); + + // het sample, het contaminant, overlapping alleles + testOneCase(5, 5, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 5, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 5, 0, 5, 0.1, 100, idealHetAlleleCounts, new int[]{0, 55, 0, 55}); + testOneCase(5, 0, 0, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 5, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // het sample, hom contaminant, overlapping alleles + testOneCase(0, 10, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 0, 10, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + } + + private static void testOneCase(final int addA, final int addC, final int addG, final int addT, final double contaminationFraction, + final int pileupSize, final int[] initialCounts, final int[] targetCounts) { + + final int[] actualCounts = initialCounts.clone(); + actualCounts[0] += addA; + actualCounts[1] += addC; + actualCounts[2] += addG; + actualCounts[3] += addT; + + final int[] results = AlleleBiasedDownsamplingUtils.runSmartDownsampling(actualCounts, (int)(pileupSize * contaminationFraction)); + Assert.assertTrue(countsAreEqual(results, targetCounts)); + } + + private 
static boolean countsAreEqual(final int[] counts1, final int[] counts2) { + for ( int i = 0; i < 4; i++ ) { + if ( counts1[i] != counts2[i] ) + return false; + } + return true; + } + + @DataProvider(name = "BiasedDownsamplingTest") + public Object[][] makeBiasedDownsamplingTest() { + final List tests = new LinkedList(); + + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + + for ( final int originalCount : Arrays.asList(1, 2, 10, 1000) ) { + for ( final int toRemove : Arrays.asList(0, 1, 2, 10, 1000) ) { + if ( toRemove <= originalCount ) + tests.add(new Object[]{header, originalCount, toRemove}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "BiasedDownsamplingTest") + public void testBiasedDownsampling(final SAMFileHeader header, final int originalCount, final int toRemove) { + + final LinkedList elements = new LinkedList<>(); + for ( int i = 0; i < originalCount; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1); + elements.add(new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0)); + } + + final List result = AlleleBiasedDownsamplingUtils.downsampleElements(elements, originalCount, toRemove); + + Assert.assertEquals(result.size(), toRemove); + } + + @Test + public void testLoadContaminationFileDetails(){ + Logger logger=org.apache.log4j.Logger.getRootLogger(); + + final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; + final File ContamFile1=new File(ArtificalBAMLocation+"contamination.case.1.txt"); + + Map Contam1=new HashMap(); + Set Samples1=new HashSet(); + + Contam1.put("NA11918",0.15); + Samples1.addAll(Contam1.keySet()); + testLoadFile(ContamFile1,Samples1,Contam1,logger); + + Contam1.put("NA12842",0.13); + Samples1.addAll(Contam1.keySet()); + testLoadFile(ContamFile1,Samples1,Contam1,logger); + + Samples1.add("DUMMY"); + testLoadFile(ContamFile1,Samples1,Contam1,logger); + } + + 
private static void testLoadFile(final File file, final Set Samples, final Map map, Logger logger){ + Map loadedMap = AlleleBiasedDownsamplingUtils.loadContaminationFile(file,0.0,Samples,logger); + Assert.assertTrue(loadedMap.equals(map)); + } + + @DataProvider(name = "goodContaminationFiles") + public Integer[][] goodContaminationFiles() { + return new Integer[][]{ + {1, 2}, + {2, 3}, + {3, 2}, + {4, 2}, + {5, 3}, + {6, 2}, + {7, 2}, + {8, 2} + }; + } + + @Test(dataProvider = "goodContaminationFiles") + public void testLoadContaminationFile(final Integer ArtificalBAMnumber, final Integer numberOfSamples) { + final String ArtificialBAM = String.format("ArtificallyContaminatedBams/contamination.case.%d.txt", ArtificalBAMnumber); + Logger logger = org.apache.log4j.Logger.getRootLogger(); + + File ContamFile = new File(privateTestDir, ArtificialBAM); + Assert.assertTrue(AlleleBiasedDownsamplingUtils.loadContaminationFile(ContamFile, 0.0, null, logger).size() == numberOfSamples); + + } + + + @DataProvider(name = "badContaminationFiles") + public Integer[][] badContaminationFiles() { + return new Integer[][]{{1}, {2}, {3}, {4}, {5}}; + } + + @Test(dataProvider = "badContaminationFiles", expectedExceptions = UserException.MalformedFile.class) + public void testLoadBrokenContaminationFile(final int i) { + Logger logger = org.apache.log4j.Logger.getRootLogger(); + final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; + + File ContaminationFile = new File(ArtificalBAMLocation + String.format("contamination.case.broken.%d.txt", i)); + AlleleBiasedDownsamplingUtils.loadContaminationFile(ContaminationFile, 0.0, null, logger); + + } + + +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingIntegrationTest.java similarity index 100% rename from 
public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java new file mode 100644 index 000000000..4fd9e491c --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java @@ -0,0 +1,158 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +public class FractionalDownsamplerUnitTest extends BaseTest { + + private static class FractionalDownsamplerTest extends TestDataProvider { + double fraction; + int totalReads; + int expectedMinNumReadsAfterDownsampling; + int expectedMaxNumReadsAfterDownsampling; + int expectedMinDiscardedItems; + int expectedMaxDiscardedItems; + + private static final double EXPECTED_ACCURACY = 0.05; // should be accurate to within +/- this percent + + public FractionalDownsamplerTest( double fraction, int totalReads ) { + super(FractionalDownsamplerTest.class); + + this.fraction = fraction; + this.totalReads = totalReads; + + calculateExpectations(); + + setName(String.format("%s: fraction=%.2f totalReads=%d expectedMinNumReadsAfterDownsampling=%d expectedMaxNumReadsAfterDownsampling=%d", + getClass().getSimpleName(), fraction, totalReads, expectedMinNumReadsAfterDownsampling, expectedMaxNumReadsAfterDownsampling)); + } + + private void calculateExpectations() { + // Require an exact 
match in the 0% and 100% cases + if ( fraction == 0.0 ) { + expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = 0; + expectedMinDiscardedItems = expectedMaxDiscardedItems = totalReads; + } + else if ( fraction == 1.0 ) { + expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = totalReads; + expectedMinDiscardedItems = expectedMaxDiscardedItems = 0; + } + else { + expectedMinNumReadsAfterDownsampling = Math.max((int)((fraction - EXPECTED_ACCURACY) * totalReads), 0); + expectedMaxNumReadsAfterDownsampling = Math.min((int) ((fraction + EXPECTED_ACCURACY) * totalReads), totalReads); + expectedMinDiscardedItems = totalReads - expectedMaxNumReadsAfterDownsampling; + expectedMaxDiscardedItems = totalReads - expectedMinNumReadsAfterDownsampling; + } + } + + public Collection createReads() { + Collection reads = new ArrayList(totalReads); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); + + return reads; + } + } + + @DataProvider(name = "FractionalDownsamplerTestDataProvider") + public Object[][] createFractionalDownsamplerTestData() { + for ( double fraction : Arrays.asList(0.0, 0.25, 0.5, 0.75, 1.0) ) { + for ( int totalReads : Arrays.asList(0, 1000, 10000) ) { + new FractionalDownsamplerTest(fraction, totalReads); + } + } + + return FractionalDownsamplerTest.getTests(FractionalDownsamplerTest.class); + } + + @Test(dataProvider = "FractionalDownsamplerTestDataProvider") + public void runFractionalDownsamplerTest( FractionalDownsamplerTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new FractionalDownsampler(test.fraction); + + downsampler.submit(test.createReads()); + + if ( test.totalReads > 0 ) { + if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { + 
Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + } + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.totalReads > 0 ) { + if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + } + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertTrue(downsampledReads.size() >= test.expectedMinNumReadsAfterDownsampling && + downsampledReads.size() <= test.expectedMaxNumReadsAfterDownsampling); + + Assert.assertTrue(downsampler.getNumberOfDiscardedItems() >= test.expectedMinDiscardedItems && + downsampler.getNumberOfDiscardedItems() <= test.expectedMaxDiscardedItems); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.totalReads - downsampledReads.size()); + + downsampler.resetStats(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } +} diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java 
b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java new file mode 100644 index 000000000..07a8a7975 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java @@ -0,0 +1,163 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; +import org.testng.Assert; + +import java.util.*; + +public class LevelingDownsamplerUnitTest extends BaseTest { + + private static class LevelingDownsamplerUniformStacksTest extends TestDataProvider { + public enum DataStructure { LINKED_LIST, ARRAY_LIST } + + int targetSize; + int numStacks; + int stackSize; + DataStructure dataStructure; + int expectedSize; + + public LevelingDownsamplerUniformStacksTest( int targetSize, int numStacks, int stackSize, DataStructure dataStructure ) { + super(LevelingDownsamplerUniformStacksTest.class); + + this.targetSize = targetSize; + this.numStacks = numStacks; + this.stackSize = stackSize; + this.dataStructure = dataStructure; + expectedSize = calculateExpectedDownsampledStackSize(); + + setName(String.format("%s: targetSize=%d numStacks=%d stackSize=%d dataStructure=%s expectedSize=%d", + getClass().getSimpleName(), targetSize, numStacks, stackSize, dataStructure, expectedSize)); + } + + public Collection> createStacks() { + Collection> stacks = new ArrayList>(); + + for ( int i = 1; i <= numStacks; i++ ) { + List stack = dataStructure == DataStructure.LINKED_LIST ? 
new LinkedList() : new ArrayList(); + + for ( int j = 1; j <= stackSize; j++ ) { + stack.add(new Object()); + } + + stacks.add(stack); + } + + return stacks; + } + + private int calculateExpectedDownsampledStackSize() { + int numItemsToRemove = numStacks * stackSize - targetSize; + + if ( numStacks == 0 ) { + return 0; + } + else if ( numItemsToRemove <= 0 ) { + return stackSize; + } + + return Math.max(1, stackSize - (numItemsToRemove / numStacks)); + } + } + + @DataProvider(name = "UniformStacksDataProvider") + public Object[][] createUniformStacksTestData() { + for ( int targetSize = 1; targetSize <= 10000; targetSize *= 10 ) { + for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { + for ( int stackSize = 1; stackSize <= 1000; stackSize *= 10 ) { + for ( LevelingDownsamplerUniformStacksTest.DataStructure dataStructure : LevelingDownsamplerUniformStacksTest.DataStructure.values() ) { + new LevelingDownsamplerUniformStacksTest(targetSize, numStacks, stackSize, dataStructure); + } + } + } + } + + return LevelingDownsamplerUniformStacksTest.getTests(LevelingDownsamplerUniformStacksTest.class); + } + + @Test( dataProvider = "UniformStacksDataProvider" ) + public void testLevelingDownsamplerWithUniformStacks( LevelingDownsamplerUniformStacksTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + Downsampler> downsampler = new LevelingDownsampler, Object>(test.targetSize); + + downsampler.submit(test.createStacks()); + + if ( test.numStacks > 0 ) { + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( 
test.numStacks > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + final int sizeFromDownsampler = downsampler.size(); + List> downsampledStacks = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertEquals(downsampledStacks.size(), test.numStacks); + + int totalRemainingItems = 0; + for ( List stack : downsampledStacks ) { + Assert.assertTrue(Math.abs(stack.size() - test.expectedSize) <= 1); + totalRemainingItems += stack.size(); + } + + Assert.assertEquals(sizeFromDownsampler, totalRemainingItems); + int numItemsReportedDiscarded = downsampler.getNumberOfDiscardedItems(); + int numItemsActuallyDiscarded = test.numStacks * test.stackSize - totalRemainingItems; + + Assert.assertEquals(numItemsReportedDiscarded, numItemsActuallyDiscarded); + + downsampler.resetStats(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + + Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks)); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java new file mode 100644 index 000000000..66abfd29b --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java @@ -0,0 +1,131 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +public class ReservoirDownsamplerUnitTest extends BaseTest { + + private static class ReservoirDownsamplerTest extends TestDataProvider { + int reservoirSize; + int totalReads; + int expectedNumReadsAfterDownsampling; + int expectedNumDiscardedItems; + + public ReservoirDownsamplerTest( int reservoirSize, int totalReads ) { + super(ReservoirDownsamplerTest.class); + + this.reservoirSize = reservoirSize; + this.totalReads = totalReads; + + expectedNumReadsAfterDownsampling = Math.min(reservoirSize, totalReads); + expectedNumDiscardedItems = totalReads <= reservoirSize ? 
0 : totalReads - reservoirSize; + + setName(String.format("%s: reservoirSize=%d totalReads=%d expectedNumReadsAfterDownsampling=%d expectedNumDiscardedItems=%d", + getClass().getSimpleName(), reservoirSize, totalReads, expectedNumReadsAfterDownsampling, expectedNumDiscardedItems)); + } + + public Collection createReads() { + Collection reads = new ArrayList(totalReads); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); + + return reads; + } + } + + @DataProvider(name = "ReservoirDownsamplerTestDataProvider") + public Object[][] createReservoirDownsamplerTestData() { + for ( int reservoirSize = 1; reservoirSize <= 10000; reservoirSize *= 10 ) { + new ReservoirDownsamplerTest(reservoirSize, 0); + for ( int totalReads = 1; totalReads <= 10000; totalReads *= 10 ) { + new ReservoirDownsamplerTest(reservoirSize, totalReads); + } + } + + return ReservoirDownsamplerTest.getTests(ReservoirDownsamplerTest.class); + } + + @Test(dataProvider = "ReservoirDownsamplerTestDataProvider") + public void testReservoirDownsampler( ReservoirDownsamplerTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new ReservoirDownsampler(test.reservoirSize); + + downsampler.submit(test.createReads()); + + if ( test.totalReads > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.totalReads > 0 ) { + 
Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + Assert.assertEquals(downsampler.size(), test.expectedNumReadsAfterDownsampling); + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertEquals(downsampledReads.size(), test.expectedNumReadsAfterDownsampling); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.expectedNumDiscardedItems); + Assert.assertEquals(test.totalReads - downsampledReads.size(), test.expectedNumDiscardedItems); + + downsampler.resetStats(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } +} diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java new file mode 100644 index 000000000..afe8729c2 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java @@ -0,0 +1,331 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, 
and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.*; + +public class SimplePositionalDownsamplerUnitTest extends BaseTest { + + private static class SimplePositionalDownsamplerTest extends TestDataProvider { + int targetCoverage; + int numStacks; + List stackSizes; + List expectedStackSizes; + boolean multipleContigs; + int totalInitialReads; + + public SimplePositionalDownsamplerTest( int targetCoverage, List stackSizes, boolean multipleContigs ) { + super(SimplePositionalDownsamplerTest.class); + + this.targetCoverage = targetCoverage; + this.numStacks = stackSizes.size(); + this.stackSizes = stackSizes; + this.multipleContigs = multipleContigs; + + calculateExpectedDownsampledStackSizes(); + + totalInitialReads = 0; + for ( Integer stackSize : stackSizes ) { + totalInitialReads += stackSize; + } + + 
setName(String.format("%s: targetCoverage=%d numStacks=%d stackSizes=%s expectedSizes=%s multipleContigs=%b", + getClass().getSimpleName(), targetCoverage, numStacks, stackSizes, expectedStackSizes, multipleContigs)); + } + + public Collection createReads() { + Collection reads = new ArrayList(); + SAMFileHeader header = multipleContigs ? + ArtificialSAMUtils.createArtificialSamHeader(2, 1, 1000000) : + ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + int refIndex = 0; + int alignmentStart = 1; + int readLength = 100; + + for ( int i = 0; i < numStacks; i++ ) { + if ( multipleContigs && refIndex == 0 && i >= numStacks / 2 ) { + refIndex++; + } + + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(stackSizes.get(i), header, "foo", + refIndex, alignmentStart, readLength)); + + alignmentStart += 10; + } + + return reads; + } + + private void calculateExpectedDownsampledStackSizes() { + expectedStackSizes = new ArrayList(numStacks); + + for ( Integer stackSize : stackSizes ) { + int expectedSize = targetCoverage >= stackSize ? 
stackSize : targetCoverage; + expectedStackSizes.add(expectedSize); + } + } + } + + @DataProvider(name = "SimplePositionalDownsamplerTestDataProvider") + public Object[][] createSimplePositionalDownsamplerTestData() { + GenomeAnalysisEngine.resetRandomGenerator(); + + for ( int targetCoverage = 1; targetCoverage <= 10000; targetCoverage *= 10 ) { + for ( int contigs = 1; contigs <= 2; contigs++ ) { + for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { + List stackSizes = new ArrayList(numStacks); + for ( int stack = 1; stack <= numStacks; stack++ ) { + stackSizes.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(targetCoverage * 2) + 1); + } + new SimplePositionalDownsamplerTest(targetCoverage, stackSizes, contigs > 1); + } + } + } + + return SimplePositionalDownsamplerTest.getTests(SimplePositionalDownsamplerTest.class); + } + + @Test( dataProvider = "SimplePositionalDownsamplerTestDataProvider" ) + public void testSimplePostionalDownsampler( SimplePositionalDownsamplerTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new SimplePositionalDownsampler(test.targetCoverage); + + downsampler.submit(test.createReads()); + + if ( test.numStacks > 1 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else if ( test.numStacks == 1 ) { + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( 
test.numStacks > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + if ( test.numStacks == 0 ) { + Assert.assertTrue(downsampledReads.isEmpty()); + } + else { + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampledReads); + + Assert.assertEquals(downsampledStackSizes.size(), test.numStacks); + Assert.assertEquals(downsampledStackSizes, test.expectedStackSizes); + + int numReadsActuallyEliminated = test.totalInitialReads - downsampledReads.size(); + int numReadsReportedEliminated = downsampler.getNumberOfDiscardedItems(); + Assert.assertEquals(numReadsActuallyEliminated, numReadsReportedEliminated); + } + + downsampler.resetStats(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } + + private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { + List stackSizes = new ArrayList(); + + if ( downsampledReads.isEmpty() ) { + return stackSizes; + } + + Iterator iter = downsampledReads.iterator(); + Assert.assertTrue(iter.hasNext()); + + SAMRecord previousRead = iter.next(); + int currentStackSize = 1; + + while ( iter.hasNext() ) { + SAMRecord currentRead = iter.next(); + + if ( currentRead.getReferenceIndex() > previousRead.getReferenceIndex() || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { + stackSizes.add(currentStackSize); + currentStackSize = 1; + } 
+ else if ( currentRead.getReferenceIndex() < previousRead.getReferenceIndex() || currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { + Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); + } + else { + currentStackSize++; + } + + previousRead = currentRead; + } + + stackSizes.add(currentStackSize); + return stackSizes; + } + + @Test + public void testSimplePositionalDownsamplerSignalNoMoreReadsBefore() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection readStack = new ArrayList(); + readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(50, header, "foo", 0, 1, 100)); + downsampler.submit(readStack); + + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + + SAMRecord laterRead = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 2, 100); + downsampler.signalNoMoreReadsBefore(laterRead); + + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(downsampledReads.size(), readStack.size()); + } + + @Test + public void testBasicUnmappedReadsSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection readStack = new ArrayList(); + readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, 100)); + for ( SAMRecord read : 
readStack ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + downsampler.submit(readStack); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler + Assert.assertEquals(downsampledReads.size(), readStack.size()); + + for ( SAMRecord read: downsampledReads ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + } + + @Test + public void testMixedMappedAndUnmappedReadsSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection mappedReadStack = new ArrayList(); + mappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", 0, 1, 100)); + for ( SAMRecord read : mappedReadStack ) { + Assert.assertFalse(read.getReadUnmappedFlag()); + } + + Collection unmappedReadStack = new ArrayList(); + unmappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, 100)); + for ( SAMRecord read : unmappedReadStack ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + downsampler.submit(mappedReadStack); + downsampler.submit(unmappedReadStack); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler + Assert.assertEquals(downsampledReads.size(), 300); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 100); + + int count = 1; + for ( SAMRecord read: downsampledReads ) { + if ( count <= 100 ) { + Assert.assertFalse(read.getReadUnmappedFlag()); + } + else { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + count++; + } + } + + @Test + public void testGATKSAMRecordSupport() { + ReadsDownsampler downsampler = new 
SimplePositionalDownsampler(1000); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + List reads = new ArrayList(); + for ( int i = 0; i < 10; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); + } + + downsampler.submit(reads); + downsampler.signalEndOfInput(); + List downsampledReads = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(downsampledReads.size(), 10); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/executive/ReduceTreeUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/executive/ReduceTreeUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/executive/ReduceTreeUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/executive/ReduceTreeUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/AllowNCigarMalformedReadFilterUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/AllowNCigarMalformedReadFilterUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/filters/AllowNCigarMalformedReadFilterUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/AllowNCigarMalformedReadFilterUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/BadReadGroupsIntegrationTest.java 
b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/BadReadGroupsIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/filters/BadReadGroupsIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/BadReadGroupsIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/MalformedReadFilterUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/MalformedReadFilterUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/filters/MalformedReadFilterUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/MalformedReadFilterUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/ReadFilterTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/ReadFilterTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/filters/ReadFilterTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/ReadFilterTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilterUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilterUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilterUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilterUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/UnsafeMalformedReadFilterUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/UnsafeMalformedReadFilterUnitTest.java similarity index 100% rename from 
public/java/test/org/broadinstitute/sting/gatk/filters/UnsafeMalformedReadFilterUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/UnsafeMalformedReadFilterUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/ReadFormattingIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/iterators/ReadFormattingIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/iterators/ReadFormattingIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/iterators/ReadFormattingIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapterUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapterUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapterUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapterUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/CheckableCloseableTribbleIterator.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/CheckableCloseableTribbleIterator.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/utils/CheckableCloseableTribbleIterator.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/CheckableCloseableTribbleIterator.java diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestFeatureReader.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/TestFeatureReader.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestFeatureReader.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/TestFeatureReader.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java 
b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java diff --git 
a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java new file mode 100644 index 000000000..62f4bdc88 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.traversals; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class TAROrderedReadCacheUnitTest extends BaseTest { + // example fasta index file, can be deleted if you don't use the reference + private IndexedFastaSequenceFile seq; + + @BeforeClass + public void setup() throws FileNotFoundException { + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + } + + @DataProvider(name = "ReadCacheTestData") + public Object[][] makeReadCacheTestData() { + List tests = new ArrayList(); + + for ( final int nReadsPerLocus : Arrays.asList(0, 1, 10, 100) ) { + for ( final int nLoci : Arrays.asList(1, 10, 100) ) { + for ( final int max : Arrays.asList(10, 50, 1000) ) { + for ( final boolean addAllAtOnce : Arrays.asList(true, false) ) { + tests.add(new Object[]{nReadsPerLocus, nLoci, max, addAllAtOnce}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ReadCacheTestData") + public void testReadCache(final int nReadsPerLocus, final int nLoci, final int max, final boolean addAllAtOnce) { + final TAROrderedReadCache cache = new TAROrderedReadCache(max); + + Assert.assertEquals(cache.getMaxCapacity(), max); + Assert.assertEquals(cache.getNumDiscarded(), 0); + Assert.assertEquals(cache.size(), 0); + + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(seq, nReadsPerLocus, nLoci); + final List reads = 
bamBuilder.makeReads(); + + if ( addAllAtOnce ) { + cache.addAll(reads); + } else { + for ( final GATKSAMRecord read : reads ) { + cache.add(read); + } + } + + final int nTotalReads = reads.size(); + final int nExpectedToKeep = Math.min(nTotalReads, max); + final int nExpectedToDiscard = nTotalReads - nExpectedToKeep; + Assert.assertEquals(cache.getNumDiscarded(), nExpectedToDiscard, "wrong number of reads discarded"); + Assert.assertEquals(cache.size(), nExpectedToKeep, "wrong number of reads kept"); + + final List cacheReads = cache.popCurrentReads(); + Assert.assertEquals(cache.size(), 0, "Should be no reads left"); + Assert.assertEquals(cache.getNumDiscarded(), 0, "should have reset stats"); + Assert.assertEquals(cacheReads.size(), nExpectedToKeep, "should have 1 read for every read we expected to keep"); + + verifySortednessOfReads(cacheReads); + } + + private void verifySortednessOfReads( final List reads) { + int lastStart = -1; + for ( GATKSAMRecord read : reads ) { + Assert.assertTrue(lastStart <= read.getAlignmentStart(), "Reads should be sorted but weren't. 
Found read with start " + read.getAlignmentStart() + " while last was " + lastStart); + lastStart = read.getAlignmentStart(); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtilUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtilUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtilUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtilUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java new file mode 100644 index 000000000..5392e8037 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java @@ -0,0 +1,70 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the 
Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.coverage; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class CallableLociIntegrationTest extends WalkerTest { + final static String commonArgs = "-R " + b36KGReference + " -T CallableLoci -I " + validationDataLocation + "/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s"; + final static String reduceReadArgs = "-R " + b37KGReference + " -T CallableLoci -I " + " private/testdata/NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s"; + + final static String SUMMARY_MD5 = "a6f5963669f19d9d137ced87d65834b0"; + + @Test + public void testCallableLociWalkerBed() { + String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -summary %s"; + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, + Arrays.asList("9b4ffea1dbcfefadeb1c9fa74b0e0e59", SUMMARY_MD5)); + executeTest("formatBed", spec); + } + + @Test + public void testCallableLociWalkerPerBase() { + String gatk_args = commonArgs + " -format STATE_PER_BASE -L 1:10,000,000-11,000,000 -summary %s"; + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, + Arrays.asList("d6505e489899e80c08a7168777f6e07b", SUMMARY_MD5)); + executeTest("format_state_per_base", spec); + } + + @Test + public void testCallableLociWalker2() { + String gatk_args = commonArgs + " -format BED -L 1:10,000,000-10,000,100 -L 1:10,000,110-10,000,120 -summary %s"; + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, + 
Arrays.asList("330f476085533db92a9dbdb3a127c041", "d287510eac04acf5a56f5cde2cba0e4a")); + executeTest("formatBed by interval", spec); + } + + @Test + public void testCallableLociWalker3() { + String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 --minMappingQuality 20 -summary %s"; + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, + Arrays.asList("7f79ad8195c4161060463eeb21d2bb11", "7ee269e5f4581a924529a356cc806e55")); + executeTest("formatBed lots of arguments", spec); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalkerIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalkerIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalkerIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalkerIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java new file mode 100644 index 000000000..0971cb90b --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java @@ -0,0 +1,70 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.testng.annotations.Test; +import org.broadinstitute.sting.WalkerTest; + +import java.util.Collections; + +/** + * Run validating pileup across a set of core data as proof of the integrity of the GATK core. 
+ * + * Tests both types of old-school pileup formats (basic and consensus). + * + * @author mhanna, vdauwera + * @version 0.2 + */ +public class CheckPileupIntegrationTest extends WalkerTest { + /** + * This test runs on a consensus pileup containing 10-column lines for SNPs and 13-column lines for indels + */ + @Test(enabled = true) + public void testEcoliConsensusPileup() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T CheckPileup" + + " -I " + validationDataLocation + "MV1994.selected.bam" + + " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + + " --pileup:SAMPileup "+ validationDataLocation + "MV1994.selected.pileup" + + " -S SILENT -nt 8",0, Collections.emptyList()); + executeTest("testEcoliConsensusPileup",spec); + } + + /** + * This test runs on a basic pileup containing 6-column lines for all variants TODO + */ + @Test + public void testEcoliBasicPileup() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T CheckPileup" + + " -I " + validationDataLocation + "MV1994.selected.bam" + + " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + + " --pileup:SAMPileup "+ validationDataLocation + "MV1994.basic.pileup" + + " -L Escherichia_coli_K12:1-49" + + " -S SILENT -nt 8",0, Collections.emptyList()); + executeTest("testEcoliBasicPileup",spec); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CountReadsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/CountReadsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CountReadsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/CountReadsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java 
b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/FlagStatIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/FlagStatIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/qc/FlagStatIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/FlagStatIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/PileupWalkerIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/PileupWalkerIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/qc/PileupWalkerIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/PileupWalkerIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ClipReadsWalkersIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/ClipReadsWalkersIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ClipReadsWalkersIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/ClipReadsWalkersIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsIntegrationTest.java 
b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsLargeScaleTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsLargeScaleTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsLargeScaleTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsLargeScaleTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariantsUnitTest.java 
b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariantsUnitTest.java new file mode 100644 index 000000000..83d571748 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariantsUnitTest.java @@ -0,0 +1,54 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + + +public class FilterLiftedVariantsUnitTest extends BaseTest { + + @Test + public void testIndelAtEndOfContig() { + + final List alleles = new ArrayList<>(2); + alleles.add(Allele.create("AAAAA", true)); + alleles.add(Allele.create("A", false)); + final VariantContext vc = new VariantContextBuilder("test", "1", 10, 14, alleles).make(); + + final FilterLiftedVariants filter = new FilterLiftedVariants(); + + Assert.assertFalse(filter.filterOrWrite(new byte[]{'A'}, vc)); + } + +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/jna/clibrary/LibCUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/jna/clibrary/LibCUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/jna/clibrary/LibCUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/jna/clibrary/LibCUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionPipelineTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionPipelineTest.java similarity 
index 100% rename from public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionPipelineTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionPipelineTest.java diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaPipelineTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaPipelineTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaPipelineTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaPipelineTest.java diff --git a/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatPipelineTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatPipelineTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatPipelineTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatPipelineTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/AutoFormattingTimeUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/AutoFormattingTimeUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/AutoFormattingTimeUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/AutoFormattingTimeUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java 
b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/GenomeLocParserBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/GenomeLocParserBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/GenomeLocParserBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/GenomeLocUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/GenomeLocUnitTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/utils/MRUCachingSAMSequencingDictionaryUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MRUCachingSAMSequencingDictionaryUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/MRUCachingSAMSequencingDictionaryUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MRUCachingSAMSequencingDictionaryUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MWUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MWUnitTest.java new file mode 100644 index 000000000..312e4d5b1 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MWUnitTest.java @@ -0,0 +1,131 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.collections.Pair; + +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; +import org.testng.Assert; + +/** + * Created by IntelliJ IDEA. + * User: Ghost + * Date: 3/5/11 + * Time: 2:06 PM + * To change this template use File | Settings | File Templates. + */ +public class MWUnitTest extends BaseTest { + @BeforeClass + public void init() { } + + @Test + private void testMWU() { + logger.warn("Testing MWU"); + MannWhitneyU mwu = new MannWhitneyU(); + mwu.add(0, MannWhitneyU.USet.SET1); + mwu.add(1,MannWhitneyU.USet.SET2); + mwu.add(2,MannWhitneyU.USet.SET2); + mwu.add(3,MannWhitneyU.USet.SET2); + mwu.add(4,MannWhitneyU.USet.SET2); + mwu.add(5,MannWhitneyU.USet.SET2); + mwu.add(6,MannWhitneyU.USet.SET1); + mwu.add(7,MannWhitneyU.USet.SET1); + mwu.add(8,MannWhitneyU.USet.SET1); + mwu.add(9,MannWhitneyU.USet.SET1); + mwu.add(10,MannWhitneyU.USet.SET1); + mwu.add(11,MannWhitneyU.USet.SET2); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),25L); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),11L); + + MannWhitneyU mwu2 = new MannWhitneyU(); + MannWhitneyU mwuNoDither = new MannWhitneyU(false); + for ( int dp : new int[]{2,4,5,6,8} ) { + mwu2.add(dp,MannWhitneyU.USet.SET1); + mwuNoDither.add(dp,MannWhitneyU.USet.SET1); + } + + for ( int dp : new int[]{1,3,7,9,10,11,12,13} ) { + mwu2.add(dp,MannWhitneyU.USet.SET2); + mwuNoDither.add(dp,MannWhitneyU.USet.SET2); + } + + MannWhitneyU.ExactMode pm = MannWhitneyU.ExactMode.POINT; + MannWhitneyU.ExactMode cm = MannWhitneyU.ExactMode.CUMULATIVE; + + // tests using the hypothesis that set 2 dominates set 1 (U value = 10) + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET1),10L); + 
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET2),30L); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET1),10L); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET2),30L); + + Pair sizes = mwu2.getSetSizes(); + + Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.first,sizes.second,10L),0.4180519701814064,1e-14); + Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.first,sizes.second,10L,false,pm).second,0.021756021756021756,1e-14); + Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.first,sizes.second,10L,false).second,0.06214143703127617,1e-14); + logger.warn("Testing two-sided"); + Assert.assertEquals((double)mwu2.runTwoSidedTest().second,2*0.021756021756021756,1e-8); + + // tests using the hypothesis that set 1 dominates set 2 (U value = 30) -- empirical should be identical, normall approx close, uniform way off + Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.second,sizes.first,30L,true).second,2.0*0.08216463976903321,1e-14); + Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.second,sizes.first,30L),0.0023473625009559074,1e-14); + Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,30L,false,pm).second,0.021756021756021756,1e-14); // note -- exactly same value as above + Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,cm).second,1.0-0.08547008547008,1e-14); // r does a correction, subtracting 1 from U + Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,cm).second,0.08547008547008,1e-14); // r does a correction, subtracting 1 from U + Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,cm).first,-1.36918910442,1e-2); // apache inversion set to be good only to 1e-2 + 
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,cm).first,1.36918910442,1e-2); // apache inversion set to be good only to 1e-2 + Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,pm).first,1.2558754796642067,1e-8); // PDF should be similar + Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,pm).first,-1.2558754796642067,1e-8); // PDF should be similar + Assert.assertEquals(MannWhitneyU.calculatePRecursively(4,5,10L,false,pm).second,0.0952381,1e-5); + Assert.assertEquals(MannWhitneyU.calculatePRecursively(4,5,10L,false,pm).first,0.0,1e-14); + + logger.warn("Set 1"); + Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET1).second,0.021756021756021756,1e-8); + logger.warn("Set 2"); + Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET2).second,0.021756021756021756,1e-8); + + MannWhitneyU mwu3 = new MannWhitneyU(); + for ( int dp : new int[]{0,2,4} ) { + mwu3.add(dp,MannWhitneyU.USet.SET1); + } + for ( int dp : new int[]{1,5,6,7,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34} ) { + mwu3.add(dp,MannWhitneyU.USet.SET2); + } + long u = MannWhitneyU.calculateOneSidedU(mwu3.getObservations(),MannWhitneyU.USet.SET1); + //logger.warn(String.format("U is: %d",u)); + Pair nums = mwu3.getSetSizes(); + //logger.warn(String.format("Corrected p is: %.4e",MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first,nums.second,u))); + //logger.warn(String.format("Counted sequences: %d",MannWhitneyU.countSequences(nums.first, nums.second, u))); + //logger.warn(String.format("Possible sequences: %d", (long) Arithmetic.binomial(nums.first+nums.second,nums.first))); + //logger.warn(String.format("Ratio: %.4e",MannWhitneyU.countSequences(nums.first,nums.second,u)/Arithmetic.binomial(nums.first+nums.second,nums.first))); + 
Assert.assertEquals(MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first, nums.second, u), 3.665689149560116E-4, 1e-14); + Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(nums.first,nums.second,u,false).second,0.0032240865760884696,1e-14); + Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(nums.first,nums.second,u),0.0026195003025784036,1e-14); + + } +} diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MathUtilsUnitTest.java new file mode 100644 index 000000000..1bcf38d10 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -0,0 +1,913 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils; + +import cern.jet.random.Normal; +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Basic unit test for MathUtils + */ +public class MathUtilsUnitTest extends BaseTest { + + @BeforeClass + public void init() { + } + + /** + * Tests that we get unique values for the valid (non-null-producing) input space for {@link MathUtils#fastGenerateUniqueHashFromThreeIntegers(int, int, int)}. + */ + @Test + public void testGenerateUniqueHashFromThreePositiveIntegers() { + logger.warn("Executing testGenerateUniqueHashFromThreePositiveIntegers"); + + final Set observedLongs = new HashSet<>(); + for (short i = 0; i < Byte.MAX_VALUE; i++) { + for (short j = 0; j < Byte.MAX_VALUE; j++) { + for (short k = 0; k < Byte.MAX_VALUE; k++) { + final Long aLong = MathUtils.fastGenerateUniqueHashFromThreeIntegers(i, j, k); + //System.out.println(String.format("%s, %s, %s: %s", i, j, k, aLong)); + Assert.assertTrue(observedLongs.add(aLong)); + } + } + } + + for (short i = Byte.MAX_VALUE; i <= Short.MAX_VALUE && i > 0; i += 128) { + for (short j = Byte.MAX_VALUE; j <= Short.MAX_VALUE && j > 0; j += 128) { + for (short k = Byte.MAX_VALUE; k <= Short.MAX_VALUE && k > 0; k += 128) { + final Long aLong = MathUtils.fastGenerateUniqueHashFromThreeIntegers(i, j, k); + // System.out.println(String.format("%s, %s, %s: %s", i, j, k, aLong)); + Assert.assertTrue(observedLongs.add(aLong)); + } + } + } + } + + @Test(dataProvider = "log10OneMinusPow10Data") + public void testLog10OneMinusPow10(final double x, final double expected) { + final double actual = MathUtils.log10OneMinusPow10(x); + if (Double.isNaN(expected)) + Assert.assertTrue(Double.isNaN(actual)); + else + Assert.assertEquals(actual,expected,1E-9); + } + + @Test(dataProvider = 
"log1mexpData") + public void testLog1mexp(final double x, final double expected) { + final double actual = MathUtils.log1mexp(x); + if (Double.isNaN(expected)) + Assert.assertTrue(Double.isNaN(actual)); + else + Assert.assertEquals(actual,expected,1E-9); + } + + @DataProvider(name = "log10OneMinusPow10Data") + public Iterator log10OneMinusPow10Data() { + + final double[] inValues = new double[] { Double.NaN, 10, 1, 0, -1, -3, -10, -30, -100, -300, -1000, -3000 }; + return new Iterator() { + + private int i = 0; + + @Override + public boolean hasNext() { + return i < inValues.length; + + } + + @Override + public Object[] next() { + final double input = inValues[i++]; + final double output = Math.log10( 1 - Math.pow(10,input)); + return new Object[] { input, output }; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + @DataProvider(name = "log1mexpData") + public Iterator log1mexpData() { + + final double[] inValues = new double[] { Double.NaN, 10, 1, 0, -1, -3, -10, -30, -100, -300, -1000, -3000 }; + return new Iterator() { + + private int i = 0; + + @Override + public boolean hasNext() { + return i < inValues.length; + + } + + @Override + public Object[] next() { + final double input = inValues[i++]; + final double output = Math.log( 1 - Math.exp(input)); + return new Object[] { input, output }; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + /** + * Tests that we get the right values from the binomial distribution + */ + @Test + public void testBinomialProbability() { + logger.warn("Executing testBinomialProbability"); + + Assert.assertEquals(MathUtils.binomialProbability(3, 2, 0.5), 0.375, 0.0001); + Assert.assertEquals(MathUtils.binomialProbability(100, 10, 0.5), 1.365543e-17, 1e-18); + Assert.assertEquals(MathUtils.binomialProbability(217, 73, 0.02), 4.521904e-67, 1e-68); + Assert.assertEquals(MathUtils.binomialProbability(300, 100, 0.02), 
9.27097e-91, 1e-92); + Assert.assertEquals(MathUtils.binomialProbability(300, 150, 0.98), 6.462892e-168, 1e-169); + Assert.assertEquals(MathUtils.binomialProbability(300, 120, 0.98), 3.090054e-221, 1e-222); + Assert.assertEquals(MathUtils.binomialProbability(300, 112, 0.98), 2.34763e-236, 1e-237); + } + + /** + * Tests that we get the right values from the binomial distribution + */ + @Test + public void testCumulativeBinomialProbability() { + logger.warn("Executing testCumulativeBinomialProbability"); + + for (int j = 0; j < 2; j++) { // Test memoizing functionality, as well. + final int numTrials = 10; + for ( int i = 0; i < numTrials; i++ ) + Assert.assertEquals(MathUtils.binomialCumulativeProbability(numTrials, i, i), MathUtils.binomialProbability(numTrials, i), 1e-10, String.format("k=%d, n=%d", i, numTrials)); + + Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 2), 0.05468750, 1e-7); + Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 5), 0.62304687, 1e-7); + Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 10), 1.0, 1e-7); + } + } + + /** + * Tests that we get the right values from the multinomial distribution + */ + @Test + public void testMultinomialProbability() { + logger.warn("Executing testMultinomialProbability"); + + int[] counts0 = {2, 0, 1}; + double[] probs0 = {0.33, 0.33, 0.34}; + Assert.assertEquals(MathUtils.multinomialProbability(counts0, probs0), 0.111078, 1e-6); + + int[] counts1 = {10, 20, 30}; + double[] probs1 = {0.25, 0.25, 0.50}; + Assert.assertEquals(MathUtils.multinomialProbability(counts1, probs1), 0.002870301, 1e-9); + + int[] counts2 = {38, 82, 50, 36}; + double[] probs2 = {0.25, 0.25, 0.25, 0.25}; + Assert.assertEquals(MathUtils.multinomialProbability(counts2, probs2), 1.88221e-09, 1e-10); + + int[] counts3 = {1, 600, 1}; + double[] probs3 = {0.33, 0.33, 0.34}; + Assert.assertEquals(MathUtils.multinomialProbability(counts3, probs3), 5.20988e-285, 1e-286); + } + + /** + * Tests 
that the random index selection is working correctly + */ + @Test + public void testRandomIndicesWithReplacement() { + logger.warn("Executing testRandomIndicesWithReplacement"); + + // Check that the size of the list returned is correct + Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 0).size() == 0); + Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 1).size() == 1); + Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 5).size() == 5); + Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 1000).size() == 1000); + + // Check that the list contains only the k element range that as asked for - no more, no less + List Five = new ArrayList<>(); + Collections.addAll(Five, 0, 1, 2, 3, 4); + List BigFive = MathUtils.sampleIndicesWithReplacement(5, 10000); + Assert.assertTrue(BigFive.containsAll(Five)); + Assert.assertTrue(Five.containsAll(BigFive)); + } + + /** + * Tests that we get the right values from the multinomial distribution + */ + @Test + public void testSliceListByIndices() { + logger.warn("Executing testSliceListByIndices"); + + // Check that the list contains only the k element range that as asked for - no more, no less but now + // use the index list to pull elements from another list using sliceListByIndices + List Five = new ArrayList<>(); + Collections.addAll(Five, 0, 1, 2, 3, 4); + List FiveAlpha = new ArrayList<>(); + Collections.addAll(FiveAlpha, 'a', 'b', 'c', 'd', 'e'); + List BigFive = MathUtils.sampleIndicesWithReplacement(5, 10000); + List BigFiveAlpha = MathUtils.sliceListByIndices(BigFive, FiveAlpha); + Assert.assertTrue(BigFiveAlpha.containsAll(FiveAlpha)); + Assert.assertTrue(FiveAlpha.containsAll(BigFiveAlpha)); + } + + /** + * Tests that we correctly compute mean and standard deviation from a stream of numbers + */ + @Test + public void testRunningAverage() { + logger.warn("Executing testRunningAverage"); + + int[] numbers = {1, 2, 4, 5, 3, 128, 25678, -24}; + MathUtils.RunningAverage r = new 
MathUtils.RunningAverage(); + + for (final double b : numbers) + r.add(b); + + Assert.assertEquals((long) numbers.length, r.observationCount()); + Assert.assertTrue(r.mean() - 3224.625 < 2e-10); + Assert.assertTrue(r.stddev() - 9072.6515881128 < 2e-10); + } + + @Test + public void testLog10Gamma() { + logger.warn("Executing testLog10Gamma"); + + Assert.assertEquals(MathUtils.log10Gamma(4.0), 0.7781513, 1e-6); + Assert.assertEquals(MathUtils.log10Gamma(10), 5.559763, 1e-6); + Assert.assertEquals(MathUtils.log10Gamma(10654), 38280.53, 1e-2); + } + + @Test + public void testLog10BinomialCoefficient() { + logger.warn("Executing testLog10BinomialCoefficient"); + // note that we can test the binomial coefficient calculation indirectly via Newton's identity + // (1+z)^m = sum (m choose k)z^k + double[] z_vals = new double[]{0.999,0.9,0.8,0.5,0.2,0.01,0.0001}; + int[] exponent = new int[]{5,15,25,50,100}; + for ( double z : z_vals ) { + double logz = Math.log10(z); + for ( int exp : exponent ) { + double expected_log = exp*Math.log10(1+z); + double[] newtonArray_log = new double[1+exp]; + for ( int k = 0 ; k <= exp; k++ ) { + newtonArray_log[k] = MathUtils.log10BinomialCoefficient(exp,k)+k*logz; + } + Assert.assertEquals(MathUtils.log10sumLog10(newtonArray_log),expected_log,1e-6); + } + } + + Assert.assertEquals(MathUtils.log10BinomialCoefficient(4, 2), 0.7781513, 1e-6); + Assert.assertEquals(MathUtils.log10BinomialCoefficient(10, 3), 2.079181, 1e-6); + Assert.assertEquals(MathUtils.log10BinomialCoefficient(103928, 119), 400.2156, 1e-4); + } + + @Test + public void testFactorial() { + logger.warn("Executing testFactorial"); + Assert.assertEquals((int) MathUtils.factorial(4), 24); + Assert.assertEquals((int) MathUtils.factorial(10), 3628800); + Assert.assertEquals((int) MathUtils.factorial(12), 479001600); + } + + @Test + public void testLog10Factorial() { + logger.warn("Executing testLog10Factorial"); + Assert.assertEquals(MathUtils.log10Factorial(4), 1.380211, 1e-6); + 
Assert.assertEquals(MathUtils.log10Factorial(10), 6.559763, 1e-6); + Assert.assertEquals(MathUtils.log10Factorial(12), 8.680337, 1e-6); + Assert.assertEquals(MathUtils.log10Factorial(200), 374.8969, 1e-3); + Assert.assertEquals(MathUtils.log10Factorial(12342), 45138.26, 1e-1); + double log10factorial_small = 0; + double log10factorial_middle = 374.8969; + double log10factorial_large = 45138.26; + int small_start = 1; + int med_start = 200; + int large_start = 12342; + for ( int i = 1; i < 1000; i++ ) { + log10factorial_small += Math.log10(i+small_start); + log10factorial_middle += Math.log10(i+med_start); + log10factorial_large += Math.log10(i+large_start); + Assert.assertEquals(MathUtils.log10Factorial(small_start+i),log10factorial_small,1e-6); + Assert.assertEquals(MathUtils.log10Factorial(med_start+i),log10factorial_middle,1e-3); + Assert.assertEquals(MathUtils.log10Factorial(large_start+i),log10factorial_large,1e-1); + } + } + + @Test + public void testApproximateLog10SumLog10() { + + final double requiredPrecision = 1E-4; + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0}), 0.0, requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-5.15}), -5.15, requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {130.0}), 130.0, requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.145}), -0.145, requiredPrecision); + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5), 
Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, Double.NEGATIVE_INFINITY), -0.12345, requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, Double.NEGATIVE_INFINITY), -15.7654, requiredPrecision); + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 
requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), requiredPrecision); + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0, -2.5}), 
Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), requiredPrecision); + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + 
Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0, -2.5), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5, -1.1), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1, 0.5), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2, 1.3), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2, 18.1), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2, 26.6), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1, -45.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6, -26.2), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456, -0.34567), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101, -17.9341), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, 
-17.9341)), requiredPrecision); + + // magnitude of the sum doesn't matter, so we can combinatorially test this via partitions of unity + double[] mult_partitionFactor = new double[]{0.999,0.98,0.95,0.90,0.8,0.5,0.3,0.1,0.05,0.001}; + int[] n_partitions = new int[] {2,4,8,16,32,64,128,256,512,1028}; + for ( double alpha : mult_partitionFactor ) { + double log_alpha = Math.log10(alpha); + double log_oneMinusAlpha = Math.log10(1-alpha); + for ( int npart : n_partitions ) { + double[] multiplicative = new double[npart]; + double[] equal = new double[npart]; + double remaining_log = 0.0; // realspace = 1 + for ( int i = 0 ; i < npart-1; i++ ) { + equal[i] = -Math.log10(npart); + double piece = remaining_log + log_alpha; // take a*remaining, leaving remaining-a*remaining = (1-a)*remaining + multiplicative[i] = piece; + remaining_log = remaining_log + log_oneMinusAlpha; + } + equal[npart-1] = -Math.log10(npart); + multiplicative[npart-1] = remaining_log; + Assert.assertEquals(MathUtils.approximateLog10SumLog10(equal),0.0,requiredPrecision,String.format("Did not sum to one: k=%d equal partitions.",npart)); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(multiplicative),0.0,requiredPrecision, String.format("Did not sum to one: k=%d multiplicative partitions with alpha=%f",npart,alpha)); + } + } + } + + @Test + public void testLog10sumLog10() { + final double requiredPrecision = 1E-14; + + final double log3 = 0.477121254719662; + Assert.assertEquals(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}), log3, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0), log3, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}, 0, 3), log3, requiredPrecision); + + final double log2 = 0.301029995663981; + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 2), log2, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 
1), 0.0, requiredPrecision); + + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0}), 0.0, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-5.15}), -5.15, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {130.0}), 130.0, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.145}), -0.145, requiredPrecision); + + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.12345, 
-0.23456}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), requiredPrecision); + + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, -1.0, -2.5}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + 
Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), requiredPrecision); + + // magnitude of the sum doesn't matter, so we can combinatorially test this via partitions of unity + double[] mult_partitionFactor = new double[]{0.999,0.98,0.95,0.90,0.8,0.5,0.3,0.1,0.05,0.001}; + int[] n_partitions = new int[] {2,4,8,16,32,64,128,256,512,1028}; + for ( double alpha : mult_partitionFactor ) { + double log_alpha = Math.log10(alpha); + double log_oneMinusAlpha = Math.log10(1-alpha); + for ( int npart : n_partitions ) { + double[] multiplicative = new double[npart]; + double[] equal = new double[npart]; + double remaining_log = 0.0; // realspace = 1 + for ( int i = 0 ; i < npart-1; i++ ) { + equal[i] = -Math.log10(npart); + double piece = remaining_log + log_alpha; // take a*remaining, leaving remaining-a*remaining = (1-a)*remaining + multiplicative[i] = piece; + remaining_log = remaining_log + log_oneMinusAlpha; + } + equal[npart-1] = -Math.log10(npart); + multiplicative[npart-1] = remaining_log; + Assert.assertEquals(MathUtils.log10sumLog10(equal),0.0,requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(multiplicative),0.0,requiredPrecision,String.format("Did not sum to one: nPartitions=%d, alpha=%f",npart,alpha)); + } + } + } + + @Test + public void testLogDotProduct() { + Assert.assertEquals(MathUtils.logDotProduct(new double[]{-5.0,-3.0,2.0}, new double[]{6.0,7.0,8.0}),10.0,1e-3); + Assert.assertEquals(MathUtils.logDotProduct(new double[]{-5.0}, new double[]{6.0}),1.0,1e-3); + } + + @Test + public void testNormalDistribution() { + final double requiredPrecision 
= 1E-10; + + final Normal n = new Normal(0.0, 1.0, null); + for( final double mu : new double[]{-5.0, -3.2, -1.5, 0.0, 1.2, 3.0, 5.8977} ) { + for( final double sigma : new double[]{1.2, 3.0, 5.8977} ) { + for( final double x : new double[]{-5.0, -3.2, -1.5, 0.0, 1.2, 3.0, 5.8977} ) { + n.setState(mu, sigma); + Assert.assertEquals(n.pdf(x), MathUtils.normalDistribution(mu, sigma, x), requiredPrecision); + Assert.assertEquals(Math.log10(n.pdf(x)), MathUtils.normalDistributionLog10(mu, sigma, x), requiredPrecision); + } + } + } + } + + @DataProvider(name = "ArrayMinData") + public Object[][] makeArrayMinData() { + List tests = new ArrayList<>(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{Arrays.asList(10), 10}); + tests.add(new Object[]{Arrays.asList(-10), -10}); + + for ( final List values : Utils.makePermutations(Arrays.asList(1,2,3), 3, false) ) { + tests.add(new Object[]{values, 1}); + } + + for ( final List values : Utils.makePermutations(Arrays.asList(1,2,-3), 3, false) ) { + tests.add(new Object[]{values, -3}); + } + + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ArrayMinData") + public void testArrayMinList(final List values, final int expected) { + final int actual = MathUtils.arrayMin(values); + Assert.assertEquals(actual, expected, "Failed with " + values); + } + + @Test(dataProvider = "ArrayMinData") + public void testArrayMinIntArray(final List values, final int expected) { + final int[] asArray = ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])); + final int actual = MathUtils.arrayMin(asArray); + Assert.assertEquals(actual, expected, "Failed with " + values); + } + + @Test(dataProvider = "ArrayMinData") + public void testArrayMinByteArray(final List values, final int expected) { + final byte[] asArray = new byte[values.size()]; + for ( int i = 0; i < values.size(); i++ ) asArray[i] = (byte)(values.get(i) & 0xFF); + final 
byte actual = MathUtils.arrayMin(asArray); + Assert.assertEquals(actual, (byte)(expected & 0xFF), "Failed with " + values); + } + + @Test(dataProvider = "ArrayMinData") + public void testArrayMinDoubleArray(final List values, final int expected) { + final double[] asArray = new double[values.size()]; + for ( int i = 0; i < values.size(); i++ ) asArray[i] = (double)(values.get(i)); + final double actual = MathUtils.arrayMin(asArray); + Assert.assertEquals(actual, (double)expected, "Failed with " + values); + } + + @DataProvider(name = "MedianData") + public Object[][] makeMedianData() { + final List tests = new ArrayList<>(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{Arrays.asList(10), 10}); + tests.add(new Object[]{Arrays.asList(1, 10), 10}); + + for ( final List values : Utils.makePermutations(Arrays.asList(1,2,-3), 3, false) ) { + tests.add(new Object[]{values, 1}); + } + + for ( final List values : Utils.makePermutations(Arrays.asList(1.1,2.1,-3.1), 3, false) ) { + tests.add(new Object[]{values, 1.1}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MedianData") + public void testMedian(final List values, final Comparable expected) { + final Comparable actual = MathUtils.median(values); + Assert.assertEquals(actual, expected, "Failed with " + values); + } + + + + // man. All this to test dirichlet. + + private double[] unwrap(List stuff) { + double[] unwrapped = new double[stuff.size()]; + int idx = 0; + for ( Double d : stuff ) { + unwrapped[idx++] = d == null ? 0.0 : d; + } + + return unwrapped; + } + + /** + * The PartitionGenerator generates all of the partitions of a number n, e.g. 
+ * 5 + 0 + * 4 + 1 + * 3 + 2 + * 3 + 1 + 1 + * 2 + 2 + 1 + * 2 + 1 + 1 + 1 + * 1 + 1 + 1 + 1 + 1 + * + * This is used to help enumerate the state space over which the Dirichlet-Multinomial is defined, + * to ensure that the distribution function is properly implemented + */ + class PartitionGenerator implements Iterator> { + // generate the partitions of an integer, each partition sorted numerically + int n; + List a; + + int y; + int k; + int state; + + int x; + int l; + + public PartitionGenerator(int n) { + this.n = n; + this.y = n - 1; + this.k = 1; + this.a = new ArrayList<>(); + for ( int i = 0; i < n; i++ ) { + this.a.add(i); + } + this.state = 0; + } + + public void remove() { /* do nothing */ } + + public boolean hasNext() { return ! ( this.k == 0 && state == 0 ); } + + private String dataStr() { + return String.format("a = [%s] k = %d y = %d state = %d x = %d l = %d", + Utils.join(",",a), k, y, state, x, l); + } + + public List next() { + if ( this.state == 0 ) { + this.x = a.get(k-1)+1; + k -= 1; + this.state = 1; + } + + if ( this.state == 1 ) { + while ( 2 * x <= y ) { + this.a.set(k,x); + this.y -= (int) x; + this.k++; + } + this.l = 1+this.k; + this.state = 2; + } + + if ( this.state == 2 ) { + if ( x <= y ) { + this.a.set(k,x); + this.a.set(l,y); + x += 1; + y -= 1; + return this.a.subList(0, this.k + 2); + } else { + this.state =3; + } + } + + if ( this.state == 3 ) { + this.a.set(k,x+y); + this.y = x + y - 1; + this.state = 0; + return a.subList(0, k + 1); + } + + throw new IllegalStateException("Cannot get here"); + } + + public String toString() { + final StringBuilder buf = new StringBuilder(); + buf.append("{ "); + while ( hasNext() ) { + buf.append("["); + buf.append(Utils.join(",",next())); + buf.append("],"); + } + buf.deleteCharAt(buf.lastIndexOf(",")); + buf.append(" }"); + return buf.toString(); + } + + } + + /** + * NextCounts is the enumerator over the state space of the multinomial dirichlet. 
+ * + * It filters the partition of the total sum to only those with a number of terms + * equal to the number of categories. + * + * It then generates all permutations of that partition. + * + * In so doing it enumerates over the full state space. + */ + class NextCounts implements Iterator { + + private PartitionGenerator partitioner; + private int numCategories; + private int[] next; + + public NextCounts(int numCategories, int totalCounts) { + partitioner = new PartitionGenerator(totalCounts); + this.numCategories = numCategories; + next = nextFromPartitioner(); + } + + public void remove() { /* do nothing */ } + + public boolean hasNext() { return next != null; } + + public int[] next() { + int[] toReturn = clone(next); + next = nextPermutation(); + if ( next == null ) { + next = nextFromPartitioner(); + } + + return toReturn; + } + + private int[] clone(int[] arr) { + return Arrays.copyOf(arr, arr.length); + } + + private int[] nextFromPartitioner() { + if ( partitioner.hasNext() ) { + List nxt = partitioner.next(); + while ( partitioner.hasNext() && nxt.size() > numCategories ) { + nxt = partitioner.next(); + } + + if ( nxt.size() > numCategories ) { + return null; + } else { + int[] buf = new int[numCategories]; + for ( int idx = 0; idx < nxt.size(); idx++ ) { + buf[idx] = nxt.get(idx); + } + Arrays.sort(buf); + return buf; + } + } + + return null; + } + + public int[] nextPermutation() { + return MathUtilsUnitTest.nextPermutation(next); + } + + } + + public static int[] nextPermutation(int[] next) { + // the counts can swap among each other. 
The int[] is originally in ascending order + // this generates the next array in lexicographic order descending + + // locate the last occurrence where next[k] < next[k+1] + int gt = -1; + for ( int idx = 0; idx < next.length-1; idx++) { + if ( next[idx] < next[idx+1] ) { + gt = idx; + } + } + + if ( gt == -1 ) { + return null; + } + + int largestLessThan = gt+1; + for ( int idx = 1 + largestLessThan; idx < next.length; idx++) { + if ( next[gt] < next[idx] ) { + largestLessThan = idx; + } + } + + int val = next[gt]; + next[gt] = next[largestLessThan]; + next[largestLessThan] = val; + + // reverse the tail of the array + int[] newTail = new int[next.length-gt-1]; + int ctr = 0; + for ( int idx = next.length-1; idx > gt; idx-- ) { + newTail[ctr++] = next[idx]; + } + + for ( int idx = 0; idx < newTail.length; idx++) { + next[gt+idx+1] = newTail[idx]; + } + + return next; + } + + + // before testing the dirichlet multinomial, we need to test the + // classes used to test the dirichlet multinomial + + @Test + public void testPartitioner() { + int[] numsToTest = new int[]{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20}; + int[] expectedSizes = new int[]{1, 2, 3, 5, 7, 11, 15, 22, 30, 42, 56, 77, 101, 135, 176, 231, 297, 385, 490, 627}; + for ( int testNum = 0; testNum < numsToTest.length; testNum++ ) { + PartitionGenerator gen = new PartitionGenerator(numsToTest[testNum]); + int size = 0; + while ( gen.hasNext() ) { + logger.debug(gen.dataStr()); + size += 1; + gen.next(); + } + Assert.assertEquals(size,expectedSizes[testNum], + String.format("Expected %d partitions, observed %s",expectedSizes[testNum],new PartitionGenerator(numsToTest[testNum]).toString())); + } + } + + @Test + public void testNextPermutation() { + int[] arr = new int[]{1,2,3,4}; + int[][] gens = new int[][] { + new int[]{1,2,3,4}, + new int[]{1,2,4,3}, + new int[]{1,3,2,4}, + new int[]{1,3,4,2}, + new int[]{1,4,2,3}, + new int[]{1,4,3,2}, + new int[]{2,1,3,4}, + new int[]{2,1,4,3}, + new 
int[]{2,3,1,4}, + new int[]{2,3,4,1}, + new int[]{2,4,1,3}, + new int[]{2,4,3,1}, + new int[]{3,1,2,4}, + new int[]{3,1,4,2}, + new int[]{3,2,1,4}, + new int[]{3,2,4,1}, + new int[]{3,4,1,2}, + new int[]{3,4,2,1}, + new int[]{4,1,2,3}, + new int[]{4,1,3,2}, + new int[]{4,2,1,3}, + new int[]{4,2,3,1}, + new int[]{4,3,1,2}, + new int[]{4,3,2,1} }; + for ( int gen = 0; gen < gens.length; gen ++ ) { + for ( int idx = 0; idx < 3; idx++ ) { + Assert.assertEquals(arr[idx],gens[gen][idx], + String.format("Error at generation %d, expected %s, observed %s",gen,Arrays.toString(gens[gen]),Arrays.toString(arr))); + } + arr = nextPermutation(arr); + } + } + + private double[] addEpsilon(double[] counts) { + double[] d = new double[counts.length]; + for ( int i = 0; i < counts.length; i ++ ) { + d[i] = counts[i] + 1e-3; + } + return d; + } + + @Test + public void testDirichletMultinomial() { + List testAlleles = Arrays.asList( + new double[]{80,240}, + new double[]{1,10000}, + new double[]{0,500}, + new double[]{5140,20480}, + new double[]{5000,800,200}, + new double[]{6,3,1000}, + new double[]{100,400,300,800}, + new double[]{8000,100,20,80,2}, + new double[]{90,20000,400,20,4,1280,720,1} + ); + + Assert.assertTrue(! Double.isInfinite(MathUtils.log10Gamma(1e-3)) && ! Double.isNaN(MathUtils.log10Gamma(1e-3))); + + int[] numAlleleSampled = new int[]{2,5,10,20,25}; + for ( double[] alleles : testAlleles ) { + for ( int count : numAlleleSampled ) { + // test that everything sums to one. Generate all multinomial draws + List likelihoods = new ArrayList<>(100000); + NextCounts generator = new NextCounts(alleles.length,count); + double maxLog = Double.MIN_VALUE; + //List countLog = new ArrayList(200); + while ( generator.hasNext() ) { + int[] thisCount = generator.next(); + //countLog.add(Arrays.toString(thisCount)); + Double likelihood = MathUtils.dirichletMultinomial(addEpsilon(alleles),thisCount); + Assert.assertTrue(! Double.isNaN(likelihood) && ! 
Double.isInfinite(likelihood), + String.format("Likelihood for counts %s and nAlleles %d was %s", + Arrays.toString(thisCount),alleles.length,Double.toString(likelihood))); + if ( likelihood > maxLog ) + maxLog = likelihood; + likelihoods.add(likelihood); + } + //System.out.printf("%d likelihoods and max is (probability) %e\n",likelihoods.size(),Math.pow(10,maxLog)); + Assert.assertEquals(MathUtils.sumLog10(unwrap(likelihoods)),1.0,1e-7, + String.format("Counts %d and alleles %d have nLikelihoods %d. \n Counts: %s", + count,alleles.length,likelihoods.size(), "NODEBUG"/*,countLog*/)); + } + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/MedianUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MedianUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/MedianUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MedianUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/NGSPlatformUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/NGSPlatformUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/NGSPlatformUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/NGSPlatformUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/PathUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/PathUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/PathUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/PathUtilsUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/QualityUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/QualityUtilsUnitTest.java new file mode 100644 index 000000000..c8cbeeaf2 --- 
/dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/QualityUtilsUnitTest.java @@ -0,0 +1,189 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils; + +/** + * Created by IntelliJ IDEA. 
+ * User: rpoplin + * Date: 3/21/12 + */ + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + +/** + * Basic unit test for QualityUtils class + */ +public class QualityUtilsUnitTest extends BaseTest { + final private static double TOLERANCE = 1e-9; + + @BeforeClass + public void init() { + } + + @DataProvider(name = "QualTest") + public Object[][] makeMyDataProvider() { + final List tests = new ArrayList<>(); + + for ( int qual = 0; qual < 255; qual++ ) { + tests.add(new Object[]{(byte)(qual & 0xFF), Math.pow(10.0, ((double)qual)/-10.0)}); + } + + return tests.toArray(new Object[][]{}); + } + + /** + * Example testng test using MyDataProvider + */ + @Test(dataProvider = "QualTest") + public void testMyData(final byte qual, final double errorRate) { + final double trueRate = 1 - errorRate; + + final double actualErrorRate = QualityUtils.qualToErrorProb(qual); + Assert.assertEquals(actualErrorRate, errorRate, TOLERANCE); + final double actualTrueRate = QualityUtils.qualToProb(qual); + Assert.assertEquals(actualTrueRate, trueRate, TOLERANCE); + + // log10 tests + final double actualLog10ErrorRate = QualityUtils.qualToErrorProbLog10(qual); + Assert.assertEquals(actualLog10ErrorRate, Math.log10(errorRate), TOLERANCE); + final double actualLog10TrueRate = QualityUtils.qualToProbLog10(qual); + Assert.assertEquals(actualLog10TrueRate, Math.log10(trueRate), TOLERANCE); + + // test that we can convert our error rates to quals, accounting for boundaries + final int expectedQual = Math.max(Math.min(qual & 0xFF, QualityUtils.MAX_SAM_QUAL_SCORE), 1); + final byte actualQual = QualityUtils.trueProbToQual(trueRate); + Assert.assertEquals(actualQual, expectedQual & 0xFF); + final byte actualQualFromErrorRate = QualityUtils.errorProbToQual(errorRate); + 
Assert.assertEquals(actualQualFromErrorRate, expectedQual & 0xFF); + + for ( int maxQual = 10; maxQual < QualityUtils.MAX_SAM_QUAL_SCORE; maxQual++ ) { + final byte maxAsByte = (byte)(maxQual & 0xFF); + final byte expectedQual2 = (byte)(Math.max(Math.min(qual & 0xFF, maxQual), 1) & 0xFF); + final byte actualQual2 = QualityUtils.trueProbToQual(trueRate, maxAsByte); + Assert.assertEquals(actualQual2, expectedQual2, "Failed with max " + maxQual); + final byte actualQualFromErrorRate2 = QualityUtils.errorProbToQual(errorRate, maxAsByte); + Assert.assertEquals(actualQualFromErrorRate2, expectedQual2, "Failed with max " + maxQual); + + // test the integer routines + final byte actualQualInt2 = QualityUtils.trueProbToQual(trueRate, maxQual); + Assert.assertEquals(actualQualInt2, expectedQual2, "Failed with max " + maxQual); + final byte actualQualFromErrorRateInt2 = QualityUtils.errorProbToQual(errorRate, maxQual); + Assert.assertEquals(actualQualFromErrorRateInt2, expectedQual2, "Failed with max " + maxQual); + } + } + + @Test + public void testTrueProbWithMinDouble() { + final byte actual = QualityUtils.trueProbToQual(Double.MIN_VALUE); + Assert.assertEquals(actual, 1, "Failed to convert true prob of min double to 1 qual"); + } + + @Test + public void testTrueProbWithVerySmallValue() { + final byte actual = QualityUtils.trueProbToQual(1.7857786272673852E-19); + Assert.assertEquals(actual, 1, "Failed to convert true prob of very small value 1.7857786272673852E-19 to 1 qual"); + } + + @Test + public void testQualCaches() { + Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 20), 0.01, 1e-6); + Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 20), -2.0, 1e-6); + Assert.assertEquals(QualityUtils.qualToProb((byte) 20), 0.99, 1e-6); + Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 20), -0.0043648054, 1e-6); + + Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 30), 0.001, 1e-6); + Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 
30), -3.0, 1e-6); + Assert.assertEquals(QualityUtils.qualToProb((byte) 30), 0.999, 1e-6); + Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 30), -0.000434511774, 1e-6); + + Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 40), 0.0001, 1e-6); + Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 40), -4.0, 1e-6); + Assert.assertEquals(QualityUtils.qualToProb((byte) 40), 0.9999, 1e-6); + Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 40), -4.34316198e-5, 1e-6); + } + + @Test() + public void testBoundingDefault() { + for ( int qual = 0; qual < 1000; qual++ ) { + final byte expected = (byte)Math.max(Math.min(qual, QualityUtils.MAX_SAM_QUAL_SCORE), 1); + Assert.assertEquals(QualityUtils.boundQual(qual), expected); + } + } + + @Test() + public void testBoundingWithMax() { + for ( int max = 10; max < 255; max += 50 ) { + for ( int qual = 0; qual < 1000; qual++ ) { + final int expected = Math.max(Math.min(qual, max), 1); + Assert.assertEquals(QualityUtils.boundQual(qual, (byte)(max & 0xFF)) & 0xFF, expected & 0xFF, "qual " + qual + " max " + max); + } + } + } + + @DataProvider(name = "PhredScaleDoubleOps") + public Object[][] makePhredDoubleTest() { + final List tests = new ArrayList<>(); + + tests.add(new Object[]{0.0, -10 * Math.log10(Double.MIN_VALUE)}); + tests.add(new Object[]{1.0, 0.0}); + for ( int pow = 1; pow < 20; pow++ ) { + tests.add(new Object[]{Math.pow(10.0, -1.0 * pow), pow * 10}); + tests.add(new Object[]{Math.pow(10.0, -1.5 * pow), pow * 15}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test() + public void testQualToErrorProbDouble() { + for ( double qual = 3.0; qual < 255.0; qual += 0.1 ) { + final double expected = Math.pow(10.0, qual / -10.0); + Assert.assertEquals(QualityUtils.qualToErrorProb(qual), expected, TOLERANCE, "failed qual->error prob for double qual " + qual); + } + } + + + @Test(dataProvider = "PhredScaleDoubleOps") + public void testPhredScaleDoubleOps(final double errorRate, final double 
expectedPhredScaled) { + final double actualError = QualityUtils.phredScaleErrorRate(errorRate); + Assert.assertEquals(actualError, expectedPhredScaled, TOLERANCE); + final double trueRate = 1 - errorRate; + final double actualTrue = QualityUtils.phredScaleCorrectRate(trueRate); + if ( trueRate == 1.0 ) { + Assert.assertEquals(actualTrue, QualityUtils.MIN_PHRED_SCALED_QUAL); + } else { + final double tol = errorRate < 1e-10 ? 10.0 : 1e-3; + Assert.assertEquals(actualTrue, expectedPhredScaled, tol); + } + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/R/RScriptExecutorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/R/RScriptExecutorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/R/RScriptExecutorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/R/RScriptExecutorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/R/RScriptLibraryUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/R/RScriptLibraryUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/R/RScriptLibraryUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/R/RScriptLibraryUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/SequenceDictionaryUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/SequenceDictionaryUtilsUnitTest.java similarity index 100% rename from 
public/java/test/org/broadinstitute/sting/utils/SequenceDictionaryUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/SequenceDictionaryUtilsUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java new file mode 100644 index 000000000..85b79a00f --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java @@ -0,0 +1,179 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.lang.reflect.Field; + +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.TimeUnit; + +public class SimpleTimerUnitTest extends BaseTest { + private final static String NAME = "unit.test.timer"; + + @Test + public void testSimpleTimer() { + SimpleTimer t = new SimpleTimer(NAME); + Assert.assertEquals(t.getName(), NAME, "Name is not the provided one"); + Assert.assertFalse(t.isRunning(), "Initial state of the timer is running"); + Assert.assertEquals(t.getElapsedTime(), 0.0, "New timer elapsed time should be 0"); + Assert.assertEquals(t.getElapsedTimeNano(), 0l, "New timer elapsed time nano should be 0"); + + t.start(); + Assert.assertTrue(t.isRunning(), "Started timer isn't running"); + Assert.assertTrue(t.getElapsedTime() >= 0.0, "Elapsed time should be >= 0"); + Assert.assertTrue(t.getElapsedTimeNano() >= 0.0, "Elapsed time nano should be >= 0"); + long n1 = t.getElapsedTimeNano(); + double t1 = t.getElapsedTime(); + idleLoop(); // idle loop to wait a tiny bit of time + long n2 = t.getElapsedTimeNano(); + double t2 = t.getElapsedTime(); + Assert.assertTrue(t2 >= t1, "T2 >= T1 for a running time"); + Assert.assertTrue(n2 >= n1, "T2 >= T1 nano for a running time"); + + t.stop(); + Assert.assertFalse(t.isRunning(), "Stopped timer still running"); + long n3 = t.getElapsedTimeNano(); + double t3 = t.getElapsedTime(); + idleLoop(); // idle loop to wait a tiny bit of time + double t4 = t.getElapsedTime(); + long n4 = t.getElapsedTimeNano(); + Assert.assertTrue(t4 == t3, "Elapsed times for two calls of stop timer not the same"); + Assert.assertTrue(n4 == n3, "Elapsed times for two calls of stop timer not the same"); + + t.restart(); + idleLoop(); // idle loop to wait a tiny bit of time + double t5 = t.getElapsedTime(); + long n5 = t.getElapsedTimeNano(); + 
Assert.assertTrue(t.isRunning(), "Restarted timer should be running"); + idleLoop(); // idle loop to wait a tiny bit of time + double t6 = t.getElapsedTime(); + long n6 = t.getElapsedTimeNano(); + Assert.assertTrue(t5 >= t4, "Restarted timer elapsed time should be after elapsed time preceding the restart"); + Assert.assertTrue(t6 >= t5, "Second elapsed time not after the first in restarted timer"); + Assert.assertTrue(n5 >= n4, "Restarted timer elapsed time nano should be after elapsed time preceding the restart"); + Assert.assertTrue(n6 >= n5, "Second elapsed time nano not after the first in restarted timer"); + + final List secondTimes = Arrays.asList(t1, t2, t3, t4, t5, t6); + final List nanoTimes = Arrays.asList(n1, n2, n3, n4, n5, n6); + for ( int i = 0; i < nanoTimes.size(); i++ ) + Assert.assertEquals( + SimpleTimer.nanoToSecondsAsDouble(nanoTimes.get(i)), + secondTimes.get(i), 1e-1, "Nanosecond and second timer disagree"); + } + + @Test + public void testNanoResolution() { + SimpleTimer t = new SimpleTimer(NAME); + + // test the nanosecond resolution + long n7 = t.currentTimeNano(); + int sum = 0; + for ( int i = 0; i < 100; i++) sum += i; + long n8 = t.currentTimeNano(); + final long delta = n8 - n7; + final long oneMilliInNano = TimeUnit.MILLISECONDS.toNanos(1); + logger.warn("nanoTime before nano operation " + n7); + logger.warn("nanoTime after nano operation of summing 100 ints " + n8 + ", sum = " + sum + " time delta " + delta + " vs. 
1 millsecond in nano " + oneMilliInNano); + Assert.assertTrue(n8 > n7, "SimpleTimer doesn't appear to have nanoSecond resolution: n8 " + n8 + " <= n7 " + n7); + Assert.assertTrue(delta < oneMilliInNano, + "SimpleTimer doesn't appear to have nanoSecond resolution: time delta is " + delta + " vs 1 millisecond in nano " + oneMilliInNano); + } + + @Test + public void testMeaningfulTimes() { + SimpleTimer t = new SimpleTimer(NAME); + + t.start(); + for ( int i = 0; i < 100; i++ ) ; + long nano = t.getElapsedTimeNano(); + double secs = t.getElapsedTime(); + + Assert.assertTrue(secs > 0, "Seconds timer doesn't appear to count properly: elapsed time is " + secs); + Assert.assertTrue(secs < 0.01, "Fast operation said to take longer than 10 milliseconds: elapsed time in seconds " + secs); + + Assert.assertTrue(nano > 0, "Nanosecond timer doesn't appear to count properly: elapsed time is " + nano); + final long maxTimeInMicro = 10000; + final long maxTimeInNano = TimeUnit.MICROSECONDS.toNanos(maxTimeInMicro); + Assert.assertTrue(nano < maxTimeInNano, "Fast operation said to take longer than " + maxTimeInMicro + " microseconds: elapsed time in nano " + nano + " micro " + TimeUnit.NANOSECONDS.toMicros(nano)); + } + + @Test + public void testCheckpointRestart() throws Exception { + SimpleTimer t = new SimpleTimer(NAME); + + final Field offsetField = t.getClass().getDeclaredField("nanoTimeOffset"); + offsetField.setAccessible(true); + long offset = ((Long) offsetField.get(t)).longValue(); + + t.start(); + idleLoop(); + // Make it as if clock has jumped into the past + offsetField.set(t, offset + TimeUnit.SECONDS.toNanos(10)); + t.stop(); + offset = ((Long) offsetField.get(t)).longValue(); + Assert.assertEquals(t.getElapsedTime(), 0.0, "Time over restart is not zero."); + + t.start(); + idleLoop(); + t.stop(); + offset = ((Long) offsetField.get(t)).longValue(); + double elapsed = t.getElapsedTime(); + Assert.assertTrue(elapsed >= 0.0, "Elapsed time is zero."); + t.restart(); + // 
Make the clock jump again by just a little + offsetField.set(t, offset + TimeUnit.SECONDS.toNanos(1)); + idleLoop(); + t.stop(); + offset = ((Long) offsetField.get(t)).longValue(); + Assert.assertTrue(t.getElapsedTime() > elapsed, "Small clock drift causing reset."); + elapsed = t.getElapsedTime(); + // Now a bigger jump, into the future this time. + t.restart(); + // Make the clock jump again by a lot + offsetField.set(t, offset - TimeUnit.SECONDS.toNanos(10)); + t.stop(); + Assert.assertEquals(t.getElapsedTime(), elapsed, "Time added over checkpoint/restart."); + + // Test without stopping + t.start(); + offset = ((Long) offsetField.get(t)).longValue(); + // Make it as if clock has jumped into the past + offsetField.set(t, offset + TimeUnit.SECONDS.toNanos(10)); + Assert.assertEquals(t.getElapsedTime(), 0.0, "Elapsed time after C/R is not zero."); + idleLoop(); + Assert.assertTrue(t.getElapsedTime() > 0.0, "Elapsed time zero after re-sync."); + + } + + private static void idleLoop() { + for ( int i = 0; i < 100000; i++ ) ; // idle loop to wait a tiny bit of time + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/UtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/UtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileStateUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/activeregion/ActivityProfileStateUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileStateUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/activeregion/ActivityProfileStateUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/baq/BAQUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/baq/BAQUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/classloader/JVMUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/classloader/JVMUtilsUnitTest.java similarity index 
100% rename from public/java/test/org/broadinstitute/sting/utils/classloader/JVMUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/classloader/JVMUtilsUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java new file mode 100644 index 000000000..6dd4d8104 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java @@ -0,0 +1,144 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.clipping; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.TextCigarCodec; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.CigarUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; + +import java.util.LinkedList; +import java.util.List; +import java.util.Stack; + +public class ReadClipperTestUtils { + //Should contain all the utils needed for tests to mass produce + //reads, cigars, and other needed classes + + final static byte [] BASES = {'A', 'C', 'T', 'G'}; + final static byte [] QUALS = {2, 15, 25, 30}; + final static String CIGAR = "4M"; + final static CigarElement[] cigarElements = { new CigarElement(1, CigarOperator.HARD_CLIP), + new CigarElement(1, CigarOperator.SOFT_CLIP), + new CigarElement(1, CigarOperator.INSERTION), + new CigarElement(1, CigarOperator.DELETION), + new CigarElement(1, CigarOperator.MATCH_OR_MISMATCH)}; + + + public static GATKSAMRecord makeReadFromCigar(Cigar cigar) { + return ArtificialSAMUtils.createArtificialRead(Utils.arrayFromArrayWithLength(BASES, cigar.getReadLength()), Utils.arrayFromArrayWithLength(QUALS, cigar.getReadLength()), cigar.toString()); + } + + public static GATKSAMRecord makeReadFromCigar(String cigarString) { + return makeReadFromCigar(CigarUtils.cigarFromString(cigarString)); + } + + public static List generateCigarList(int maximumLength) { + return generateCigarList(maximumLength, cigarElements); + } + + /** + * This function generates every valid permutation of cigar strings (with a given set of cigarElement) with a given length. + * + * A valid cigar object obeys the following rules: + * - No Hard/Soft clips in the middle of the read + * - No deletions in the beginning / end of the read + * - No repeated adjacent element (e.g. 
1M2M -> this should be 3M) + * - No consecutive I/D elements + * + * @param maximumLength the maximum number of elements in the cigar + * @return a list with all valid Cigar objects + */ + public static List generateCigarList(int maximumLength, CigarElement[] cigarElements) { + int numCigarElements = cigarElements.length; + LinkedList cigarList = new LinkedList(); + byte [] cigarCombination = new byte[maximumLength]; + + Utils.fillArrayWithByte(cigarCombination, (byte) 0); // we start off with all 0's in the combination array. + int currentIndex = 0; + while (true) { + Cigar cigar = createCigarFromCombination(cigarCombination, cigarElements); // create the cigar + cigar = CigarUtils.combineAdjacentCigarElements(cigar); // combine adjacent elements + if (CigarUtils.isCigarValid(cigar)) { // check if it's valid + cigarList.add(cigar); // add it + } + + boolean currentIndexChanged = false; + while (currentIndex < maximumLength && cigarCombination[currentIndex] == numCigarElements - 1) { + currentIndex++; // find the next index to increment + currentIndexChanged = true; // keep track of the fact that we have changed indices! + } + + if (currentIndex == maximumLength) // if we hit the end of the array, we're done. + break; + + cigarCombination[currentIndex]++; // otherwise advance the current index + + if (currentIndexChanged) { // if we have changed index, then... 
+ for (int i = 0; i < currentIndex; i++) + cigarCombination[i] = 0; // reset everything from 0->currentIndex + currentIndex = 0; // go back to the first index + } + } + + return cigarList; + } + + private static Cigar createCigarFromCombination(byte[] cigarCombination, CigarElement[] cigarElements) { + Cigar cigar = new Cigar(); + for (byte i : cigarCombination) { + cigar.add(cigarElements[i]); + } + return cigar; + } + + public static GATKSAMRecord makeRead() { + return ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR); + } + + /** + * Asserts that the two reads have the same bases, qualities and cigar strings + * + * @param actual the calculated read + * @param expected the expected read + */ + public static void assertEqualReads(GATKSAMRecord actual, GATKSAMRecord expected) { + // If they're both not empty, test their contents + if(!actual.isEmpty() && !expected.isEmpty()) { + Assert.assertEquals(actual.getReadBases(), expected.getReadBases()); + Assert.assertEquals(actual.getBaseQualities(), expected.getBaseQualities()); + Assert.assertEquals(actual.getCigarString(), expected.getCigarString()); + } + // Otherwise test if they're both empty + else + Assert.assertEquals(actual.isEmpty(), expected.isEmpty()); + } +} diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java new file mode 100644 index 000000000..cd12c3b9b --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -0,0 +1,421 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, 
sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.clipping; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.sam.CigarUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; + +/** + * User: roger + * Date: 9/28/11 + */ +public class ReadClipperUnitTest extends BaseTest { + private final static boolean DEBUG = false; + + List cigarList; + int maximumCigarSize = 10; // 6 is the minimum necessary number to try all combinations of cigar types with guarantee of clipping an element with length = 2 + + @BeforeClass + public void init() { + cigarList = ReadClipperTestUtils.generateCigarList(maximumCigarSize); + } + + @Test(enabled = !DEBUG) + public void testHardClipBothEndsByReferenceCoordinates() { + for (Cigar cigar : cigarList) { + GATKSAMRecord read = 
ReadClipperTestUtils.makeReadFromCigar(cigar); + int alnStart = read.getAlignmentStart(); + int alnEnd = read.getAlignmentEnd(); + int readLength = alnStart - alnEnd; + for (int i = 0; i < readLength / 2; i++) { + GATKSAMRecord clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, alnStart + i, alnEnd - i); + Assert.assertTrue(clippedRead.getAlignmentStart() >= alnStart + i, String.format("Clipped alignment start is less than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString())); + Assert.assertTrue(clippedRead.getAlignmentEnd() <= alnEnd + i, String.format("Clipped alignment end is greater than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString())); + assertUnclippedLimits(read, clippedRead); + } + } + } + + @Test(enabled = !DEBUG) + public void testHardClipByReadCoordinates() { + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + int readLength = read.getReadLength(); + for (int i = 0; i < readLength; i++) { + GATKSAMRecord clipLeft = ReadClipper.hardClipByReadCoordinates(read, 0, i); + Assert.assertTrue(clipLeft.getReadLength() <= readLength - i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipLeft.getCigarString())); + assertUnclippedLimits(read, clipLeft); + + GATKSAMRecord clipRight = ReadClipper.hardClipByReadCoordinates(read, i, readLength - 1); + Assert.assertTrue(clipRight.getReadLength() <= i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipRight.getCigarString())); + assertUnclippedLimits(read, clipRight); + } + } + } + + @DataProvider(name = "ClippedReadLengthData") + public Object[][] makeClippedReadLengthData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + 
final int originalReadLength = 50; + for ( int nToClip = 1; nToClip < originalReadLength - 1; nToClip++ ) { + tests.add(new Object[]{originalReadLength, nToClip}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ClippedReadLengthData", enabled = !DEBUG) + public void testHardClipReadLengthIsRight(final int originalReadLength, final int nToClip) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(originalReadLength + "M"); + read.getReadLength(); // provoke the caching of the read length + final int expectedReadLength = originalReadLength - nToClip; + GATKSAMRecord clipped = ReadClipper.hardClipByReadCoordinates(read, 0, nToClip - 1); + Assert.assertEquals(clipped.getReadLength(), expectedReadLength, + String.format("Clipped read length %d with cigar %s not equal to the expected read length %d after clipping %d bases from the left from a %d bp read with cigar %s", + clipped.getReadLength(), clipped.getCigar(), expectedReadLength, nToClip, read.getReadLength(), read.getCigar())); + } + + @Test(enabled = !DEBUG) + public void testHardClipByReferenceCoordinates() { + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + int start = read.getSoftStart(); + int stop = read.getSoftEnd(); + + for (int i = start; i <= stop; i++) { + GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, i); + if (!clipLeft.isEmpty()) { + Assert.assertTrue(clipLeft.getAlignmentStart() >= Math.min(read.getAlignmentEnd(), i + 1), String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + assertUnclippedLimits(read, clipLeft); + } + + GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, -1); + if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft 
clip now. We can't test those. + Assert.assertTrue(clipRight.getAlignmentEnd() <= Math.max(read.getAlignmentStart(), i - 1), String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); + assertUnclippedLimits(read, clipRight); + } + } + } + } + + @Test(enabled = !DEBUG) + public void testHardClipByReferenceCoordinatesLeftTail() { + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + int alnStart = read.getAlignmentStart(); + int alnEnd = read.getAlignmentEnd(); + if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side + for (int i = alnStart; i <= alnEnd; i++) { + GATKSAMRecord clipLeft = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, i); + + if (!clipLeft.isEmpty()) { + Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + assertUnclippedLimits(read, clipLeft); + } + } + } + } + } + + @Test(enabled = !DEBUG) + public void testHardClipByReferenceCoordinatesRightTail() { + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + int alnStart = read.getAlignmentStart(); + int alnEnd = read.getAlignmentEnd(); + if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side + for (int i = alnStart; i <= alnEnd; i++) { + GATKSAMRecord clipRight = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, i); + if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. 
+ Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); + assertUnclippedLimits(read, clipRight); + } + } + } + } + } + + @Test(enabled = !DEBUG) + public void testHardClipLowQualEnds() { + final byte LOW_QUAL = 2; + final byte HIGH_QUAL = 30; + + /** create a read for every cigar permutation */ + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + int readLength = read.getReadLength(); + byte[] quals = new byte[readLength]; + + for (int nLowQualBases = 0; nLowQualBases < readLength; nLowQualBases++) { + + /** create a read with nLowQualBases in the left tail */ + Utils.fillArrayWithByte(quals, HIGH_QUAL); + for (int addLeft = 0; addLeft < nLowQualBases; addLeft++) + quals[addLeft] = LOW_QUAL; + read.setBaseQualities(quals); + GATKSAMRecord clipLeft = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); + checkClippedReadsForLowQualEnds(read, clipLeft, LOW_QUAL, nLowQualBases); + + /** create a read with nLowQualBases in the right tail */ + Utils.fillArrayWithByte(quals, HIGH_QUAL); + for (int addRight = 0; addRight < nLowQualBases; addRight++) + quals[readLength - addRight - 1] = LOW_QUAL; + read.setBaseQualities(quals); + GATKSAMRecord clipRight = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); + checkClippedReadsForLowQualEnds(read, clipRight, LOW_QUAL, nLowQualBases); + + /** create a read with nLowQualBases on both tails */ + if (nLowQualBases <= readLength / 2) { + Utils.fillArrayWithByte(quals, HIGH_QUAL); + for (int addBoth = 0; addBoth < nLowQualBases; addBoth++) { + quals[addBoth] = LOW_QUAL; + quals[readLength - addBoth - 1] = LOW_QUAL; + } + read.setBaseQualities(quals); + GATKSAMRecord clipBoth = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); + checkClippedReadsForLowQualEnds(read, clipBoth, LOW_QUAL, 2*nLowQualBases); + } + } + } 
+ } + + @Test(enabled = !DEBUG) + public void testHardClipSoftClippedBases() { + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + GATKSAMRecord clippedRead = ReadClipper.hardClipSoftClippedBases(read); + CigarCounter original = new CigarCounter(read); + CigarCounter clipped = new CigarCounter(clippedRead); + + assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed + original.assertHardClippingSoftClips(clipped); // Make sure we have only clipped SOFT_CLIPS + } + } + + @Test(enabled = false) + public void testHardClipLeadingInsertions() { + for (Cigar cigar : cigarList) { + if (startsWithInsertion(cigar)) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + GATKSAMRecord clippedRead = ReadClipper.hardClipLeadingInsertions(read); + + assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed + + int expectedLength = read.getReadLength() - leadingCigarElementLength(read.getCigar(), CigarOperator.INSERTION); + if (cigarHasElementsDifferentThanInsertionsAndHardClips(read.getCigar())) + expectedLength -= leadingCigarElementLength(CigarUtils.invertCigar(read.getCigar()), CigarOperator.INSERTION); + + if (!clippedRead.isEmpty()) { + Assert.assertEquals(expectedLength, clippedRead.getReadLength(), String.format("%s -> %s", read.getCigarString(), clippedRead.getCigarString())); // check that everything else is still there + Assert.assertFalse(startsWithInsertion(clippedRead.getCigar())); // check that the insertions are gone + } else + Assert.assertTrue(expectedLength == 0, String.format("expected length: %d", expectedLength)); // check that the read was expected to be fully clipped + } + } + } + + @Test(enabled = !DEBUG) + public void testRevertSoftClippedBases() { + for (Cigar cigar : cigarList) { + final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP); + final int tailSoftClips = 
leadingCigarElementLength(CigarUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP); + + final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read); + + assertUnclippedLimits(read, unclipped); // Make sure limits haven't changed + + if (leadingSoftClips > 0 || tailSoftClips > 0) { + final int expectedStart = read.getAlignmentStart() - leadingSoftClips; + final int expectedEnd = read.getAlignmentEnd() + tailSoftClips; + + Assert.assertEquals(unclipped.getAlignmentStart(), expectedStart); + Assert.assertEquals(unclipped.getAlignmentEnd(), expectedEnd); + } else + Assert.assertEquals(read.getCigarString(), unclipped.getCigarString()); + } + } + + @Test(enabled = !DEBUG) + public void testRevertSoftClippedBasesWithThreshold() { + for (Cigar cigar : cigarList) { + final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP); + final int tailSoftClips = leadingCigarElementLength(CigarUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP); + + final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read); + + assertUnclippedLimits(read, unclipped); // Make sure limits haven't changed + Assert.assertNull(read.getCigar().isValid(null, -1)); + Assert.assertNull(unclipped.getCigar().isValid(null, -1)); + + if (!(leadingSoftClips > 0 || tailSoftClips > 0)) + Assert.assertEquals(read.getCigarString(), unclipped.getCigarString()); + + } + } + + @DataProvider(name = "RevertSoftClipsBeforeContig") + public Object[][] makeRevertSoftClipsBeforeContig() { + List tests = new ArrayList<>(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + for ( int softStart : Arrays.asList(-10, -1, 0) ) { + for ( int alignmentStart : Arrays.asList(1, 10) ) { + tests.add(new Object[]{softStart, alignmentStart}); + } + } + + return tests.toArray(new 
Object[][]{}); + } + + @Test(enabled = true, dataProvider = "RevertSoftClipsBeforeContig") + public void testRevertSoftClippedBasesBeforeStartOfContig(final int softStart, final int alignmentStart) { + final int nMatches = 10; + final int nSoft = -1 * (softStart - alignmentStart); + final String cigar = nSoft + "S" + nMatches + "M"; + final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + read.setAlignmentStart(alignmentStart); + + Assert.assertEquals(read.getSoftStart(), softStart); + Assert.assertEquals(read.getAlignmentStart(), alignmentStart); + Assert.assertEquals(read.getCigarString(), cigar); + + final GATKSAMRecord reverted = ReadClipper.revertSoftClippedBases(read); + + final int expectedAlignmentStart = 1; + final String expectedCigar = (1 - softStart) + "H" + read.getAlignmentEnd() + "M"; + Assert.assertEquals(reverted.getSoftStart(), expectedAlignmentStart); + Assert.assertEquals(reverted.getAlignmentStart(), expectedAlignmentStart); + Assert.assertEquals(reverted.getCigarString(), expectedCigar); + } + + private void assertNoLowQualBases(GATKSAMRecord read, byte low_qual) { + if (!read.isEmpty()) { + byte[] quals = read.getBaseQualities(); + for (int i = 0; i < quals.length; i++) + Assert.assertFalse(quals[i] <= low_qual, String.format("Found low qual (%d) base after hard clipping. 
Position: %d -- %s", low_qual, i, read.getCigarString())); + } + } + + private void checkClippedReadsForLowQualEnds(GATKSAMRecord read, GATKSAMRecord clippedRead, byte lowQual, int nLowQualBases) { + assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed + assertNoLowQualBases(clippedRead, lowQual); // Make sure the low qualities are gone + } + + /** + * Asserts that clipping doesn't change the getUnclippedStart / getUnclippedEnd + * + * @param original original read + * @param clipped clipped read + */ + private void assertUnclippedLimits(GATKSAMRecord original, GATKSAMRecord clipped) { + if (CigarUtils.readHasNonClippedBases(clipped)) { + Assert.assertEquals(original.getUnclippedStart(), clipped.getUnclippedStart()); + Assert.assertEquals(original.getUnclippedEnd(), clipped.getUnclippedEnd()); + } + } + + private boolean startsWithInsertion(Cigar cigar) { + return leadingCigarElementLength(cigar, CigarOperator.INSERTION) > 0; + } + + private int leadingCigarElementLength(Cigar cigar, CigarOperator operator) { + for (CigarElement cigarElement : cigar.getCigarElements()) { + if (cigarElement.getOperator() == operator) + return cigarElement.getLength(); + if (cigarElement.getOperator() != CigarOperator.HARD_CLIP) + break; + } + return 0; + } + + private boolean cigarHasElementsDifferentThanInsertionsAndHardClips(Cigar cigar) { + for (CigarElement cigarElement : cigar.getCigarElements()) + if (cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.HARD_CLIP) + return true; + return false; + } + + private class CigarCounter { + private HashMap counter; + + public Integer getCounterForOp(CigarOperator operator) { + return counter.get(operator); + } + + public CigarCounter(GATKSAMRecord read) { + CigarOperator[] operators = CigarOperator.values(); + counter = new HashMap(operators.length); + + for (CigarOperator op : operators) + counter.put(op, 0); + + for (CigarElement cigarElement : 
read.getCigar().getCigarElements()) + counter.put(cigarElement.getOperator(), counter.get(cigarElement.getOperator()) + cigarElement.getLength()); + } + + public boolean assertHardClippingSoftClips(CigarCounter clipped) { + for (CigarOperator op : counter.keySet()) { + if (op == CigarOperator.HARD_CLIP || op == CigarOperator.SOFT_CLIP) { + int counterTotal = counter.get(CigarOperator.HARD_CLIP) + counter.get(CigarOperator.SOFT_CLIP); + int clippedHard = clipped.getCounterForOp(CigarOperator.HARD_CLIP); + int clippedSoft = clipped.getCounterForOp(CigarOperator.SOFT_CLIP); + + Assert.assertEquals(counterTotal, clippedHard); + Assert.assertTrue(clippedSoft == 0); + } else + Assert.assertEquals(counter.get(op), clipped.getCounterForOp(op)); + } + return true; + } + + } + + @Test(enabled = !DEBUG) + public void testRevertEntirelySoftclippedReads() { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar("2H1S3H"); + GATKSAMRecord clippedRead = ReadClipper.revertSoftClippedBases(read); + Assert.assertEquals(clippedRead.getAlignmentStart(), read.getSoftStart()); + } + +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/collections/DefaultHashMapUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/collections/DefaultHashMapUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/collections/DefaultHashMapUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/collections/DefaultHashMapUnitTest.java diff 
--git a/public/java/test/org/broadinstitute/sting/utils/collections/ExpandingArrayListUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/collections/ExpandingArrayListUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/collections/ExpandingArrayListUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/collections/ExpandingArrayListUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/crypt/CryptUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/crypt/CryptUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/crypt/CryptUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/crypt/CryptUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/crypt/GATKKeyIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/crypt/GATKKeyIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/crypt/GATKKeyUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/crypt/GATKKeyUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java similarity index 100% rename from 
public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/file/FSLockWithSharedUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/file/FSLockWithSharedUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/file/FSLockWithSharedUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/file/FSLockWithSharedUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/fragments/FragmentUtilsBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/fragments/FragmentUtilsBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/haplotype/EventMapUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/haplotype/EventMapUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/haplotype/EventMapUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/haplotype/EventMapUnitTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java new file mode 100644 index 000000000..7eca44ee6 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java @@ -0,0 +1,326 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, 
distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.io; + +import org.apache.commons.io.FileUtils; +import org.broadinstitute.sting.BaseTest; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Random; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class IOUtilsUnitTest extends BaseTest { + @Test + public void testGoodTempDir() { + IOUtils.checkTempDir(new File("/tmp/queue")); + } + + @Test(expectedExceptions=UserException.BadTmpDir.class) + public void testBadTempDir() { + IOUtils.checkTempDir(new File("/tmp")); + } + + @Test + public void testAbsoluteSubDir() { + File subDir = IOUtils.absolute(new File("."), new File("/path/to/file")); + Assert.assertEquals(subDir, new File("/path/to/file")); + + subDir = IOUtils.absolute(new File("/different/path"), new File("/path/to/file")); + 
Assert.assertEquals(subDir, new File("/path/to/file")); + + subDir = IOUtils.absolute(new File("/different/path"), new File(".")); + Assert.assertEquals(subDir, new File("/different/path")); + } + + @Test + public void testRelativeSubDir() throws IOException { + File subDir = IOUtils.absolute(new File("."), new File("path/to/file")); + Assert.assertEquals(subDir.getCanonicalFile(), new File("path/to/file").getCanonicalFile()); + + subDir = IOUtils.absolute(new File("/different/path"), new File("path/to/file")); + Assert.assertEquals(subDir, new File("/different/path/path/to/file")); + } + + @Test + public void testDottedSubDir() throws IOException { + File subDir = IOUtils.absolute(new File("."), new File("path/../to/file")); + Assert.assertEquals(subDir.getCanonicalFile(), new File("path/../to/./file").getCanonicalFile()); + + subDir = IOUtils.absolute(new File("."), new File("/path/../to/file")); + Assert.assertEquals(subDir, new File("/path/../to/file")); + + subDir = IOUtils.absolute(new File("/different/../path"), new File("path/to/file")); + Assert.assertEquals(subDir, new File("/different/../path/path/to/file")); + + subDir = IOUtils.absolute(new File("/different/./path"), new File("/path/../to/file")); + Assert.assertEquals(subDir, new File("/path/../to/file")); + } + + @Test + public void testTempDir() { + File tempDir = IOUtils.tempDir("Q-Unit-Test", "", new File("queueTempDirToDelete")); + Assert.assertTrue(tempDir.exists()); + Assert.assertFalse(tempDir.isFile()); + Assert.assertTrue(tempDir.isDirectory()); + boolean deleted = IOUtils.tryDelete(tempDir); + Assert.assertTrue(deleted); + Assert.assertFalse(tempDir.exists()); + } + + @Test + public void testDirLevel() { + File dir = IOUtils.dirLevel(new File("/path/to/directory"), 1); + Assert.assertEquals(dir, new File("/path")); + + dir = IOUtils.dirLevel(new File("/path/to/directory"), 2); + Assert.assertEquals(dir, new File("/path/to")); + + dir = IOUtils.dirLevel(new File("/path/to/directory"), 3); + 
Assert.assertEquals(dir, new File("/path/to/directory")); + + dir = IOUtils.dirLevel(new File("/path/to/directory"), 4); + Assert.assertEquals(dir, new File("/path/to/directory")); + } + + @Test + public void testAbsolute() { + File dir = IOUtils.absolute(new File("/path/./to/./directory/.")); + Assert.assertEquals(dir, new File("/path/to/directory")); + + dir = IOUtils.absolute(new File("/")); + Assert.assertEquals(dir, new File("/")); + + dir = IOUtils.absolute(new File("/.")); + Assert.assertEquals(dir, new File("/")); + + dir = IOUtils.absolute(new File("/././.")); + Assert.assertEquals(dir, new File("/")); + + dir = IOUtils.absolute(new File("/./directory/.")); + Assert.assertEquals(dir, new File("/directory")); + + dir = IOUtils.absolute(new File("/./directory/./")); + Assert.assertEquals(dir, new File("/directory")); + + dir = IOUtils.absolute(new File("/./directory./")); + Assert.assertEquals(dir, new File("/directory.")); + + dir = IOUtils.absolute(new File("/./.directory/")); + Assert.assertEquals(dir, new File("/.directory")); + } + + @Test + public void testTail() throws IOException { + List lines = Arrays.asList( + "chr18_random 4262 3154410390 50 51", + "chr19_random 301858 3154414752 50 51", + "chr21_random 1679693 3154722662 50 51", + "chr22_random 257318 3156435963 50 51", + "chrX_random 1719168 3156698441 50 51"); + List tail = IOUtils.tail(new File(BaseTest.hg18Reference + ".fai"), 5); + Assert.assertEquals(tail.size(), 5); + for (int i = 0; i < 5; i++) + Assert.assertEquals(tail.get(i), lines.get(i)); + } + + @Test + public void testWriteSystemFile() throws IOException { + File temp = createTempFile("temp.", ".properties"); + try { + IOUtils.writeResource(new Resource("testProperties.properties", null), temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test + public void testWriteSystemTempFile() throws IOException { + File temp = IOUtils.writeTempResource(new Resource("testProperties.properties", null)); + try { + 
Assert.assertTrue(temp.getName().startsWith("testProperties"), "File does not start with 'testProperties.': " + temp); + Assert.assertTrue(temp.getName().endsWith(".properties"), "File does not end with '.properties': " + temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testMissingSystemFile() throws IOException { + File temp = createTempFile("temp.", ".properties"); + try { + IOUtils.writeResource(new Resource("MissingStingText.properties", null), temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test + public void testWriteRelativeFile() throws IOException { + File temp = createTempFile("temp.", ".properties"); + try { + IOUtils.writeResource(new Resource("/testProperties.properties", IOUtils.class), temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test + public void testWriteRelativeTempFile() throws IOException { + File temp = IOUtils.writeTempResource(new Resource("/testProperties.properties", IOUtils.class)); + try { + Assert.assertTrue(temp.getName().startsWith("testProperties"), "File does not start with 'testProperties.': " + temp); + Assert.assertTrue(temp.getName().endsWith(".properties"), "File does not end with '.properties': " + temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testMissingRelativeFile() throws IOException { + File temp = createTempFile("temp.", ".properties"); + try { + // Looking for /org/broadinstitute/sting/utils/file/StingText.properties + IOUtils.writeResource(new Resource("StingText.properties", IOUtils.class), temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test + public void testResourceProperties() { + Resource resource = new Resource("foo", Resource.class); + Assert.assertEquals(resource.getPath(), "foo"); + Assert.assertEquals(resource.getRelativeClass(), Resource.class); + } + + @Test + public 
void testIsSpecialFile() { + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/null"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/full"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/stdout"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/stderr"))); + Assert.assertFalse(IOUtils.isSpecialFile(null)); + Assert.assertFalse(IOUtils.isSpecialFile(new File("/home/user/my.file"))); + Assert.assertFalse(IOUtils.isSpecialFile(new File("/devfake/null"))); + } + + @DataProvider( name = "ByteArrayIOTestData") + public Object[][] byteArrayIOTestDataProvider() { + return new Object[][] { + // file size, read buffer size + { 0, 4096 }, + { 1, 4096 }, + { 2000, 4096 }, + { 4095, 4096 }, + { 4096, 4096 }, + { 4097, 4096 }, + { 6000, 4096 }, + { 8191, 4096 }, + { 8192, 4096 }, + { 8193, 4096 }, + { 10000, 4096 } + }; + } + + @Test( dataProvider = "ByteArrayIOTestData" ) + public void testWriteThenReadFileIntoByteArray ( int fileSize, int readBufferSize ) throws Exception { + File tempFile = createTempFile(String.format("testWriteThenReadFileIntoByteArray_%d_%d", fileSize, readBufferSize), "tmp"); + + byte[] dataWritten = getDeterministicRandomData(fileSize); + IOUtils.writeByteArrayToFile(dataWritten, tempFile); + byte[] dataRead = IOUtils.readFileIntoByteArray(tempFile, readBufferSize); + + Assert.assertEquals(dataRead.length, dataWritten.length); + Assert.assertTrue(Arrays.equals(dataRead, dataWritten)); + } + + @Test( dataProvider = "ByteArrayIOTestData" ) + public void testWriteThenReadStreamIntoByteArray ( int fileSize, int readBufferSize ) throws Exception { + File tempFile = createTempFile(String.format("testWriteThenReadStreamIntoByteArray_%d_%d", fileSize, readBufferSize), "tmp"); + + byte[] dataWritten = getDeterministicRandomData(fileSize); + IOUtils.writeByteArrayToStream(dataWritten, new FileOutputStream(tempFile)); + byte[] dataRead = 
IOUtils.readStreamIntoByteArray(new FileInputStream(tempFile), readBufferSize); + + Assert.assertEquals(dataRead.length, dataWritten.length); + Assert.assertTrue(Arrays.equals(dataRead, dataWritten)); + } + + @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) + public void testReadNonExistentFileIntoByteArray() { + File nonExistentFile = new File("djfhsdkjghdfk"); + Assert.assertFalse(nonExistentFile.exists()); + + IOUtils.readFileIntoByteArray(nonExistentFile); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testReadNullStreamIntoByteArray() { + IOUtils.readStreamIntoByteArray(null); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testReadStreamIntoByteArrayInvalidBufferSize() throws Exception { + IOUtils.readStreamIntoByteArray(new FileInputStream(createTempFile("testReadStreamIntoByteArrayInvalidBufferSize", "tmp")), + -1); + } + + @Test( expectedExceptions = UserException.CouldNotCreateOutputFile.class ) + public void testWriteByteArrayToUncreatableFile() { + IOUtils.writeByteArrayToFile(new byte[]{0}, new File("/dev/foo/bar")); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testWriteNullByteArrayToFile() { + IOUtils.writeByteArrayToFile(null, createTempFile("testWriteNullByteArrayToFile", "tmp")); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testWriteByteArrayToNullStream() { + IOUtils.writeByteArrayToStream(new byte[]{0}, null); + } + + private byte[] getDeterministicRandomData ( int size ) { + GenomeAnalysisEngine.resetRandomGenerator(); + Random rand = GenomeAnalysisEngine.getRandomGenerator(); + + byte[] randomData = new byte[size]; + rand.nextBytes(randomData); + + return randomData; + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachineUnitTest.java 
b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachineUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachineUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachineUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LIBS_position.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/LIBS_position.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/locusiterator/LIBS_position.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/LIBS_position.java diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/LocusIteratorBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/LocusIteratorBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateBaseTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateBaseTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateBaseTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateBaseTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java similarity index 100% rename from 
public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/MapResultUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/MapResultUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/nanoScheduler/MapResultUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/MapResultUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/pileup/PileupElementUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/pileup/PileupElementUnitTest.java new file mode 100644 index 000000000..39058233e --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/pileup/PileupElementUnitTest.java @@ -0,0 +1,189 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.pileup; + +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.locusiterator.AlignmentStateMachine; +import org.broadinstitute.sting.utils.locusiterator.LIBS_position; +import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByStateBaseTest; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +/** + * testing of the new (non-legacy) version of LocusIteratorByState + */ +public class PileupElementUnitTest extends LocusIteratorByStateBaseTest { + @DataProvider(name = "PileupElementTest") + public Object[][] makePileupElementTest() { +// return new Object[][]{{new LIBSTest("2X2D2P2X")}}; +// return createLIBSTests( +// Arrays.asList(2), +// Arrays.asList(2)); + return createLIBSTests( + Arrays.asList(1, 2), + Arrays.asList(1, 2, 3, 4)); + } + + @Test(dataProvider = "PileupElementTest") + public void testPileupElementTest(LIBSTest params) { + final GATKSAMRecord read = params.makeRead(); + final AlignmentStateMachine state = new AlignmentStateMachine(read); + final LIBS_position tester = new LIBS_position(read); + + while ( state.stepForwardOnGenome() != null ) { + tester.stepForwardOnGenome(); + final PileupElement pe = state.makePileupElement(); + + Assert.assertEquals(pe.getRead(), read); + Assert.assertEquals(pe.getMappingQual(), read.getMappingQuality()); + 
Assert.assertEquals(pe.getOffset(), state.getReadOffset()); + + Assert.assertEquals(pe.isDeletion(), state.getCigarOperator() == CigarOperator.D); + Assert.assertEquals(pe.isAfterInsertion(), tester.isAfterInsertion); + Assert.assertEquals(pe.isBeforeInsertion(), tester.isBeforeInsertion); + Assert.assertEquals(pe.isNextToSoftClip(), tester.isNextToSoftClip); + + if ( ! hasNeighboringPaddedOps(params.getElements(), pe.getCurrentCigarOffset()) ) { + Assert.assertEquals(pe.isAfterDeletionEnd(), tester.isAfterDeletionEnd); + Assert.assertEquals(pe.isBeforeDeletionStart(), tester.isBeforeDeletionStart); + } + + + + Assert.assertEquals(pe.atEndOfCurrentCigar(), state.getOffsetIntoCurrentCigarElement() == state.getCurrentCigarElement().getLength() - 1, "atEndOfCurrentCigar failed"); + Assert.assertEquals(pe.atStartOfCurrentCigar(), state.getOffsetIntoCurrentCigarElement() == 0, "atStartOfCurrentCigar failed"); + + Assert.assertEquals(pe.getBase(), pe.isDeletion() ? PileupElement.DELETION_BASE : read.getReadBases()[state.getReadOffset()]); + Assert.assertEquals(pe.getQual(), pe.isDeletion() ? 
PileupElement.DELETION_QUAL : read.getBaseQualities()[state.getReadOffset()]); + + Assert.assertEquals(pe.getCurrentCigarElement(), state.getCurrentCigarElement()); + Assert.assertEquals(pe.getCurrentCigarOffset(), state.getCurrentCigarElementOffset()); + + // tested in libs + //pe.getLengthOfImmediatelyFollowingIndel(); + //pe.getBasesOfImmediatelyFollowingInsertion(); + + // Don't test -- pe.getBaseIndex(); + if ( pe.atEndOfCurrentCigar() && state.getCurrentCigarElementOffset() < read.getCigarLength() - 1 ) { + final CigarElement nextElement = read.getCigar().getCigarElement(state.getCurrentCigarElementOffset() + 1); + if ( nextElement.getOperator() == CigarOperator.I ) { + Assert.assertTrue(pe.getBetweenNextPosition().size() >= 1); + Assert.assertEquals(pe.getBetweenNextPosition().get(0), nextElement); + } + if ( nextElement.getOperator() == CigarOperator.M ) { + Assert.assertTrue(pe.getBetweenNextPosition().isEmpty()); + } + } else { + Assert.assertTrue(pe.getBetweenNextPosition().isEmpty()); + } + + if ( pe.atStartOfCurrentCigar() && state.getCurrentCigarElementOffset() > 0 ) { + final CigarElement prevElement = read.getCigar().getCigarElement(state.getCurrentCigarElementOffset() - 1); + if ( prevElement.getOperator() == CigarOperator.I ) { + Assert.assertTrue(pe.getBetweenPrevPosition().size() >= 1); + Assert.assertEquals(pe.getBetweenPrevPosition().getLast(), prevElement); + } + if ( prevElement.getOperator() == CigarOperator.M ) { + Assert.assertTrue(pe.getBetweenPrevPosition().isEmpty()); + } + } else { + Assert.assertTrue(pe.getBetweenPrevPosition().isEmpty()); + } + + // TODO -- add meaningful tests + pe.getBaseInsertionQual(); + pe.getBaseDeletionQual(); + } + } + + + @DataProvider(name = "PrevAndNextTest") + public Object[][] makePrevAndNextTest() { + final List tests = new LinkedList(); + + final List operators = Arrays.asList(CigarOperator.I, CigarOperator.P, CigarOperator.S); + + for ( final CigarOperator firstOp : Arrays.asList(CigarOperator.M) ) { 
+ for ( final CigarOperator lastOp : Arrays.asList(CigarOperator.M, CigarOperator.D) ) { + for ( final int nIntermediate : Arrays.asList(1, 2, 3) ) { + for ( final List combination : Utils.makePermutations(operators, nIntermediate, false) ) { + final int readLength = 2 + combination.size(); + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); + read.setBaseQualities(Utils.dupBytes((byte) 30, readLength)); + + String cigar = "1" + firstOp; + for ( final CigarOperator op : combination ) cigar += "1" + op; + cigar += "1" + lastOp; + read.setCigarString(cigar); + + tests.add(new Object[]{read, firstOp, lastOp, combination}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "PrevAndNextTest") + public void testPrevAndNextTest(final GATKSAMRecord read, final CigarOperator firstOp, final CigarOperator lastOp, final List ops) { + final AlignmentStateMachine state = new AlignmentStateMachine(read); + + state.stepForwardOnGenome(); + final PileupElement pe = state.makePileupElement(); + Assert.assertEquals(pe.getBetweenNextPosition().size(), ops.size()); + Assert.assertEquals(pe.getBetweenPrevPosition().size(), 0); + assertEqualsOperators(pe.getBetweenNextPosition(), ops); + Assert.assertEquals(pe.getPreviousOnGenomeCigarElement(), null); + Assert.assertNotNull(pe.getNextOnGenomeCigarElement()); + Assert.assertEquals(pe.getNextOnGenomeCigarElement().getOperator(), lastOp); + + state.stepForwardOnGenome(); + final PileupElement pe2 = state.makePileupElement(); + Assert.assertEquals(pe2.getBetweenPrevPosition().size(), ops.size()); + Assert.assertEquals(pe2.getBetweenNextPosition().size(), 0); + assertEqualsOperators(pe2.getBetweenPrevPosition(), ops); + Assert.assertNotNull(pe2.getPreviousOnGenomeCigarElement()); + Assert.assertEquals(pe2.getPreviousOnGenomeCigarElement().getOperator(), firstOp); + 
Assert.assertEquals(pe2.getNextOnGenomeCigarElement(), null); + } + + private void assertEqualsOperators(final List elements, final List ops) { + for ( int i = 0; i < elements.size(); i++ ) { + Assert.assertEquals(elements.get(i).getOperator(), ops.get(i), "elements doesn't have expected operator at position " + i); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java new file mode 100644 index 000000000..2ede67a3c --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java @@ -0,0 +1,121 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.progressmeter; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +/** + * UnitTests for the ProgressMeterDaemon + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class ProgressMeterDaemonUnitTest extends BaseTest { + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void init() throws FileNotFoundException { + genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(b37KGReference))); + } + + // capture and count calls to progress + private class TestingProgressMeter extends ProgressMeter { + final List progressCalls = new LinkedList(); + + private TestingProgressMeter(final long poll) { + super(null, "test", new GenomeLocSortedSet(genomeLocParser), poll); + super.start(); + } + + @Override + protected synchronized void printProgress(boolean mustPrint) { + progressCalls.add(System.currentTimeMillis()); + } + } + + @DataProvider(name = "PollingData") + public Object[][] makePollingData() { + List tests = new ArrayList(); + for ( final int ticks : Arrays.asList(1, 5, 10) ) { + for ( final int poll : Arrays.asList(10, 100) ) { + tests.add(new Object[]{poll, ticks}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test + public void testPeriodUpdateNano() { + final ProgressMeter meter = new TestingProgressMeter(10); + final long currentTime = meter.getRuntimeInNanoseconds(); + meter.updateElapsedTimeInNanoseconds(); + Assert.assertTrue( meter.getRuntimeInNanosecondsUpdatedPeriodically() > currentTime, "Updating the periodic runtime failed" ); + } + + @Test(dataProvider = "PollingData", invocationCount = 10, successPercentage = 90, enabled = false) + public void testProgressMeterDaemon(final long poll, final int ticks) throws InterruptedException { + final TestingProgressMeter meter = new TestingProgressMeter(poll); + final ProgressMeterDaemon daemon = meter.getProgressMeterDaemon(); + + Assert.assertTrue(daemon.isDaemon()); + + Assert.assertFalse(daemon.isDone()); + Thread.sleep(ticks * poll); + Assert.assertFalse(daemon.isDone()); + + daemon.done(); + Assert.assertTrue(daemon.isDone()); + + // wait for the thread to actually finish + daemon.join(); + + 
Assert.assertTrue(meter.progressCalls.size() >= 1, + "Expected at least one progress update call from daemon thread, but only got " + meter.progressCalls.size() + " with exact calls " + meter.progressCalls); + + final int tolerance = (int)Math.ceil(0.8 * meter.progressCalls.size()); + Assert.assertTrue(Math.abs(meter.progressCalls.size() - ticks) <= tolerance, + "Expected " + ticks + " progress calls from daemon thread, but got " + meter.progressCalls.size() + " and a tolerance of only " + tolerance); + + Assert.assertTrue(meter.getRuntimeInNanosecondsUpdatedPeriodically() > 0, "Daemon should have updated our periodic runtime"); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/report/ReportMarshallerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/report/ReportMarshallerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/report/ReportMarshallerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/report/ReportMarshallerUnitTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/utils/runtime/ProcessControllerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/runtime/ProcessControllerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/runtime/ProcessControllerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/runtime/ProcessControllerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/runtime/RuntimeUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/runtime/RuntimeUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/runtime/RuntimeUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/runtime/RuntimeUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIteratorUnitTest.java similarity index 100% rename from 
public/java/test/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriterUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriterUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriterUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriterUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialSAMUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialSAMUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java new file mode 100644 index 000000000..837f3fa45 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java @@ -0,0 +1,78 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + + +public class GATKSAMRecordUnitTest extends BaseTest { + GATKSAMRecord read; + final static String BASES = "ACTG"; + final static String QUALS = "!+5?"; + + @BeforeClass + public void init() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length()); + read.setReadUnmappedFlag(true); + read.setReadBases(new String(BASES).getBytes()); + read.setBaseQualityString(new String(QUALS)); + } + + @Test + public void testStrandlessReads() { + final byte [] bases = {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'}; + final byte [] quals = {20 , 20 , 20 , 20 , 20 , 20 , 20 , 20 }; + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "6M"); + Assert.assertEquals(read.isStrandless(), false); + + read.setReadNegativeStrandFlag(false); + Assert.assertEquals(read.isStrandless(), false); + Assert.assertEquals(read.getReadNegativeStrandFlag(), false); + + read.setReadNegativeStrandFlag(true); + Assert.assertEquals(read.isStrandless(), false); + Assert.assertEquals(read.getReadNegativeStrandFlag(), true); + + read.setReadNegativeStrandFlag(true); + read.setIsStrandless(true); + Assert.assertEquals(read.isStrandless(), true); + Assert.assertEquals(read.getReadNegativeStrandFlag(), false, "negative strand flag should return false even through its set for a strandless read"); + } + + @Test(expectedExceptions = IllegalStateException.class) + public void testStrandlessReadsFailSetStrand() { + final byte [] bases = {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'}; + final byte [] quals = {20 , 20 , 20 , 20 , 20 , 20 , 20 , 20 }; + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "6M"); + 
read.setIsStrandless(true); + read.setReadNegativeStrandFlag(true); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java new file mode 100644 index 000000000..5732e5746 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -0,0 +1,340 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + + +public class ReadUtilsUnitTest extends BaseTest { + private interface GetAdaptorFunc { + public int getAdaptor(final GATKSAMRecord record); + } + + @DataProvider(name = "AdaptorGetter") + public Object[][] makeActiveRegionCutTests() { + final List tests = new LinkedList(); + + tests.add( new Object[]{ new GetAdaptorFunc() { + @Override public int getAdaptor(final GATKSAMRecord record) { return ReadUtils.getAdaptorBoundary(record); } + }}); + + tests.add( new Object[]{ new GetAdaptorFunc() { + @Override public int getAdaptor(final GATKSAMRecord record) { return record.getAdaptorBoundary(); } + }}); + + return tests.toArray(new Object[][]{}); + } + + private GATKSAMRecord makeRead(final int fragmentSize, final int mateStart) { + final byte[] bases = {'A', 'C', 'G', 'T', 'A', 'C', 'G', 'T'}; + final byte[] quals = {30, 30, 30, 30, 30, 30, 30, 30}; + final String cigar = "8M"; + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, cigar); + read.setProperPairFlag(true); + read.setReadPairedFlag(true); + read.setMateAlignmentStart(mateStart); + 
read.setInferredInsertSize(fragmentSize); + return read; + } + + @Test(dataProvider = "AdaptorGetter") + public void testGetAdaptorBoundary(final GetAdaptorFunc get) { + final int fragmentSize = 10; + final int mateStart = 1000; + final int BEFORE = mateStart - 2; + final int AFTER = mateStart + 2; + int myStart, boundary; + GATKSAMRecord read; + + // Test case 1: positive strand, first read + read = makeRead(fragmentSize, mateStart); + myStart = BEFORE; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(false); + read.setMateNegativeStrandFlag(true); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, myStart + fragmentSize + 1); + + // Test case 2: positive strand, second read + read = makeRead(fragmentSize, mateStart); + myStart = AFTER; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(false); + read.setMateNegativeStrandFlag(true); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, myStart + fragmentSize + 1); + + // Test case 3: negative strand, second read + read = makeRead(fragmentSize, mateStart); + myStart = AFTER; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(true); + read.setMateNegativeStrandFlag(false); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, mateStart - 1); + + // Test case 4: negative strand, first read + read = makeRead(fragmentSize, mateStart); + myStart = BEFORE; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(true); + read.setMateNegativeStrandFlag(false); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, mateStart - 1); + + // Test case 5: mate is mapped to another chromosome (test both strands) + read = makeRead(fragmentSize, mateStart); + read.setInferredInsertSize(0); + read.setReadNegativeStrandFlag(true); + read.setMateNegativeStrandFlag(false); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); + 
read.setReadNegativeStrandFlag(false); + read.setMateNegativeStrandFlag(true); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); + read.setInferredInsertSize(10); + + // Test case 6: read is unmapped + read = makeRead(fragmentSize, mateStart); + read.setReadUnmappedFlag(true); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); + read.setReadUnmappedFlag(false); + + // Test case 7: reads don't overlap and look like this: + // <--------| + // |------> + // first read: + read = makeRead(fragmentSize, mateStart); + myStart = 980; + read.setAlignmentStart(myStart); + read.setInferredInsertSize(20); + read.setReadNegativeStrandFlag(true); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); + + // second read: + read = makeRead(fragmentSize, mateStart); + myStart = 1000; + read.setAlignmentStart(myStart); + read.setInferredInsertSize(20); + read.setMateAlignmentStart(980); + read.setReadNegativeStrandFlag(false); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); + + // Test case 8: read doesn't have proper pair flag set + read = makeRead(fragmentSize, mateStart); + read.setReadPairedFlag(true); + read.setProperPairFlag(false); + Assert.assertEquals(get.getAdaptor(read), ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); + + // Test case 9: read and mate have same negative flag setting + for ( final boolean negFlag: Arrays.asList(true, false) ) { + read = makeRead(fragmentSize, mateStart); + read.setAlignmentStart(BEFORE); + read.setReadPairedFlag(true); + read.setProperPairFlag(true); + read.setReadNegativeStrandFlag(negFlag); + read.setMateNegativeStrandFlag(!negFlag); + Assert.assertTrue(get.getAdaptor(read) != ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY, "Get adaptor should have succeeded"); + + read = makeRead(fragmentSize, mateStart); + 
read.setAlignmentStart(BEFORE); + read.setReadPairedFlag(true); + read.setProperPairFlag(true); + read.setReadNegativeStrandFlag(negFlag); + read.setMateNegativeStrandFlag(negFlag); + Assert.assertEquals(get.getAdaptor(read), ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY, "Get adaptor should have failed for reads with bad alignment orientation"); + } + } + + @Test (enabled = true) + public void testGetBasesReverseComplement() { + int iterations = 1000; + Random random = GenomeAnalysisEngine.getRandomGenerator(); + while(iterations-- > 0) { + final int l = random.nextInt(1000); + GATKSAMRecord read = GATKSAMRecord.createRandomRead(l); + byte [] original = read.getReadBases(); + byte [] reconverted = new byte[l]; + String revComp = ReadUtils.getBasesReverseComplement(read); + for (int i=0; i reads = new ArrayList(); + for( int readLength = minLength; readLength <= maxLength; readLength++ ) { + reads.add( ReadUtils.createRandomRead( readLength ) ); + } + Assert.assertEquals(ReadUtils.getMaxReadLength(reads), maxLength, "max length does not match"); + } + } + + final List reads = new LinkedList(); + Assert.assertEquals(ReadUtils.getMaxReadLength(reads), 0, "Empty list should have max length of zero"); + } + + @Test (enabled = true) + public void testReadWithNsRefIndexInDeletion() throws FileNotFoundException { + + final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + final int readLength = 76; + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 8975, readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setCigarString("3M414N1D73M"); + + final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9392, ReadUtils.ClippingTail.LEFT_TAIL); + Assert.assertEquals(result, 
2); + } + + @Test (enabled = true) + public void testReadWithNsRefAfterDeletion() throws FileNotFoundException { + + final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + final int readLength = 76; + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 8975, readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setCigarString("3M414N1D73M"); + + final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9393, ReadUtils.ClippingTail.LEFT_TAIL); + Assert.assertEquals(result, 3); + } + + @DataProvider(name = "HasWellDefinedFragmentSizeData") + public Object[][] makeHasWellDefinedFragmentSizeData() throws Exception { + final List tests = new LinkedList(); + + // setup a basic read that will work + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, 10); + read.setReadPairedFlag(true); + read.setProperPairFlag(true); + read.setReadUnmappedFlag(false); + read.setMateUnmappedFlag(false); + read.setAlignmentStart(100); + read.setCigarString("50M"); + read.setMateAlignmentStart(130); + read.setInferredInsertSize(80); + read.setFirstOfPairFlag(true); + read.setReadNegativeStrandFlag(false); + read.setMateNegativeStrandFlag(true); + + tests.add( new Object[]{ "basic case", read.clone(), true }); + + { + final GATKSAMRecord bad1 = (GATKSAMRecord)read.clone(); + bad1.setReadPairedFlag(false); + tests.add( new Object[]{ "not paired", bad1, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setProperPairFlag(false); + // we currently don't require the proper pair flag to be set + tests.add( new Object[]{ "not proper pair", bad, 
true }); +// tests.add( new Object[]{ "not proper pair", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setReadUnmappedFlag(true); + tests.add( new Object[]{ "read is unmapped", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setMateUnmappedFlag(true); + tests.add( new Object[]{ "mate is unmapped", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setMateNegativeStrandFlag(false); + tests.add( new Object[]{ "read and mate both on positive strand", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setReadNegativeStrandFlag(true); + tests.add( new Object[]{ "read and mate both on negative strand", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setInferredInsertSize(0); + tests.add( new Object[]{ "insert size is 0", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setAlignmentStart(1000); + tests.add( new Object[]{ "positve read starts after mate end", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setReadNegativeStrandFlag(true); + bad.setMateNegativeStrandFlag(false); + bad.setMateAlignmentStart(1000); + tests.add( new Object[]{ "negative strand read ends before mate starts", bad, false }); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "HasWellDefinedFragmentSizeData") + private void testHasWellDefinedFragmentSize(final String name, final GATKSAMRecord read, final boolean expected) { + Assert.assertEquals(ReadUtils.hasWellDefinedFragmentSize(read), expected); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/smithwaterman/SmithWatermanBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/smithwaterman/SmithWatermanBenchmark.java similarity index 100% rename from 
public/java/test/org/broadinstitute/sting/utils/smithwaterman/SmithWatermanBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/smithwaterman/SmithWatermanBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/ThreadPoolMonitorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/threading/ThreadPoolMonitorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/threading/ThreadPoolMonitorUnitTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/threading/ThreadPoolMonitorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVCFUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVCFUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/variant/GATKVCFUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVCFUtilsUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java new file mode 100644 index 000000000..efc701a6d --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -0,0 +1,1741 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.variant; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.*; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class GATKVariantContextUtilsUnitTest extends BaseTest { + private final static boolean DEBUG = false; + + Allele Aref, T, C, G, Cref, ATC, ATCATC; + Allele ATCATCT; + Allele ATref; + Allele Anoref; + Allele GT; + + @BeforeSuite + public void setup() { + // alleles + Aref = Allele.create("A", true); + Cref = Allele.create("C", true); + T = Allele.create("T"); + C = Allele.create("C"); + G = Allele.create("G"); + ATC = Allele.create("ATC"); + ATCATC = Allele.create("ATCATC"); + ATCATCT = Allele.create("ATCATCT"); + ATref = Allele.create("AT",true); + Anoref = Allele.create("A",false); + GT = Allele.create("GT",false); + } + + private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, int... 
pls) { + return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).log10PError(log10pError).PL(pls).make(); + } + + + private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError) { + return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).log10PError(log10pError).make(); + } + + private VariantContext makeVC(String source, List alleles) { + return makeVC(source, alleles, null, null); + } + + private VariantContext makeVC(String source, List alleles, Genotype... g1) { + return makeVC(source, alleles, Arrays.asList(g1)); + } + + private VariantContext makeVC(String source, List alleles, String filter) { + return makeVC(source, alleles, filter.equals(".") ? null : new HashSet(Arrays.asList(filter))); + } + + private VariantContext makeVC(String source, List alleles, Set filters) { + return makeVC(source, alleles, null, filters); + } + + private VariantContext makeVC(String source, List alleles, Collection genotypes) { + return makeVC(source, alleles, genotypes, null); + } + + private VariantContext makeVC(String source, List alleles, Collection genotypes, Set filters) { + int start = 10; + int stop = start + alleles.get(0).length() - 1; // alleles.contains(ATC) ? start + 3 : start; + return new VariantContextBuilder(source, "1", start, stop, alleles).genotypes(genotypes).filters(filters).make(); + } + + // -------------------------------------------------------------------------------- + // + // Test allele merging + // + // -------------------------------------------------------------------------------- + + private class MergeAllelesTest extends TestDataProvider { + List> inputs; + List expected; + + private MergeAllelesTest(List... 
arg) { + super(MergeAllelesTest.class); + LinkedList> all = new LinkedList<>(Arrays.asList(arg)); + expected = all.pollLast(); + inputs = all; + } + + public String toString() { + return String.format("MergeAllelesTest input=%s expected=%s", inputs, expected); + } + } + @DataProvider(name = "mergeAlleles") + public Object[][] mergeAllelesData() { + // first, do no harm + new MergeAllelesTest(Arrays.asList(Aref), + Arrays.asList(Aref)); + + new MergeAllelesTest(Arrays.asList(Aref), + Arrays.asList(Aref), + Arrays.asList(Aref)); + + new MergeAllelesTest(Arrays.asList(Aref), + Arrays.asList(Aref, T), + Arrays.asList(Aref, T)); + + new MergeAllelesTest(Arrays.asList(Aref, C), + Arrays.asList(Aref, T), + Arrays.asList(Aref, C, T)); + + new MergeAllelesTest(Arrays.asList(Aref, T), + Arrays.asList(Aref, C), + Arrays.asList(Aref, T, C)); // in order of appearance + + new MergeAllelesTest(Arrays.asList(Aref, C, T), + Arrays.asList(Aref, C), + Arrays.asList(Aref, C, T)); + + new MergeAllelesTest(Arrays.asList(Aref, C, T), Arrays.asList(Aref, C, T)); + + new MergeAllelesTest(Arrays.asList(Aref, T, C), Arrays.asList(Aref, T, C)); + + new MergeAllelesTest(Arrays.asList(Aref, T, C), + Arrays.asList(Aref, C), + Arrays.asList(Aref, T, C)); // in order of appearance + + new MergeAllelesTest(Arrays.asList(Aref), + Arrays.asList(Aref, ATC), + Arrays.asList(Aref, ATC)); + + new MergeAllelesTest(Arrays.asList(Aref), + Arrays.asList(Aref, ATC, ATCATC), + Arrays.asList(Aref, ATC, ATCATC)); + + // alleles in the order we see them + new MergeAllelesTest(Arrays.asList(Aref, ATCATC), + Arrays.asList(Aref, ATC, ATCATC), + Arrays.asList(Aref, ATCATC, ATC)); + + // same + new MergeAllelesTest(Arrays.asList(Aref, ATC), + Arrays.asList(Aref, ATCATC), + Arrays.asList(Aref, ATC, ATCATC)); + + new MergeAllelesTest(Arrays.asList(ATref, ATC, Anoref, G), + Arrays.asList(Aref, ATCATC, G), + Arrays.asList(ATref, ATC, Anoref, G, ATCATCT, GT)); + + return MergeAllelesTest.getTests(MergeAllelesTest.class); 
+ } + + @Test(enabled = !DEBUG, dataProvider = "mergeAlleles") + public void testMergeAlleles(MergeAllelesTest cfg) { + final List inputs = new ArrayList(); + + int i = 0; + for ( final List alleles : cfg.inputs ) { + final String name = "vcf" + ++i; + inputs.add(makeVC(name, alleles)); + } + + final List priority = vcs2priority(inputs); + + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + inputs, priority, + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false); + + Assert.assertEquals(merged.getAlleles().size(),cfg.expected.size()); + Assert.assertEquals(merged.getAlleles(), cfg.expected); + } + + // -------------------------------------------------------------------------------- + // + // Test rsID merging + // + // -------------------------------------------------------------------------------- + + private class SimpleMergeRSIDTest extends TestDataProvider { + List inputs; + String expected; + + private SimpleMergeRSIDTest(String... 
arg) { + super(SimpleMergeRSIDTest.class); + LinkedList allStrings = new LinkedList(Arrays.asList(arg)); + expected = allStrings.pollLast(); + inputs = allStrings; + } + + public String toString() { + return String.format("SimpleMergeRSIDTest vc=%s expected=%s", inputs, expected); + } + } + + @DataProvider(name = "simplemergersiddata") + public Object[][] createSimpleMergeRSIDData() { + new SimpleMergeRSIDTest(".", "."); + new SimpleMergeRSIDTest(".", ".", "."); + new SimpleMergeRSIDTest("rs1", "rs1"); + new SimpleMergeRSIDTest("rs1", "rs1", "rs1"); + new SimpleMergeRSIDTest(".", "rs1", "rs1"); + new SimpleMergeRSIDTest("rs1", ".", "rs1"); + new SimpleMergeRSIDTest("rs1", "rs2", "rs1,rs2"); + new SimpleMergeRSIDTest("rs1", "rs2", "rs1", "rs1,rs2"); // duplicates + new SimpleMergeRSIDTest("rs2", "rs1", "rs2,rs1"); + new SimpleMergeRSIDTest("rs2", "rs1", ".", "rs2,rs1"); + new SimpleMergeRSIDTest("rs2", ".", "rs1", "rs2,rs1"); + new SimpleMergeRSIDTest("rs1", ".", ".", "rs1"); + new SimpleMergeRSIDTest("rs1", "rs2", "rs3", "rs1,rs2,rs3"); + + return SimpleMergeRSIDTest.getTests(SimpleMergeRSIDTest.class); + } + + @Test(enabled = !DEBUG, dataProvider = "simplemergersiddata") + public void testRSIDMerge(SimpleMergeRSIDTest cfg) { + VariantContext snpVC1 = makeVC("snpvc1", Arrays.asList(Aref, T)); + final List inputs = new ArrayList(); + + for ( final String id : cfg.inputs ) { + inputs.add(new VariantContextBuilder(snpVC1).id(id).make()); + } + + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + inputs, null, + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false); + Assert.assertEquals(merged.getID(), cfg.expected); + } + + // -------------------------------------------------------------------------------- + // + // Test filtered merging + // + // -------------------------------------------------------------------------------- + + private class 
MergeFilteredTest extends TestDataProvider { + List inputs; + VariantContext expected; + String setExpected; + GATKVariantContextUtils.FilteredRecordMergeType type; + + + private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, String setExpected) { + this(name, input1, input2, expected, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, setExpected); + } + + private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, GATKVariantContextUtils.FilteredRecordMergeType type, String setExpected) { + super(MergeFilteredTest.class, name); + LinkedList all = new LinkedList(Arrays.asList(input1, input2)); + this.expected = expected; + this.type = type; + inputs = all; + this.setExpected = setExpected; + } + + public String toString() { + return String.format("%s input=%s expected=%s", super.toString(), inputs, expected); + } + } + + @DataProvider(name = "mergeFiltered") + public Object[][] mergeFilteredData() { + new MergeFilteredTest("AllPass", + makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + GATKVariantContextUtils.MERGE_INTERSECTION); + + new MergeFilteredTest("noFilters", + makeVC("1", Arrays.asList(Aref, T), "."), + makeVC("2", Arrays.asList(Aref, T), "."), + makeVC("3", Arrays.asList(Aref, T), "."), + GATKVariantContextUtils.MERGE_INTERSECTION); + + new MergeFilteredTest("oneFiltered", + makeVC("1", Arrays.asList(Aref, T), "."), + makeVC("2", Arrays.asList(Aref, T), "FAIL"), + makeVC("3", Arrays.asList(Aref, T), "."), + String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); + + new MergeFilteredTest("onePassOneFail", + makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + makeVC("2", Arrays.asList(Aref, T), "FAIL"), + makeVC("3", Arrays.asList(Aref, T), 
VariantContext.PASSES_FILTERS), + String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); + + new MergeFilteredTest("AllFiltered", + makeVC("1", Arrays.asList(Aref, T), "FAIL"), + makeVC("2", Arrays.asList(Aref, T), "FAIL"), + makeVC("3", Arrays.asList(Aref, T), "FAIL"), + GATKVariantContextUtils.MERGE_FILTER_IN_ALL); + + // test ALL vs. ANY + new MergeFilteredTest("FailOneUnfiltered", + makeVC("1", Arrays.asList(Aref, T), "FAIL"), + makeVC("2", Arrays.asList(Aref, T), "."), + makeVC("3", Arrays.asList(Aref, T), "."), + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + String.format("%s1-2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); + + new MergeFilteredTest("OneFailAllUnfilteredArg", + makeVC("1", Arrays.asList(Aref, T), "FAIL"), + makeVC("2", Arrays.asList(Aref, T), "."), + makeVC("3", Arrays.asList(Aref, T), "FAIL"), + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ALL_UNFILTERED, + String.format("%s1-2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); + + // test excluding allele in filtered record + new MergeFilteredTest("DontIncludeAlleleOfFilteredRecords", + makeVC("1", Arrays.asList(Aref, T), "."), + makeVC("2", Arrays.asList(Aref, T), "FAIL"), + makeVC("3", Arrays.asList(Aref, T), "."), + String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); + + // promotion of site from unfiltered to PASSES + new MergeFilteredTest("UnfilteredPlusPassIsPass", + makeVC("1", Arrays.asList(Aref, T), "."), + makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + GATKVariantContextUtils.MERGE_INTERSECTION); + + new MergeFilteredTest("RefInAll", + makeVC("1", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), + makeVC("2", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), + makeVC("3", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), + GATKVariantContextUtils.MERGE_REF_IN_ALL); + + new MergeFilteredTest("RefInOne", + 
makeVC("1", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), + makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + "2"); + + return MergeFilteredTest.getTests(MergeFilteredTest.class); + } + + @Test(enabled = !DEBUG, dataProvider = "mergeFiltered") + public void testMergeFiltered(MergeFilteredTest cfg) { + final List priority = vcs2priority(cfg.inputs); + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); + + // test alleles are equal + Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); + + // test set field + Assert.assertEquals(merged.getAttribute("set"), cfg.setExpected); + + // test filter field + Assert.assertEquals(merged.getFilters(), cfg.expected.getFilters()); + } + + // -------------------------------------------------------------------------------- + // + // Test genotype merging + // + // -------------------------------------------------------------------------------- + + private class MergeGenotypesTest extends TestDataProvider { + List inputs; + VariantContext expected; + List priority; + + private MergeGenotypesTest(String name, String priority, VariantContext... 
arg) { + super(MergeGenotypesTest.class, name); + LinkedList all = new LinkedList(Arrays.asList(arg)); + this.expected = all.pollLast(); + inputs = all; + this.priority = Arrays.asList(priority.split(",")); + } + + public String toString() { + return String.format("%s input=%s expected=%s", super.toString(), inputs, expected); + } + } + + @DataProvider(name = "mergeGenotypes") + public Object[][] mergeGenotypesData() { + new MergeGenotypesTest("TakeGenotypeByPriority-1,2", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1))); + + new MergeGenotypesTest("TakeGenotypeByPriority-1,2-nocall", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1))); + + new MergeGenotypesTest("TakeGenotypeByPriority-2,1", "2,1", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2))); + + new MergeGenotypesTest("NonOverlappingGenotypes", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1), makeG("s2", Aref, T, -2))); + + new MergeGenotypesTest("PreserveNoCall", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1), makeG("s2", Aref, T, -2))); + + new MergeGenotypesTest("PerserveAlleles", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", 
Arrays.asList(Aref, C), makeG("s2", Aref, C, -2)), + makeVC("3", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1), makeG("s2", Aref, C, -2))); + + new MergeGenotypesTest("TakeGenotypePartialOverlap-1,2", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1), makeG("s3", Aref, T, -3))); + + new MergeGenotypesTest("TakeGenotypePartialOverlap-2,1", "2,1", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3))); + + // + // merging genotypes with PLs + // + + // first, do no harm + new MergeGenotypesTest("OrderedPLs", "1", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1, 1, 2, 3)), + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1, 1, 2, 3))); + + // first, do no harm + new MergeGenotypesTest("OrderedPLs-3Alleles", "1", + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6))); + + // first, do no harm + new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6))); + + // first, do no harm + new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s2", Aref, C, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6), makeG("s2", Aref, C, -1, 1, 2, 3, 4, 5, 6))); + + new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-2,1", 
"2,1", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1,5,0,3)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2))); + + new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-1,2", "1,2", + makeVC("1", Arrays.asList(Aref,ATC), makeG("s1", Aref, ATC, -1,5,0,3)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2)), + // no likelihoods on result since type changes to mixed multiallelic + makeVC("3", Arrays.asList(Aref, ATC, T), makeG("s1", Aref, ATC, -1), makeG("s3", Aref, T, -3))); + + new MergeGenotypesTest("MultipleSamplePLsDifferentOrder", "1,2", + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, -1, 1, 2, 3, 4, 5, 6)), + makeVC("2", Arrays.asList(Aref, T, C), makeG("s2", Aref, T, -2, 6, 5, 4, 3, 2, 1)), + // no likelihoods on result since type changes to mixed multiallelic + makeVC("3", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, -1), makeG("s2", Aref, T, -2))); + + return MergeGenotypesTest.getTests(MergeGenotypesTest.class); + } + + @Test(enabled = !DEBUG, dataProvider = "mergeGenotypes") + public void testMergeGenotypes(MergeGenotypesTest cfg) { + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + cfg.inputs, cfg.priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); + + // test alleles are equal + Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); + + // test genotypes + assertGenotypesAreMostlyEqual(merged.getGenotypes(), cfg.expected.getGenotypes()); + } + + // necessary to not overload equals for genotypes + private void assertGenotypesAreMostlyEqual(GenotypesContext actual, GenotypesContext expected) { + if (actual == expected) { + return; + } + + if (actual == null || 
expected == null) { + Assert.fail("Maps not equal: expected: " + expected + " and actual: " + actual); + } + + if (actual.size() != expected.size()) { + Assert.fail("Maps do not have the same size:" + actual.size() + " != " + expected.size()); + } + + for (Genotype value : actual) { + Genotype expectedValue = expected.get(value.getSampleName()); + + Assert.assertEquals(value.getAlleles(), expectedValue.getAlleles(), "Alleles in Genotype aren't equal"); + Assert.assertEquals(value.getGQ(), expectedValue.getGQ(), "GQ values aren't equal"); + Assert.assertEquals(value.hasLikelihoods(), expectedValue.hasLikelihoods(), "Either both have likelihoods or both not"); + if ( value.hasLikelihoods() ) + Assert.assertEquals(value.getLikelihoods().getAsVector(), expectedValue.getLikelihoods().getAsVector(), "Genotype likelihoods aren't equal"); + } + } + + @Test(enabled = !DEBUG) + public void testMergeGenotypesUniquify() { + final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); + final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); + + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + Arrays.asList(vc1, vc2), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false); + + // test genotypes + Assert.assertEquals(merged.getSampleNames(), new HashSet<>(Arrays.asList("s1.1", "s1.2"))); + } + +// TODO: remove after testing +// @Test(expectedExceptions = IllegalStateException.class) +// public void testMergeGenotypesRequireUnique() { +// final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); +// final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); +// +// final VariantContext merged = VariantContextUtils.simpleMerge( +// Arrays.asList(vc1, vc2), null, 
VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, +// VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false, false); +// } + + // -------------------------------------------------------------------------------- + // + // Misc. tests + // + // -------------------------------------------------------------------------------- + + @Test(enabled = !DEBUG) + public void testAnnotationSet() { + for ( final boolean annotate : Arrays.asList(true, false)) { + for ( final String set : Arrays.asList("set", "combine", "x")) { + final List priority = Arrays.asList("1", "2"); + VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS); + VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS); + + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + Arrays.asList(vc1, vc2), priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false); + + if ( annotate ) + Assert.assertEquals(merged.getAttribute(set), GATKVariantContextUtils.MERGE_INTERSECTION); + else + Assert.assertFalse(merged.hasAttribute(set)); + } + } + } + + private static final List vcs2priority(final Collection vcs) { + final List priority = new ArrayList<>(); + + for ( final VariantContext vc : vcs ) { + priority.add(vc.getSource()); + } + + return priority; + } + + // -------------------------------------------------------------------------------- + // + // basic allele clipping test + // + // -------------------------------------------------------------------------------- + + private class ReverseClippingPositionTestProvider extends TestDataProvider { + final String ref; + final List alleles = new ArrayList(); + final int expectedClip; + + private ReverseClippingPositionTestProvider(final int expectedClip, final String ref, final String... 
alleles) { + super(ReverseClippingPositionTestProvider.class); + this.ref = ref; + for ( final String allele : alleles ) + this.alleles.add(Allele.create(allele)); + this.expectedClip = expectedClip; + } + + @Override + public String toString() { + return String.format("ref=%s allele=%s reverse clip %d", ref, alleles, expectedClip); + } + } + + @DataProvider(name = "ReverseClippingPositionTestProvider") + public Object[][] makeReverseClippingPositionTestProvider() { + // pair clipping + new ReverseClippingPositionTestProvider(0, "ATT", "CCG"); + new ReverseClippingPositionTestProvider(1, "ATT", "CCT"); + new ReverseClippingPositionTestProvider(2, "ATT", "CTT"); + new ReverseClippingPositionTestProvider(2, "ATT", "ATT"); // cannot completely clip allele + + // triplets + new ReverseClippingPositionTestProvider(0, "ATT", "CTT", "CGG"); + new ReverseClippingPositionTestProvider(1, "ATT", "CTT", "CGT"); // the T can go + new ReverseClippingPositionTestProvider(2, "ATT", "CTT", "CTT"); // both Ts can go + + return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class); + } + + @Test(enabled = !DEBUG, dataProvider = "ReverseClippingPositionTestProvider") + public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) { + int result = GATKVariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes()); + Assert.assertEquals(result, cfg.expectedClip); + } + + + // -------------------------------------------------------------------------------- + // + // test splitting into bi-allelics + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "SplitBiallelics") + public Object[][] makeSplitBiallelics() throws CloneNotSupportedException { + List tests = new ArrayList(); + + final VariantContextBuilder root = new VariantContextBuilder("x", "20", 10, 10, Arrays.asList(Aref, C)); + + // biallelic -> biallelic + tests.add(new Object[]{root.make(), 
Arrays.asList(root.make())}); + + // monos -> monos + root.alleles(Arrays.asList(Aref)); + tests.add(new Object[]{root.make(), Arrays.asList(root.make())}); + + root.alleles(Arrays.asList(Aref, C, T)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Aref, C)).make(), + root.alleles(Arrays.asList(Aref, T)).make())}); + + root.alleles(Arrays.asList(Aref, C, T, G)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Aref, C)).make(), + root.alleles(Arrays.asList(Aref, T)).make(), + root.alleles(Arrays.asList(Aref, G)).make())}); + + final Allele C = Allele.create("C"); + final Allele CA = Allele.create("CA"); + final Allele CAA = Allele.create("CAA"); + final Allele CAAAA = Allele.create("CAAAA"); + final Allele CAAAAA = Allele.create("CAAAAA"); + final Allele Cref = Allele.create("C", true); + final Allele CAref = Allele.create("CA", true); + final Allele CAAref = Allele.create("CAA", true); + final Allele CAAAref = Allele.create("CAAA", true); + + root.alleles(Arrays.asList(Cref, CA, CAA)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Cref, CA)).make(), + root.alleles(Arrays.asList(Cref, CAA)).make())}); + + root.alleles(Arrays.asList(CAAref, C, CA)).stop(12); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(CAAref, C)).make(), + root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); + + root.alleles(Arrays.asList(CAAAref, C, CA, CAA)).stop(13); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(CAAAref, C)).make(), + root.alleles(Arrays.asList(CAAref, C)).stop(12).make(), + root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); + + root.alleles(Arrays.asList(CAAAref, CAAAAA, CAAAA, CAA, C)).stop(13); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Cref, CAA)).stop(10).make(), + root.alleles(Arrays.asList(Cref, CA)).stop(10).make(), + 
root.alleles(Arrays.asList(CAref, C)).stop(11).make(), + root.alleles(Arrays.asList(CAAAref, C)).stop(13).make())}); + + final Allele threeCopies = Allele.create("GTTTTATTTTATTTTA", true); + final Allele twoCopies = Allele.create("GTTTTATTTTA", true); + final Allele zeroCopies = Allele.create("G", false); + final Allele oneCopies = Allele.create("GTTTTA", false); + tests.add(new Object[]{root.alleles(Arrays.asList(threeCopies, zeroCopies, oneCopies)).stop(25).make(), + Arrays.asList( + root.alleles(Arrays.asList(threeCopies, zeroCopies)).stop(25).make(), + root.alleles(Arrays.asList(twoCopies, zeroCopies)).stop(20).make())}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "SplitBiallelics") + public void testSplitBiallelicsNoGenotypes(final VariantContext vc, final List expectedBiallelics) { + final List biallelics = GATKVariantContextUtils.splitVariantContextToBiallelics(vc); + Assert.assertEquals(biallelics.size(), expectedBiallelics.size()); + for ( int i = 0; i < biallelics.size(); i++ ) { + final VariantContext actual = biallelics.get(i); + final VariantContext expected = expectedBiallelics.get(i); + assertVariantContextsAreEqual(actual, expected); + } + } + + @Test(enabled = !DEBUG, dataProvider = "SplitBiallelics", dependsOnMethods = "testSplitBiallelicsNoGenotypes") + public void testSplitBiallelicsGenotypes(final VariantContext vc, final List expectedBiallelics) { + final List genotypes = new ArrayList(); + + int sampleI = 0; + for ( final List alleles : Utils.makePermutations(vc.getAlleles(), 2, true) ) { + genotypes.add(GenotypeBuilder.create("sample" + sampleI++, alleles)); + } + genotypes.add(GenotypeBuilder.createMissing("missing", 2)); + + final VariantContext vcWithGenotypes = new VariantContextBuilder(vc).genotypes(genotypes).make(); + + final List biallelics = GATKVariantContextUtils.splitVariantContextToBiallelics(vcWithGenotypes); + for ( int i = 0; i < biallelics.size(); i++ ) { + final VariantContext 
actual = biallelics.get(i); + Assert.assertEquals(actual.getNSamples(), vcWithGenotypes.getNSamples()); // not dropping any samples + + for ( final Genotype inputGenotype : genotypes ) { + final Genotype actualGenotype = actual.getGenotype(inputGenotype.getSampleName()); + Assert.assertNotNull(actualGenotype); + if ( ! vc.isVariant() || vc.isBiallelic() ) + Assert.assertEquals(actualGenotype, vcWithGenotypes.getGenotype(inputGenotype.getSampleName())); + else + Assert.assertTrue(actualGenotype.isNoCall()); + } + } + } + + // -------------------------------------------------------------------------------- + // + // Test repeats + // + // -------------------------------------------------------------------------------- + + private class RepeatDetectorTest extends TestDataProvider { + String ref; + boolean isTrueRepeat; + VariantContext vc; + + private RepeatDetectorTest(boolean isTrueRepeat, String ref, String refAlleleString, String ... altAlleleStrings) { + super(RepeatDetectorTest.class); + this.isTrueRepeat = isTrueRepeat; + this.ref = ref; + + List alleles = new LinkedList(); + final Allele refAllele = Allele.create(refAlleleString, true); + alleles.add(refAllele); + for ( final String altString: altAlleleStrings) { + final Allele alt = Allele.create(altString, false); + alleles.add(alt); + } + + VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, refAllele.length(), alleles); + this.vc = builder.make(); + } + + public String toString() { + return String.format("%s refBases=%s trueRepeat=%b vc=%s", super.toString(), ref, isTrueRepeat, vc); + } + } + + @DataProvider(name = "RepeatDetectorTest") + public Object[][] makeRepeatDetectorTest() { + new RepeatDetectorTest(true, "NAAC", "N", "NA"); + new RepeatDetectorTest(true, "NAAC", "NA", "N"); + new RepeatDetectorTest(false, "NAAC", "NAA", "N"); + new RepeatDetectorTest(false, "NAAC", "N", "NC"); + new RepeatDetectorTest(false, "AAC", "A", "C"); + + // running out of ref bases => false + new 
RepeatDetectorTest(false, "NAAC", "N", "NCAGTA"); + + // complex repeats + new RepeatDetectorTest(true, "NATATATC", "N", "NAT"); + new RepeatDetectorTest(true, "NATATATC", "N", "NATA"); + new RepeatDetectorTest(true, "NATATATC", "N", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N"); + new RepeatDetectorTest(false, "NATATATC", "NATA", "N"); + new RepeatDetectorTest(false, "NATATATC", "NATAT", "N"); + + // multi-allelic + new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATA"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATA"); // two As + new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NATC"); // false + new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NCC"); // false + new RepeatDetectorTest(false, "NATATATC", "NAT", "NATAT", "NCC"); // false + + return RepeatDetectorTest.getTests(RepeatDetectorTest.class); + } + + @Test(enabled = !DEBUG, dataProvider = "RepeatDetectorTest") + public void testRepeatDetectorTest(RepeatDetectorTest cfg) { + + // test alleles are equal + Assert.assertEquals(GATKVariantContextUtils.isTandemRepeat(cfg.vc, cfg.ref.getBytes()), cfg.isTrueRepeat); + } + + @Test(enabled = !DEBUG) + public void testRepeatAllele() { + Allele nullR = Allele.create("A", true); + Allele nullA = Allele.create("A", false); + Allele atc = Allele.create("AATC", false); + Allele atcatc = Allele.create("AATCATC", false); + Allele ccccR = Allele.create("ACCCC", true); + Allele cc = Allele.create("ACC", false); + Allele cccccc = Allele.create("ACCCCCC", false); + Allele gagaR = Allele.create("AGAGA", true); + Allele gagagaga = Allele.create("AGAGAGAGA", false); + + // - / ATC [ref] from 20-22 + String delLoc = "chr1"; + int delLocStart = 20; + int delLocStop = 22; + + // - [ref] / ATC from 20-20 + String insLoc = "chr1"; + int insLocStart = 20; + int insLocStop = 20; + + Pair,byte[]> result; + 
byte[] refBytes = "TATCATCATCGGA".getBytes(); + + Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("ATG".getBytes(), "ATGATGATGATG".getBytes(), true),4); + Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("G".getBytes(), "ATGATGATGATG".getBytes(), true),0); + Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("T".getBytes(), "T".getBytes(), true),1); + Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("AT".getBytes(), "ATGATGATCATG".getBytes(), true),1); + Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("CCC".getBytes(), "CCCCCCCC".getBytes(), true),2); + + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("ATG".getBytes()),3); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("AAA".getBytes()),1); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CACACAC".getBytes()),7); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CACACA".getBytes()),2); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CATGCATG".getBytes()),4); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("AATAATA".getBytes()),7); + + + // A*,ATC, context = ATC ATC ATC : (ATC)3 -> (ATC)4 + VariantContext vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(nullR,atc)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],3); + Assert.assertEquals(result.getFirst().toArray()[1],4); + Assert.assertEquals(result.getSecond().length,3); + + // ATC*,A,ATCATC + vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+3, Arrays.asList(Allele.create("AATC", true),nullA,atcatc)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],3); + Assert.assertEquals(result.getFirst().toArray()[1],2); + 
Assert.assertEquals(result.getFirst().toArray()[2],4); + Assert.assertEquals(result.getSecond().length,3); + + // simple non-tandem deletion: CCCC*, - + refBytes = "TCCCCCCCCATG".getBytes(); + vc = new VariantContextBuilder("foo", delLoc, 10, 14, Arrays.asList(ccccR,nullA)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],8); + Assert.assertEquals(result.getFirst().toArray()[1],4); + Assert.assertEquals(result.getSecond().length,1); + + // CCCC*,CC,-,CCCCCC, context = CCC: (C)7 -> (C)5,(C)3,(C)9 + refBytes = "TCCCCCCCAGAGAGAG".getBytes(); + vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+4, Arrays.asList(ccccR,cc, nullA,cccccc)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],7); + Assert.assertEquals(result.getFirst().toArray()[1],5); + Assert.assertEquals(result.getFirst().toArray()[2],3); + Assert.assertEquals(result.getFirst().toArray()[3],9); + Assert.assertEquals(result.getSecond().length,1); + + // GAGA*,-,GAGAGAGA + refBytes = "TGAGAGAGAGATTT".getBytes(); + vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+4, Arrays.asList(gagaR, nullA,gagagaga)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],5); + Assert.assertEquals(result.getFirst().toArray()[1],3); + Assert.assertEquals(result.getFirst().toArray()[2],7); + Assert.assertEquals(result.getSecond().length,2); + + } + + // -------------------------------------------------------------------------------- + // + // test forward clipping + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "ForwardClippingData") + public Object[][] makeForwardClippingData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input 
data for whatever you might want in your data + tests.add(new Object[]{Arrays.asList("A"), -1}); + tests.add(new Object[]{Arrays.asList(""), -1}); + tests.add(new Object[]{Arrays.asList("A", "C"), -1}); + tests.add(new Object[]{Arrays.asList("AC", "C"), -1}); + tests.add(new Object[]{Arrays.asList("A", "G"), -1}); + tests.add(new Object[]{Arrays.asList("A", "T"), -1}); + tests.add(new Object[]{Arrays.asList("GT", "CA"), -1}); + tests.add(new Object[]{Arrays.asList("GT", "CT"), -1}); + tests.add(new Object[]{Arrays.asList("ACC", "AC"), 0}); + tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), 1}); + tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), 1}); + tests.add(new Object[]{Arrays.asList("ACGC", "ACGA"), 2}); + tests.add(new Object[]{Arrays.asList("ACGC", "AGC"), 0}); + tests.add(new Object[]{Arrays.asList("A", ""), -1}); + for ( int len = 0; len < 50; len++ ) + tests.add(new Object[]{Arrays.asList("A" + new String(Utils.dupBytes((byte)'C', len)), "C"), -1}); + + tests.add(new Object[]{Arrays.asList("A", "T", "C"), -1}); + tests.add(new Object[]{Arrays.asList("AT", "AC", "AG"), 0}); + tests.add(new Object[]{Arrays.asList("AT", "AC", "A"), -1}); + tests.add(new Object[]{Arrays.asList("AT", "AC", "ACG"), 0}); + tests.add(new Object[]{Arrays.asList("AC", "AC", "ACG"), 0}); + tests.add(new Object[]{Arrays.asList("AC", "ACT", "ACG"), 0}); + tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGTA"), 1}); + tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGCA"), 1}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "ForwardClippingData") + public void testForwardClipping(final List alleleStrings, final int expectedClip) { + final List alleles = new LinkedList(); + for ( final String alleleString : alleleStrings ) + alleles.add(Allele.create(alleleString)); + + for ( final List myAlleles : Utils.makePermutations(alleles, alleles.size(), false)) { + final int actual = 
GATKVariantContextUtils.computeForwardClipping(myAlleles); + Assert.assertEquals(actual, expectedClip); + } + } + + @DataProvider(name = "ClipAlleleTest") + public Object[][] makeClipAlleleTest() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{Arrays.asList("ACC", "AC"), Arrays.asList("AC", "A"), 0}); + tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), Arrays.asList("GC", "G"), 2}); + tests.add(new Object[]{Arrays.asList("ACGC", "ACGA"), Arrays.asList("C", "A"), 3}); + tests.add(new Object[]{Arrays.asList("ACGC", "AGC"), Arrays.asList("AC", "A"), 0}); + tests.add(new Object[]{Arrays.asList("AT", "AC", "AG"), Arrays.asList("T", "C", "G"), 1}); + tests.add(new Object[]{Arrays.asList("AT", "AC", "ACG"), Arrays.asList("T", "C", "CG"), 1}); + tests.add(new Object[]{Arrays.asList("AC", "ACT", "ACG"), Arrays.asList("C", "CT", "CG"), 1}); + tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGTA"), Arrays.asList("G", "GT", "GTA"), 2}); + tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGCA"), Arrays.asList("G", "GT", "GCA"), 2}); + + // trims from left and right + tests.add(new Object[]{Arrays.asList("ACGTT", "ACCTT"), Arrays.asList("G", "C"), 2}); + tests.add(new Object[]{Arrays.asList("ACGTT", "ACCCTT"), Arrays.asList("G", "CC"), 2}); + tests.add(new Object[]{Arrays.asList("ACGTT", "ACGCTT"), Arrays.asList("G", "GC"), 2}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "ClipAlleleTest") + public void testClipAlleles(final List alleleStrings, final List expected, final int numLeftClipped) { + final int start = 10; + final VariantContext unclipped = GATKVariantContextUtils.makeFromAlleles("test", "20", start, alleleStrings); + final VariantContext clipped = GATKVariantContextUtils.trimAlleles(unclipped, true, true); + + Assert.assertEquals(clipped.getStart(), unclipped.getStart() + numLeftClipped); + for 
( int i = 0; i < unclipped.getAlleles().size(); i++ ) { + final Allele trimmed = clipped.getAlleles().get(i); + Assert.assertEquals(trimmed.getBaseString(), expected.get(i)); + } + } + + // -------------------------------------------------------------------------------- + // + // test primitive allele splitting + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "PrimitiveAlleleSplittingData") + public Object[][] makePrimitiveAlleleSplittingData() { + List tests = new ArrayList<>(); + + // no split + tests.add(new Object[]{"A", "C", 0, null}); + tests.add(new Object[]{"A", "AC", 0, null}); + tests.add(new Object[]{"AC", "A", 0, null}); + + // one split + tests.add(new Object[]{"ACA", "GCA", 1, Arrays.asList(0)}); + tests.add(new Object[]{"ACA", "AGA", 1, Arrays.asList(1)}); + tests.add(new Object[]{"ACA", "ACG", 1, Arrays.asList(2)}); + + // two splits + tests.add(new Object[]{"ACA", "GGA", 2, Arrays.asList(0, 1)}); + tests.add(new Object[]{"ACA", "GCG", 2, Arrays.asList(0, 2)}); + tests.add(new Object[]{"ACA", "AGG", 2, Arrays.asList(1, 2)}); + + // three splits + tests.add(new Object[]{"ACA", "GGG", 3, Arrays.asList(0, 1, 2)}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "PrimitiveAlleleSplittingData") + public void testPrimitiveAlleleSplitting(final String ref, final String alt, final int expectedSplit, final List variantPositions) { + + final int start = 10; + final VariantContext vc = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, alt)); + + final List result = GATKVariantContextUtils.splitIntoPrimitiveAlleles(vc); + + if ( expectedSplit > 0 ) { + Assert.assertEquals(result.size(), expectedSplit); + for ( int i = 0; i < variantPositions.size(); i++ ) { + Assert.assertEquals(result.get(i).getStart(), start + variantPositions.get(i)); + } + } else { + Assert.assertEquals(result.size(), 1); + Assert.assertEquals(vc, 
result.get(0)); + } + } + + // -------------------------------------------------------------------------------- + // + // test allele remapping + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "AlleleRemappingData") + public Object[][] makeAlleleRemappingData() { + List tests = new ArrayList<>(); + + final Allele originalBase1 = Allele.create((byte)'A'); + final Allele originalBase2 = Allele.create((byte)'T'); + + for ( final byte base1 : BaseUtils.BASES ) { + for ( final byte base2 : BaseUtils.BASES ) { + for ( final int numGenotypes : Arrays.asList(0, 1, 2, 5) ) { + Map map = new HashMap<>(2); + map.put(originalBase1, Allele.create(base1)); + map.put(originalBase2, Allele.create(base2)); + + tests.add(new Object[]{map, numGenotypes}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "AlleleRemappingData") + public void testAlleleRemapping(final Map alleleMap, final int numGenotypes) { + + final GATKVariantContextUtils.AlleleMapper alleleMapper = new GATKVariantContextUtils.AlleleMapper(alleleMap); + + final GenotypesContext originalGC = createGenotypesContext(numGenotypes, new ArrayList(alleleMap.keySet())); + + final GenotypesContext remappedGC = GATKVariantContextUtils.updateGenotypesWithMappedAlleles(originalGC, alleleMapper); + + for ( int i = 0; i < numGenotypes; i++ ) { + final Genotype originalG = originalGC.get(String.format("%d", i)); + final Genotype remappedG = remappedGC.get(String.format("%d", i)); + + Assert.assertEquals(originalG.getAlleles().size(), remappedG.getAlleles().size()); + for ( int j = 0; j < originalG.getAlleles().size(); j++ ) + Assert.assertEquals(remappedG.getAllele(j), alleleMap.get(originalG.getAllele(j))); + } + } + + private static GenotypesContext createGenotypesContext(final int numGenotypes, final List alleles) { + GenomeAnalysisEngine.resetRandomGenerator(); + final Random random = 
GenomeAnalysisEngine.getRandomGenerator(); + + final GenotypesContext gc = GenotypesContext.create(); + for ( int i = 0; i < numGenotypes; i++ ) { + // choose alleles at random + final List myAlleles = new ArrayList(); + myAlleles.add(alleles.get(random.nextInt(2))); + myAlleles.add(alleles.get(random.nextInt(2))); + + final Genotype g = new GenotypeBuilder(String.format("%d", i)).alleles(myAlleles).make(); + gc.add(g); + } + + return gc; + } + + // -------------------------------------------------------------------------------- + // + // Test subsetDiploidAlleles + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "subsetDiploidAllelesData") + public Object[][] makesubsetDiploidAllelesData() { + List tests = new ArrayList<>(); + + final Allele A = Allele.create("A", true); + final Allele C = Allele.create("C"); + final Allele G = Allele.create("G"); + + final List AA = Arrays.asList(A,A); + final List AC = Arrays.asList(A,C); + final List CC = Arrays.asList(C,C); + final List AG = Arrays.asList(A,G); + final List CG = Arrays.asList(C,G); + final List GG = Arrays.asList(G,G); + final List ACG = Arrays.asList(A,C,G); + + final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make(); + + final double[] homRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.09, 0.01}); + final double[] hetPL = MathUtils.normalizeFromRealSpace(new double[]{0.09, 0.9, 0.01}); + final double[] homVarPL = MathUtils.normalizeFromRealSpace(new double[]{0.01, 0.09, 0.9}); + final double[] uninformative = new double[]{0, 0, 0}; + + final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(50).make(); + + // make sure we don't screw up the simple case + final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); + final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); + final Genotype ccGT = new 
GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make(); + + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), AC, Arrays.asList(new GenotypeBuilder(aaGT).make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), AC, Arrays.asList(new GenotypeBuilder(acGT).make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(ccGT).make(), AC, Arrays.asList(new GenotypeBuilder(ccGT).make())}); + + // uninformative test case + final Genotype uninformativeGT = new GenotypeBuilder(base).alleles(CC).PL(uninformative).GQ(0).make(); + final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noPL().noGQ().make(); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), AC, Arrays.asList(emptyGT)}); + + // actually subsetting down from multiple alt values + final double[] homRef3AllelesPL = new double[]{0, -10, -20, -30, -40, -50}; + final double[] hetRefC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50}; + final double[] homC3AllelesPL = new double[]{-20, -10, 0, -30, -40, -50}; + final double[] hetRefG3AllelesPL = new double[]{-20, -10, -30, 0, -40, -50}; + final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG + final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(homRef3AllelesPL).make()).make(), + AC, + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(hetRefC3AllelesPL).make()).make(), + AC, + Arrays.asList(new GenotypeBuilder(base).alleles(AC).PL(new double[]{-10, 0, -20}).GQ(100).make())}); + + 
tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(homC3AllelesPL).make()).make(), + AC, + Arrays.asList(new GenotypeBuilder(base).alleles(CC).PL(new double[]{-20, -10, 0}).GQ(100).make())}); + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(hetRefG3AllelesPL).make()).make(), + AG, + Arrays.asList(new GenotypeBuilder(base).alleles(AG).PL(new double[]{-20, 0, -50}).GQ(200).make())}); + + // wow, scary -- bad output but discussed with Eric and we think this is the only thing that can be done + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(hetCG3AllelesPL).make()).make(), + AG, + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).GQ(200).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(homG3AllelesPL).make()).make(), + AG, + Arrays.asList(new GenotypeBuilder(base).alleles(GG).PL(new double[]{-20, -40, 0}).GQ(200).make())}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "subsetDiploidAllelesData") + public void testsubsetDiploidAllelesData(final VariantContext inputVC, + final List allelesToUse, + final List expectedGenotypes) { + final GenotypesContext actual = GATKVariantContextUtils.subsetDiploidAlleles(inputVC, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); + + Assert.assertEquals(actual.size(), expectedGenotypes.size()); + for ( final Genotype expected : expectedGenotypes ) { + final Genotype actualGT = actual.get(expected.getSampleName()); + Assert.assertNotNull(actualGT); + assertGenotypesAreEqual(actualGT, expected); + } + } + + @DataProvider(name = "UpdateGenotypeAfterSubsettingData") + public Object[][] makeUpdateGenotypeAfterSubsettingData() { + List 
tests = new ArrayList(); + + final Allele A = Allele.create("A", true); + final Allele C = Allele.create("C"); + final Allele G = Allele.create("G"); + + final List AA = Arrays.asList(A,A); + final List AC = Arrays.asList(A,C); + final List CC = Arrays.asList(C,C); + final List AG = Arrays.asList(A,G); + final List CG = Arrays.asList(C,G); + final List GG = Arrays.asList(G,G); + final List ACG = Arrays.asList(A,C,G); + final List> allSubsetAlleles = Arrays.asList(AC,AG,ACG); + + final double[] homRefPL = new double[]{0.9, 0.09, 0.01}; + final double[] hetPL = new double[]{0.09, 0.9, 0.01}; + final double[] homVarPL = new double[]{0.01, 0.09, 0.9}; + final double[] uninformative = new double[]{0.33, 0.33, 0.33}; + final List allPLs = Arrays.asList(homRefPL, hetPL, homVarPL, uninformative); + + for ( final List alleles : allSubsetAlleles ) { + for ( final double[] pls : allPLs ) { + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL, pls, AA, alleles, GATKVariantContextUtils.NO_CALL_ALLELES}); + } + } + + for ( final List originalGT : Arrays.asList(AA, AC, CC, AG, CG, GG) ) { + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homRefPL, originalGT, AC, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, hetPL, originalGT, AC, AC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homVarPL, originalGT, AC, CC}); +// tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, uninformative, AA, AC, GATKVariantContextUtils.NO_CALL_ALLELES}); + } + + for ( final double[] pls : allPLs ) { + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, AC, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, AC, AC}); + tests.add(new 
Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, AC, CC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, AC, AC}); + + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, AG, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, AG, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, AG, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, AG, AG}); + + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, ACG, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, ACG, AC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, ACG, CC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AG, ACG, AG}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, ACG, CG}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, GG, ACG, GG}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "UpdateGenotypeAfterSubsettingData") + public void testUpdateGenotypeAfterSubsetting(final GATKVariantContextUtils.GenotypeAssignmentMethod mode, + final double[] likelihoods, + final List originalGT, + final List allelesToUse, + final List expectedAlleles) { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + final double[] log10Likelhoods = MathUtils.normalizeFromLog10(likelihoods, true, false); + GATKVariantContextUtils.updateGenotypeAfterSubsetting(originalGT, gb, mode, log10Likelhoods, 
allelesToUse); + final Genotype g = gb.make(); + Assert.assertEquals(new HashSet<>(g.getAlleles()), new HashSet<>(expectedAlleles)); + } + + @Test(enabled = !DEBUG) + public void testSubsetToRef() { + final Map tests = new LinkedHashMap<>(); + + for ( final List alleles : Arrays.asList(Arrays.asList(Aref), Arrays.asList(C), Arrays.asList(Aref, C), Arrays.asList(Aref, C, C) ) ) { + for ( final String name : Arrays.asList("test1", "test2") ) { + final GenotypeBuilder builder = new GenotypeBuilder(name, alleles); + builder.DP(10); + builder.GQ(30); + builder.AD(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? new int[]{1, 2} : new int[]{1, 2, 3})); + builder.PL(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? new int[]{1,2} : new int[]{1,2,3})); + final List refs = Collections.nCopies(alleles.size(), Aref); + tests.put(builder.make(), builder.alleles(refs).noAD().noPL().make()); + } + } + + for ( final int n : Arrays.asList(1, 2, 3) ) { + for ( final List genotypes : Utils.makePermutations(new ArrayList<>(tests.keySet()), n, false) ) { + final VariantContext vc = new VariantContextBuilder("test", "20", 1, 1, Arrays.asList(Aref, C)).genotypes(genotypes).make(); + final GenotypesContext gc = GATKVariantContextUtils.subsetToRefOnly(vc, 2); + + Assert.assertEquals(gc.size(), genotypes.size()); + for ( int i = 0; i < genotypes.size(); i++ ) { +// logger.warn("Testing " + genotypes.get(i) + " => " + gc.get(i) + " " + tests.get(genotypes.get(i))); + assertGenotypesAreEqual(gc.get(i), tests.get(genotypes.get(i))); + } + } + } + } + + // -------------------------------------------------------------------------------- + // + // Test updatePLsAndAD + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "updatePLsAndADData") + public Object[][] makeUpdatePLsAndADData() { + List tests = new ArrayList<>(); + + final Allele A = Allele.create("A", true); + final Allele C = Allele.create("C"); + final 
Allele G = Allele.create("G"); + + final List AA = Arrays.asList(A,A); + final List AC = Arrays.asList(A,C); + final List CC = Arrays.asList(C,C); + final List AG = Arrays.asList(A,G); + final List CG = Arrays.asList(C,G); + final List GG = Arrays.asList(G,G); + final List ACG = Arrays.asList(A,C,G); + + final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make(); + + final double[] homRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.09, 0.01}); + final double[] hetPL = MathUtils.normalizeFromRealSpace(new double[]{0.09, 0.9, 0.01}); + final double[] homVarPL = MathUtils.normalizeFromRealSpace(new double[]{0.01, 0.09, 0.9}); + final double[] uninformative = new double[]{0, 0, 0}; + + final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(100).make(); + + // make sure we don't screw up the simple case where no selection happens + final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); + final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); + final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make(); + + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(aaGT).make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(acGT).make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(ccGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(ccGT).make())}); + + // uninformative test cases + final Genotype uninformativeGT = new GenotypeBuilder(base).alleles(CC).noAD().PL(uninformative).GQ(0).make(); + tests.add(new Object[]{new 
VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(uninformativeGT)}); + final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noAD().noPL().noGQ().make(); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(emptyGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(emptyGT)}); + + // actually subsetting down from multiple alt values + final double[] homRef3AllelesPL = new double[]{0, -10, -20, -30, -40, -50}; + final double[] hetRefC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50}; + final double[] homC3AllelesPL = new double[]{-20, -10, 0, -30, -40, -50}; + final double[] hetRefG3AllelesPL = new double[]{-20, -10, -30, 0, -40, -50}; + final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG + final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG + + final int[] homRef3AllelesAD = new int[]{20, 0, 1}; + final int[] hetRefC3AllelesAD = new int[]{10, 10, 1}; + final int[] homC3AllelesAD = new int[]{0, 20, 1}; + final int[] hetRefG3AllelesAD = new int[]{10, 0, 11}; + final int[] hetCG3AllelesAD = new int[]{0, 12, 11}; // AA, AC, CC, AG, CG, GG + final int[] homG3AllelesAD = new int[]{0, 1, 21}; // AA, AC, CC, AG, CG, GG + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homRef3AllelesAD).PL(homRef3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).AD(new int[]{20, 0}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefC3AllelesAD).PL(hetRefC3AllelesPL).make()).make(), + new 
VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-10, 0, -20}).AD(new int[]{10, 10}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homC3AllelesAD).PL(homC3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -10, 0}).AD(new int[]{0, 20}).GQ(100).make())}); + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefG3AllelesAD).PL(hetRefG3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AG).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, 0, -50}).AD(new int[]{10, 11}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetCG3AllelesAD).PL(hetCG3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AG).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).AD(new int[]{0, 11}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homG3AllelesAD).PL(homG3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AG).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -40, 0}).AD(new int[]{0, 21}).GQ(100).make())}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "updatePLsAndADData") + public void testUpdatePLsAndADData(final VariantContext originalVC, + final VariantContext selectedVC, + final List expectedGenotypes) { + final VariantContext selectedVCwithGTs = new VariantContextBuilder(selectedVC).genotypes(originalVC.getGenotypes()).make(); + final 
GenotypesContext actual = GATKVariantContextUtils.updatePLsAndAD(selectedVCwithGTs, originalVC); + + Assert.assertEquals(actual.size(), expectedGenotypes.size()); + for ( final Genotype expected : expectedGenotypes ) { + final Genotype actualGT = actual.get(expected.getSampleName()); + Assert.assertNotNull(actualGT); + assertGenotypesAreEqual(actualGT, expected); + } + } + + // -------------------------------------------------------------------------------- + // + // Test methods for merging reference confidence VCs + // + // -------------------------------------------------------------------------------- + + + @Test(dataProvider = "indexOfAlleleData") + public void testIndexOfAllele(final Allele reference, final List altAlleles, final List otherAlleles) { + final List alleles = new ArrayList<>(altAlleles.size() + 1); + alleles.add(reference); + alleles.addAll(altAlleles); + final VariantContext vc = makeVC("Source", alleles); + + for (int i = 0; i < alleles.size(); i++) { + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,true,true),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,true,true),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,true,false),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,true,false),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i),true),true,true,true),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i),true),true,true,false),-1); + if (i == 0) { + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,false,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,false,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,false,false),-1); + 
Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,false,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i).getBases(),true),false,true,true),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i).getBases(),false),false,true,true),-1); + } else { + Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,alleles.get(i),true),i - 1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,alleles.get(i),false), i - 1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,Allele.create(alleles.get(i),true),true),i-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,Allele.create(alleles.get(i),true),false),-1); + } + } + + for (final Allele other : otherAlleles) { + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc, other, true, true, true), -1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,false,true,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,true,true,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,false,true,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,true,false,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,false,false,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,true,false,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc, other, false, false, false),-1); + } + } + + @DataProvider(name = "indexOfAlleleData") + public Iterator indexOfAlleleData() { + + final Allele[] ALTERNATIVE_ALLELES = new Allele[] { T, C, G, ATC, ATCATC}; + + final int lastMask = 0x1F; + + return new Iterator() { + + int nextMask = 0; + + @Override + public boolean hasNext() { + return nextMask <= lastMask; + } + + @Override + public Object[] next() { + + int mask = nextMask++; + final List 
includedAlleles = new ArrayList<>(5); + final List excludedAlleles = new ArrayList<>(5); + for (int i = 0; i < ALTERNATIVE_ALLELES.length; i++) { + ((mask & 1) == 1 ? includedAlleles : excludedAlleles).add(ALTERNATIVE_ALLELES[i]); + mask >>= 1; + } + return new Object[] { Aref , includedAlleles, excludedAlleles}; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + + @Test(dataProvider = "generatePLsData") + public void testGeneratePLs(final int numOriginalAlleles, final int[] indexOrdering) { + + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(numOriginalAlleles, 2); + final int[] PLs = new int[numLikelihoods]; + for ( int i = 0; i < numLikelihoods; i++ ) + PLs[i] = i; + + final List alleles = new ArrayList<>(numOriginalAlleles); + alleles.add(Allele.create("A", true)); + for ( int i = 1; i < numOriginalAlleles; i++ ) + alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); + final Genotype genotype = new GenotypeBuilder("foo", alleles).PL(PLs).make(); + + final int[] newPLs = GATKVariantContextUtils.generatePLs(genotype, indexOrdering); + + Assert.assertEquals(newPLs.length, numLikelihoods); + + final int[] expectedPLs = new int[numLikelihoods]; + for ( int i = 0; i < numOriginalAlleles; i++ ) { + for ( int j = i; j < numOriginalAlleles; j++ ) { + final int index = GenotypeLikelihoods.calculatePLindex(i, j); + final int value = GATKVariantContextUtils.calculatePLindexFromUnorderedIndexes(indexOrdering[i], indexOrdering[j]); + expectedPLs[index] = value; + } + } + + for ( int i = 0; i < numLikelihoods; i++ ) { + Assert.assertEquals(newPLs[i], expectedPLs[i]); + } + } + + @Test(dataProvider = "referenceConfidenceMergeData") + public void testReferenceConfidenceMerge(final String testID, final List toMerge, final GenomeLoc loc, final boolean returnSiteEvenIfMonomorphic, final VariantContext expectedResult) { + final VariantContext result = 
GATKVariantContextUtils.referenceConfidenceMerge(toMerge, loc, returnSiteEvenIfMonomorphic ? (byte) 'A' : null, true); + if ( result == null ) { + Assert.assertTrue(expectedResult == null); + return; + } + Assert.assertEquals(result.getAlleles(), expectedResult.getAlleles(),testID); + Assert.assertEquals(result.getNSamples(), expectedResult.getNSamples(),testID); + for ( final Genotype expectedGenotype : expectedResult.getGenotypes() ) { + Assert.assertTrue(result.hasGenotype(expectedGenotype.getSampleName()), "Missing " + expectedGenotype.getSampleName()); + // use string comparisons to test equality for now + Assert.assertEquals(result.getGenotype(expectedGenotype.getSampleName()).toString(), expectedGenotype.toString()); + } + } + + @Test + public void testGenerateADWithNewAlleles() { + + final int[] originalAD = new int[] {1,2,0}; + final int[] indexesOfRelevantAlleles = new int[] {0,1,2,2}; + + final int[] newAD = GATKVariantContextUtils.generateAD(originalAD, indexesOfRelevantAlleles); + Assert.assertEquals(newAD, new int[]{1,2,0,0}); + } + + + @Test(expectedExceptions = UserException.class) + public void testGetIndexesOfRelevantAllelesWithNoALT() { + + final List alleles1 = new ArrayList<>(1); + alleles1.add(Allele.create("A", true)); + final List alleles2 = new ArrayList<>(1); + alleles2.add(Allele.create("A", true)); + GATKVariantContextUtils.getIndexesOfRelevantAlleles(alleles1, alleles2, -1); + Assert.fail("We should have thrown an exception because the allele was not present"); + } + + @Test(dataProvider = "getIndexesOfRelevantAllelesData") + public void testGetIndexesOfRelevantAlleles(final int allelesIndex, final List allAlleles) { + final List myAlleles = new ArrayList<>(3); + + // always add the reference and alleles + myAlleles.add(allAlleles.get(0)); + myAlleles.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + // optionally add another alternate allele + if ( allelesIndex > 0 ) + myAlleles.add(allAlleles.get(allelesIndex)); + + final int[] 
indexes = GATKVariantContextUtils.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1); + + Assert.assertEquals(indexes.length, allAlleles.size()); + + for ( int i = 0; i < allAlleles.size(); i++ ) { + if ( i == 0 ) + Assert.assertEquals(indexes[i], 0); // ref should always match + else if ( i == allelesIndex ) + Assert.assertEquals(indexes[i], 2); // allele + else + Assert.assertEquals(indexes[i], 1); // + } + } + + + @DataProvider(name = "getIndexesOfRelevantAllelesData") + public Object[][] makeGetIndexesOfRelevantAllelesData() { + final int totalAlleles = 5; + final List alleles = new ArrayList<>(totalAlleles); + alleles.add(Allele.create("A", true)); + for ( int i = 1; i < totalAlleles; i++ ) + alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); + + final List tests = new ArrayList<>(); + + for ( int alleleIndex = 0; alleleIndex < totalAlleles; alleleIndex++ ) { + tests.add(new Object[]{alleleIndex, alleles}); + } + + return tests.toArray(new Object[][]{}); + } + + @DataProvider(name = "referenceConfidenceMergeData") + public Object[][] makeReferenceConfidenceMergeData() { + final List tests = new ArrayList<>(); + final int start = 10; + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start); + final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make(); + final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make(); + + final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73}; + final int[] reorderedSecondAllelePLs = new int[]{30, 71, 73, 20, 72, 10}; + + final List noCalls = new ArrayList<>(2); + noCalls.add(Allele.NO_CALL); + noCalls.add(Allele.NO_CALL); + + final List A_ALT = Arrays.asList(Aref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).alleles(noCalls).make(); + final VariantContext vcA_ALT = new 
VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make(); + final Allele AAref = Allele.create("AA", true); + final List AA_ALT = Arrays.asList(AAref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).alleles(noCalls).make(); + final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make(); + final List A_C = Arrays.asList(Aref, C); + final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10}).alleles(noCalls).make(); + final List A_C_ALT = Arrays.asList(Aref, C, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make(); + final List A_G_ALT = Arrays.asList(Aref, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make(); + final List A_C_G = Arrays.asList(Aref, C, G); + final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make(); + final List A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).alleles(noCalls).make(); + final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make(); + final List A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcA_ATC_ALT = new 
VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make(); + final Allele A = Allele.create("A", false); + final List AA_A_ALT = Arrays.asList(AAref, A, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make(); + + // first test the case of a single record + tests.add(new Object[]{"test00",Arrays.asList(vcA_C_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make()}); + + // now, test pairs: + // a SNP with another SNP + tests.add(new Object[]{"test01",Arrays.asList(vcA_C_ALT, vcA_G_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); + // a SNP with an indel + tests.add(new Object[]{"test02",Arrays.asList(vcA_C_ALT, vcA_ATC_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); + // a SNP with 2 SNPs + tests.add(new Object[]{"test03",Arrays.asList(vcA_C_ALT, vcA_C_G_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()}); + // a SNP with a ref record + tests.add(new Object[]{"test04",Arrays.asList(vcA_C_ALT, vcA_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()}); + + // spanning records: + // a SNP with a spanning ref record + tests.add(new Object[]{"test05",Arrays.asList(vcA_C_ALT, vcAA_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()}); + // a SNP with a spanning deletion + tests.add(new Object[]{"test06",Arrays.asList(vcA_C_ALT, vcAA_A_ALT), + loc, false, + new 
VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).alleles(noCalls).make()).make()}); + + // combination of all + tests.add(new Object[]{"test07",Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).alleles(noCalls).make(), + new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).alleles(noCalls).make(), + new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).alleles(noCalls).make(), + new GenotypeBuilder("A_C_G").PL(new int[]{40,20,30,20,10,30,71,72,73,74}).alleles(noCalls).make(), + new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000}).alleles(noCalls).make(), + new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).alleles(noCalls).make(), + new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).alleles(noCalls).make()).make()}); + + // just spanning ref contexts, trying both instances where we want/do not want ref-only contexts + tests.add(new Object[]{"test08",Arrays.asList(vcAA_ALT), + + loc, false, + null}); + tests.add(new Object[]{"test09", Arrays.asList(vcAA_ALT), + loc, true, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).alleles(noCalls).make()).make()}); + + final Object[][] result = tests.toArray(new Object[][]{}); + return result; + } + + @DataProvider(name = "generatePLsData") + public Object[][] makeGeneratePLsData() { + final List tests = new ArrayList<>(); + + for ( int originalAlleles = 2; originalAlleles <= 5; originalAlleles++ ) { + for ( int swapPosition1 = 0; swapPosition1 < originalAlleles; swapPosition1++ ) { + for ( int 
swapPosition2 = swapPosition1+1; swapPosition2 < originalAlleles; swapPosition2++ ) { + final int[] indexes = new int[originalAlleles]; + for ( int i = 0; i < originalAlleles; i++ ) + indexes[i] = i; + indexes[swapPosition1] = swapPosition2; + indexes[swapPosition2] = swapPosition1; + tests.add(new Object[]{originalAlleles, indexes}); + } + } + } + return tests.toArray(new Object[][]{}); + } +} + diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/VCFIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/VCFIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/variant/VCFIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/VCFIntegrationTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java new file mode 100644 index 000000000..bb794dca4 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java @@ -0,0 +1,377 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.variant; + +import com.google.caliper.Param; +import com.google.caliper.SimpleBenchmark; +import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFCodec; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Caliper microbenchmark of parsing a VCF file + */ +public class VariantContextBenchmark extends SimpleBenchmark { + @Param({"/Users/depristo/Desktop/broadLocal/localData/ALL.chr20.merged_beagle_mach.20101123.snps_indels_svs.genotypes.vcf"}) + String vcfFile; + + @Param({"1000"}) + int linesToRead; // set automatically by framework + + @Param({"100"}) + int nSamplesToTake; // set automatically by framework + + @Param({"10"}) + int dupsToMerge; // set automatically by framework + + @Param + Operation operation; // set automatically by framework + + private String INPUT_STRING; + + public enum Operation { + READ, + SUBSET_TO_SAMPLES, + GET_TYPE, + GET_ID, + GET_GENOTYPES, + GET_ATTRIBUTE_STRING, + GET_ATTRIBUTE_INT, + GET_N_SAMPLES, + GET_GENOTYPES_FOR_SAMPLES, + GET_GENOTYPES_IN_ORDER_OF_NAME, + CALC_GENOTYPE_COUNTS, + MERGE + } + + @Override protected void setUp() { + // TODO -- update for new tribble interface +// try { +// ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.b37KGReference)); +// b37GenomeLocParser = new 
GenomeLocParser(seq); +// } catch ( FileNotFoundException e) { +// throw new RuntimeException(e); +// } +// +// // read it into a String so that we don't try to benchmark IO issues +// try { +// FileInputStream s = new FileInputStream(new File(vcfFile)); +// AsciiLineReader lineReader = new AsciiLineReader(s); +// int counter = 0; +// StringBuffer sb = new StringBuffer(); +// while (counter++ < linesToRead ) { +// String line = lineReader.readLine(); +// if ( line == null ) +// break; +// sb.append(line + "\n"); +// } +// s.close(); +// INPUT_STRING = sb.toString(); +// } catch (IOException e) { +// throw new RuntimeException(e); +// } + } + + private interface FunctionToBenchmark { + public void run(T vc); + } + + private void runBenchmark(FeatureCodec codec, FunctionToBenchmark func) { + // TODO -- update for new Tribble interface +// try { +// InputStream is = new ByteArrayInputStream(INPUT_STRING.getBytes()); +// AsciiLineReader lineReader = new AsciiLineReader(is); +// codec.readHeader(lineReader); +// +// int counter = 0; +// while (counter++ < linesToRead ) { +// String line = lineReader.readLine(); +// if ( line == null ) +// break; +// +// T vc = codec.decode(line); +// func.run(vc); +// } +// } catch (Exception e) { +// System.out.println("Benchmarking run failure because of " + e.getMessage()); +// } + } + + public void timeV14(int rep) { + for ( int i = 0; i < rep; i++ ) { + FunctionToBenchmark func = getV14FunctionToBenchmark(); + final VCFCodec codec = new VCFCodec(); + runBenchmark(codec, func); + } + } + + public FunctionToBenchmark getV14FunctionToBenchmark() { + switch ( operation ) { + case READ: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + ; // empty operation + } + }; + case SUBSET_TO_SAMPLES: + return new FunctionToBenchmark() { + Set samples; + public void run(final VariantContext vc) { + if ( samples == null ) + samples = new HashSet<>(new ArrayList<>(vc.getSampleNames()).subList(0, nSamplesToTake)); + 
VariantContext sub = vc.subContextFromSamples(samples); + sub.getNSamples(); + } + }; + case GET_TYPE: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getType(); + } + }; + case GET_ID: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getID(); + } + }; + case GET_GENOTYPES: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getGenotypes().size(); + } + }; + + case GET_GENOTYPES_FOR_SAMPLES: + return new FunctionToBenchmark() { + Set samples; + public void run(final VariantContext vc) { + if ( samples == null ) + samples = new HashSet<>(new ArrayList<>(vc.getSampleNames()).subList(0, nSamplesToTake)); + vc.getGenotypes(samples).size(); + } + }; + + case GET_ATTRIBUTE_STRING: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getAttribute("AN", null); + } + }; + + case GET_ATTRIBUTE_INT: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getAttributeAsInt("AC", 0); + } + }; + + case GET_N_SAMPLES: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getNSamples(); + } + }; + + case GET_GENOTYPES_IN_ORDER_OF_NAME: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + ; // TODO - TEST IS BROKEN +// int n = 0; +// for ( final Genotype g: vc.getGenotypesOrderedByName() ) n++; + } + }; + + case CALC_GENOTYPE_COUNTS: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getHetCount(); + } + }; + + case MERGE: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + List toMerge = new ArrayList<>(); + + for ( int i = 0; i < dupsToMerge; i++ ) { + GenotypesContext gc = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype g : vc.getGenotypes() ) { + gc.add(new GenotypeBuilder(g).name(g.getSampleName()+"_"+i).make()); + } + toMerge.add(new 
VariantContextBuilder(vc).genotypes(gc).make()); + } + + GATKVariantContextUtils.simpleMerge(toMerge, null, + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.UNSORTED, + true, false, "set", false, true); + } + }; + + default: throw new IllegalArgumentException("Unexpected operation " + operation); + } + } + + // -------------------------------------------------------------------------------- + // + // V13 + // + // In order to use this, you must move the v13 version from archive and uncomment + // + // git mv private/archive/java/src/org/broadinstitute/sting/utils/variantcontext/v13 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13 + // + // -------------------------------------------------------------------------------- + +// public void timeV13(int rep) { +// for ( int i = 0; i < rep; i++ ) { +// FunctionToBenchmark func = getV13FunctionToBenchmark(); +// FeatureCodec codec = new org.broadinstitute.variant.variantcontext.v13.VCFCodec(); +// runBenchmark(codec, func); +// } +// } +// +// public FunctionToBenchmark getV13FunctionToBenchmark() { +// switch ( operation ) { +// case READ: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { +// ; // empty operation +// } +// }; +// case SUBSET_TO_SAMPLES: +// return new FunctionToBenchmark() { +// List samples; +// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { +// if ( samples == null ) +// samples = new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake); +// org.broadinstitute.variant.variantcontext.v13.VariantContext sub = vc.subContextFromGenotypes(vc.getGenotypes(samples).values()); +// sub.getNSamples(); +// } +// }; +// +// case GET_TYPE: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { +// vc.getType(); +// } +// }; +// 
case GET_ID: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { +// vc.getID(); +// } +// }; +// case GET_GENOTYPES: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { +// vc.getGenotypes().size(); +// } +// }; +// +// case GET_GENOTYPES_FOR_SAMPLES: +// return new FunctionToBenchmark() { +// Set samples; +// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { +// if ( samples == null ) +// samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); +// vc.getGenotypes(samples).size(); +// } +// }; +// +// case GET_ATTRIBUTE_STRING: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { +// vc.getExtendedAttribute("AN", null); +// } +// }; +// +// case GET_ATTRIBUTE_INT: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { +// vc.getAttributeAsInt("AC", 0); +// } +// }; +// +// case GET_N_SAMPLES: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { +// vc.getNSamples(); +// } +// }; +// +// case GET_GENOTYPES_IN_ORDER_OF_NAME: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { +// ; // TODO - TEST IS BROKEN +// //vc.getGenotypesOrderedByName(); +// } +// }; +// +// case CALC_GENOTYPE_COUNTS: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { +// vc.getHetCount(); +// } +// }; +// +// case MERGE: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { +// List toMerge = new 
ArrayList(); +// +// for ( int i = 0; i < dupsToMerge; i++ ) { +// Map gc = new HashMap(); +// for ( final org.broadinstitute.variant.variantcontext.v13.Genotype g : vc.getGenotypes().values() ) { +// String name = g.getSampleName()+"_"+i; +// gc.put(name, new org.broadinstitute.variant.variantcontext.v13.Genotype(name, +// g.getAlleles(), g.getLog10PError(), g.getFilters(), g.getAttributes(), g.isPhased(), g.getLikelihoods().getAsVector())); +// toMerge.add(org.broadinstitute.variant.variantcontext.v13.VariantContext.modifyGenotypes(vc, gc)); +// } +// } +// +// org.broadinstitute.variant.variantcontext.v13.VariantContextUtils.simpleMerge(b37GenomeLocParser, +// toMerge, null, +// org.broadinstitute.variant.variantcontext.v13.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, +// org.broadinstitute.variant.variantcontext.v13.VariantContextUtils.GenotypeMergeType.UNSORTED, +// true, false, "set", false, true, false); +// } +// }; +// +// default: throw new IllegalArgumentException("Unexpected operation " + operation); +// } +// } + + public static void main(String[] args) { + com.google.caliper.Runner.main(VariantContextBenchmark.class, args); + } +} diff --git a/public/testdata/exampleBAM.bam b/public/gatk-framework/src/test/resources/exampleBAM.bam similarity index 100% rename from public/testdata/exampleBAM.bam rename to public/gatk-framework/src/test/resources/exampleBAM.bam diff --git a/public/testdata/exampleBAM.bam.bai b/public/gatk-framework/src/test/resources/exampleBAM.bam.bai similarity index 100% rename from public/testdata/exampleBAM.bam.bai rename to public/gatk-framework/src/test/resources/exampleBAM.bam.bai diff --git a/public/testdata/exampleBAM.simple.bai b/public/gatk-framework/src/test/resources/exampleBAM.simple.bai similarity index 100% rename from public/testdata/exampleBAM.simple.bai rename to public/gatk-framework/src/test/resources/exampleBAM.simple.bai diff --git a/public/testdata/exampleBAM.simple.bam 
b/public/gatk-framework/src/test/resources/exampleBAM.simple.bam similarity index 100% rename from public/testdata/exampleBAM.simple.bam rename to public/gatk-framework/src/test/resources/exampleBAM.simple.bam diff --git a/public/testdata/exampleDBSNP.vcf b/public/gatk-framework/src/test/resources/exampleDBSNP.vcf similarity index 100% rename from public/testdata/exampleDBSNP.vcf rename to public/gatk-framework/src/test/resources/exampleDBSNP.vcf diff --git a/public/testdata/exampleDBSNP.vcf.idx b/public/gatk-framework/src/test/resources/exampleDBSNP.vcf.idx similarity index 100% rename from public/testdata/exampleDBSNP.vcf.idx rename to public/gatk-framework/src/test/resources/exampleDBSNP.vcf.idx diff --git a/public/testdata/exampleFASTA-3contigs.fasta b/public/gatk-framework/src/test/resources/exampleFASTA-3contigs.fasta similarity index 100% rename from public/testdata/exampleFASTA-3contigs.fasta rename to public/gatk-framework/src/test/resources/exampleFASTA-3contigs.fasta diff --git a/public/testdata/exampleFASTA-combined.fasta b/public/gatk-framework/src/test/resources/exampleFASTA-combined.fasta similarity index 100% rename from public/testdata/exampleFASTA-combined.fasta rename to public/gatk-framework/src/test/resources/exampleFASTA-combined.fasta diff --git a/public/testdata/exampleFASTA-windows.fasta b/public/gatk-framework/src/test/resources/exampleFASTA-windows.fasta similarity index 100% rename from public/testdata/exampleFASTA-windows.fasta rename to public/gatk-framework/src/test/resources/exampleFASTA-windows.fasta diff --git a/public/testdata/exampleFASTA.dict b/public/gatk-framework/src/test/resources/exampleFASTA.dict similarity index 100% rename from public/testdata/exampleFASTA.dict rename to public/gatk-framework/src/test/resources/exampleFASTA.dict diff --git a/public/testdata/exampleFASTA.fasta b/public/gatk-framework/src/test/resources/exampleFASTA.fasta similarity index 100% rename from public/testdata/exampleFASTA.fasta rename to 
public/gatk-framework/src/test/resources/exampleFASTA.fasta diff --git a/public/testdata/exampleFASTA.fasta.amb b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.amb similarity index 100% rename from public/testdata/exampleFASTA.fasta.amb rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.amb diff --git a/public/testdata/exampleFASTA.fasta.ann b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.ann similarity index 100% rename from public/testdata/exampleFASTA.fasta.ann rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.ann diff --git a/public/testdata/exampleFASTA.fasta.bwt b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.bwt similarity index 100% rename from public/testdata/exampleFASTA.fasta.bwt rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.bwt diff --git a/public/testdata/exampleFASTA.fasta.fai b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.fai similarity index 100% rename from public/testdata/exampleFASTA.fasta.fai rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.fai diff --git a/public/testdata/exampleFASTA.fasta.pac b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.pac similarity index 100% rename from public/testdata/exampleFASTA.fasta.pac rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.pac diff --git a/public/testdata/exampleFASTA.fasta.rbwt b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.rbwt similarity index 100% rename from public/testdata/exampleFASTA.fasta.rbwt rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.rbwt diff --git a/public/testdata/exampleFASTA.fasta.rpac b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.rpac similarity index 100% rename from public/testdata/exampleFASTA.fasta.rpac rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.rpac diff --git a/public/testdata/exampleFASTA.fasta.rsa 
b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.rsa similarity index 100% rename from public/testdata/exampleFASTA.fasta.rsa rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.rsa diff --git a/public/testdata/exampleFASTA.fasta.sa b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.sa similarity index 100% rename from public/testdata/exampleFASTA.fasta.sa rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.sa diff --git a/public/testdata/exampleGATKReport.eval b/public/gatk-framework/src/test/resources/exampleGATKReport.eval similarity index 100% rename from public/testdata/exampleGATKReport.eval rename to public/gatk-framework/src/test/resources/exampleGATKReport.eval diff --git a/public/testdata/exampleGATKReportv1.tbl b/public/gatk-framework/src/test/resources/exampleGATKReportv1.tbl similarity index 100% rename from public/testdata/exampleGATKReportv1.tbl rename to public/gatk-framework/src/test/resources/exampleGATKReportv1.tbl diff --git a/public/testdata/exampleGATKReportv2.tbl b/public/gatk-framework/src/test/resources/exampleGATKReportv2.tbl similarity index 100% rename from public/testdata/exampleGATKReportv2.tbl rename to public/gatk-framework/src/test/resources/exampleGATKReportv2.tbl diff --git a/public/testdata/exampleGRP.grp b/public/gatk-framework/src/test/resources/exampleGRP.grp similarity index 100% rename from public/testdata/exampleGRP.grp rename to public/gatk-framework/src/test/resources/exampleGRP.grp diff --git a/public/testdata/exampleINTERVAL.intervals b/public/gatk-framework/src/test/resources/exampleINTERVAL.intervals similarity index 100% rename from public/testdata/exampleINTERVAL.intervals rename to public/gatk-framework/src/test/resources/exampleINTERVAL.intervals diff --git a/public/testdata/exampleNORG.bam b/public/gatk-framework/src/test/resources/exampleNORG.bam similarity index 100% rename from public/testdata/exampleNORG.bam rename to 
public/gatk-framework/src/test/resources/exampleNORG.bam diff --git a/public/testdata/exampleNORG.bam.bai b/public/gatk-framework/src/test/resources/exampleNORG.bam.bai similarity index 100% rename from public/testdata/exampleNORG.bam.bai rename to public/gatk-framework/src/test/resources/exampleNORG.bam.bai diff --git a/public/gatk-framework/src/test/resources/forSimulation.vcf b/public/gatk-framework/src/test/resources/forSimulation.vcf new file mode 100644 index 000000000..a0c57c2c0 --- /dev/null +++ b/public/gatk-framework/src/test/resources/forSimulation.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.1 +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 +20 10000000 . T C . . . GT 0/1 0/0 1/1 +20 10001000 . GG AA . . . GT 0/1 0/0 1/1 +20 10002000 . TAGTA T . . . GT 0/1 0/0 1/1 +20 10003000 . A AGCT . . . GT 0/1 0/0 1/1 +20 10004000 . GAT G,GATAT . . . GT 0/1 0/0 1/1 diff --git a/public/gatk-framework/src/test/resources/forSimulation.vcf.idx b/public/gatk-framework/src/test/resources/forSimulation.vcf.idx new file mode 100644 index 000000000..4f734b7af Binary files /dev/null and b/public/gatk-framework/src/test/resources/forSimulation.vcf.idx differ diff --git a/public/gatk-framework/src/test/resources/testProperties.properties b/public/gatk-framework/src/test/resources/testProperties.properties new file mode 100644 index 000000000..e422d6eb1 --- /dev/null +++ b/public/gatk-framework/src/test/resources/testProperties.properties @@ -0,0 +1,2 @@ +foo=bar +version=1.0 diff --git a/public/testdata/testfile.sam b/public/gatk-framework/src/test/resources/testfile.sam similarity index 100% rename from public/testdata/testfile.sam rename to public/gatk-framework/src/test/resources/testfile.sam diff --git a/public/gatk-package/pom.xml b/public/gatk-package/pom.xml new file mode 100644 index 000000000..877db5dc9 --- /dev/null +++ b/public/gatk-package/pom.xml @@ -0,0 +1,286 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-aggregator + 3.0 + 
../.. + + + gatk-package + jar + GATK Package + + + ${project.basedir}/../.. + prepare-package + package + org.broadinstitute.sting.gatk.CommandLineGATK + GenomeAnalysisTK + + + + + + ${project.groupId} + gatk-framework + ${project.version} + + + + org.broad + tribble + + + + org.broadinstitute + variant + + + + commons-logging + commons-logging + + + + ${project.groupId} + gatk-framework + ${project.version} + example-resources + tar.bz2 + + + + ${project.groupId} + gatk-framework + ${project.version} + test-jar + test + + + + org.testng + testng + test + + + + com.google.caliper + caliper + test + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + + unit-tests + + ${sting.serialunittests.skipped} + + org.broadinstitute.sting:.* + + + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + + + integration-tests + + ${sting.serialintegrationtests.skipped} + + org.broadinstitute.sting:.* + + + + + pipeline-tests + + ${sting.serialpipelinetests.skipped} + + org.broadinstitute.sting:.* + + + + + large-scale-tests + + ${sting.seriallargescaletests.skipped} + + org.broadinstitute.sting:.* + + + + + knowledge-base-tests + + ${sting.serialknowledgebasetests.skipped} + + org.broadinstitute.sting:.* + + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + unpack-direct-dependencies + ${sting.unpack.phase} + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + sting-executable + ${sting.shade.phase} + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + binary-dist + ${sting.shade.phase} + + + + + + com.pyx4j + maven-junction-plugin + + + link-binary-jar + ${sting.shade.phase} + + + link-git-release + ${sting.shade.phase} + + + + + + org.apache.maven.plugins + maven-install-plugin + + + default-install + none + + + install-package + install + + + + + + + + + + protected + + + ${basedir}/../../protected/gatk-protected/pom.xml + + + + + ${project.groupId} + gatk-protected + ${project.version} + true + + + 
${project.groupId} + gatk-protected + ${project.version} + test-jar + test + true + + + + + private + + + ${basedir}/../../private/gatk-private/pom.xml + + + + + ${project.groupId} + gatk-private + ${project.version} + true + + + ${project.groupId} + gatk-private + ${project.version} + test-jar + test + true + + + + + + + com.pyx4j + maven-junction-plugin + + + link-private-testdata + process-test-resources + + + unlink-private-testdata + clean + + + + + + + + packagetests-enabled + + + sting.packagetests.enabled + true + + + + none + none + + + + + diff --git a/public/gatk-package/src/main/assembly/binary-dist.xml b/public/gatk-package/src/main/assembly/binary-dist.xml new file mode 100644 index 000000000..adc52646c --- /dev/null +++ b/public/gatk-package/src/main/assembly/binary-dist.xml @@ -0,0 +1,22 @@ + + binary-dist + + tar.bz2 + + false + + + + org.broadinstitute.sting:gatk-package + + ${sting.binary-dist.name}.${artifact.extension} + + + resources + true + + org.broadinstitute.sting:gatk-framework:tar.bz2:example-resources + + + + diff --git a/public/gatk-queue-extgen/pom.xml b/public/gatk-queue-extgen/pom.xml new file mode 100644 index 000000000..99e9b23bc --- /dev/null +++ b/public/gatk-queue-extgen/pom.xml @@ -0,0 +1,29 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-aggregator + 3.0 + ../.. + + + gatk-queue-extgen + jar + Queue GATK ExtGen + Queue GATK Extensions Generator + + + ${project.basedir}/../.. 
+ + + + + ${project.groupId} + gatk-framework + ${project.version} + + + + diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java b/public/gatk-queue-extgen/src/main/java/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java rename to public/gatk-queue-extgen/src/main/java/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentField.java b/public/gatk-queue-extgen/src/main/java/org/broadinstitute/sting/queue/extensions/gatk/ArgumentField.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentField.java rename to public/gatk-queue-extgen/src/main/java/org/broadinstitute/sting/queue/extensions/gatk/ArgumentField.java diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java b/public/gatk-queue-extgen/src/main/java/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java rename to public/gatk-queue-extgen/src/main/java/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ReadFilterField.java b/public/gatk-queue-extgen/src/main/java/org/broadinstitute/sting/queue/extensions/gatk/ReadFilterField.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ReadFilterField.java rename to public/gatk-queue-extgen/src/main/java/org/broadinstitute/sting/queue/extensions/gatk/ReadFilterField.java diff --git a/public/gsalib/pom.xml b/public/gsalib/pom.xml new file mode 100644 index 
000000000..a242145c2 --- /dev/null +++ b/public/gsalib/pom.xml @@ -0,0 +1,45 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-aggregator + 3.0 + ../.. + + + gsalib + pom + Sting GSALib + + + ${project.basedir}/../.. + org/broadinstitute/sting/utils/R + gsalib.tar.gz + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + gsalib-assembly + + single + + ${sting.generate-resources.phase} + + false + + src/assembly/gsalib.xml + + + + + + + + diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/DESCRIPTION b/public/gsalib/src/R/DESCRIPTION similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/DESCRIPTION rename to public/gsalib/src/R/DESCRIPTION diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/NAMESPACE b/public/gsalib/src/R/NAMESPACE similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/NAMESPACE rename to public/gsalib/src/R/NAMESPACE diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.error.R b/public/gsalib/src/R/R/gsa.error.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.error.R rename to public/gsalib/src/R/R/gsa.error.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.getargs.R b/public/gsalib/src/R/R/gsa.getargs.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.getargs.R rename to public/gsalib/src/R/R/gsa.getargs.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.message.R b/public/gsalib/src/R/R/gsa.message.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.message.R rename to public/gsalib/src/R/R/gsa.message.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.plot.venn.R b/public/gsalib/src/R/R/gsa.plot.venn.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.plot.venn.R rename to 
public/gsalib/src/R/R/gsa.plot.venn.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.eval.R b/public/gsalib/src/R/R/gsa.read.eval.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.eval.R rename to public/gsalib/src/R/R/gsa.read.eval.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R b/public/gsalib/src/R/R/gsa.read.gatkreport.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R rename to public/gsalib/src/R/R/gsa.read.gatkreport.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.squidmetrics.R b/public/gsalib/src/R/R/gsa.read.squidmetrics.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.squidmetrics.R rename to public/gsalib/src/R/R/gsa.read.squidmetrics.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.vcf.R b/public/gsalib/src/R/R/gsa.read.vcf.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.vcf.R rename to public/gsalib/src/R/R/gsa.read.vcf.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R b/public/gsalib/src/R/R/gsa.variantqc.utils.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R rename to public/gsalib/src/R/R/gsa.variantqc.utils.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.warn.R b/public/gsalib/src/R/R/gsa.warn.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.warn.R rename to public/gsalib/src/R/R/gsa.warn.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/Read-and-delete-me b/public/gsalib/src/R/Read-and-delete-me similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/Read-and-delete-me rename to 
public/gsalib/src/R/Read-and-delete-me diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.error.Rd b/public/gsalib/src/R/man/gsa.error.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.error.Rd rename to public/gsalib/src/R/man/gsa.error.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.getargs.Rd b/public/gsalib/src/R/man/gsa.getargs.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.getargs.Rd rename to public/gsalib/src/R/man/gsa.getargs.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.message.Rd b/public/gsalib/src/R/man/gsa.message.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.message.Rd rename to public/gsalib/src/R/man/gsa.message.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.plot.venn.Rd b/public/gsalib/src/R/man/gsa.plot.venn.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.plot.venn.Rd rename to public/gsalib/src/R/man/gsa.plot.venn.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.eval.Rd b/public/gsalib/src/R/man/gsa.read.eval.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.eval.Rd rename to public/gsalib/src/R/man/gsa.read.eval.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.gatkreport.Rd b/public/gsalib/src/R/man/gsa.read.gatkreport.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.gatkreport.Rd rename to public/gsalib/src/R/man/gsa.read.gatkreport.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.squidmetrics.Rd b/public/gsalib/src/R/man/gsa.read.squidmetrics.Rd similarity index 100% rename from 
public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.squidmetrics.Rd rename to public/gsalib/src/R/man/gsa.read.squidmetrics.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.vcf.Rd b/public/gsalib/src/R/man/gsa.read.vcf.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.vcf.Rd rename to public/gsalib/src/R/man/gsa.read.vcf.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.warn.Rd b/public/gsalib/src/R/man/gsa.warn.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.warn.Rd rename to public/gsalib/src/R/man/gsa.warn.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsalib-package.Rd b/public/gsalib/src/R/man/gsalib-package.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsalib-package.Rd rename to public/gsalib/src/R/man/gsalib-package.Rd diff --git a/public/gsalib/src/assembly/gsalib.xml b/public/gsalib/src/assembly/gsalib.xml new file mode 100644 index 000000000..7650c713d --- /dev/null +++ b/public/gsalib/src/assembly/gsalib.xml @@ -0,0 +1,13 @@ + + gsalib + + tar.gz + + false + + + gsalib + src/R + + + diff --git a/public/java/src/org/broadinstitute/sting/commandline/Argument.java b/public/java/src/org/broadinstitute/sting/commandline/Argument.java deleted file mode 100644 index fa7ca9cc3..000000000 --- a/public/java/src/org/broadinstitute/sting/commandline/Argument.java +++ /dev/null @@ -1,89 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* 
Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.commandline; - -import java.lang.annotation.*; - -/** - * Created by IntelliJ IDEA. - * User: hanna - * Date: Mar 24, 2009 - * Time: 11:11:36 AM - */ -/** - * Annotates fields in objects that should be used as command-line arguments. - * Any field annotated with @Argument can appear as a command-line parameter. - */ -@Documented -@Inherited -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.FIELD) -public @interface Argument { - /** - * The full name of the command-line argument. Full names should be - * prefixed on the command-line with a double dash (--). - * @return Selected full name, or "" to use the default. - */ - String fullName() default ""; - - /** - * Specified short name of the command. Short names should be prefixed - * with a single dash. Argument values can directly abut single-char - * short names or be separated from them by a space. - * @return Selected short name, or "" for none. - */ - String shortName() default ""; - - /** - * Documentation for the command-line argument. Should appear when the - * --help argument is specified. - * @return Doc string associated with this command-line argument. - */ - String doc() default "Undocumented option"; - - /** - * Is this argument required. 
If true, the command-line argument system will - * make a best guess for populating this argument based on the type descriptor, - * and will fail if the type can't be populated. - * @return True if the argument is required. False otherwise. - */ - boolean required() default true; - - /** - * Should this command-line argument be exclusive of others. Should be - * a comma-separated list of names of arguments of which this should be - * independent. - * @return A comma-separated string listing other arguments of which this - * argument should be independent. - */ - String exclusiveOf() default ""; - - /** - * Provide a regexp-based validation string. - * @return Non-empty regexp for validation, blank otherwise. - */ - String validation() default ""; -} diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java deleted file mode 100644 index a70d6e706..000000000 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java +++ /dev/null @@ -1,840 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.commandline; - -import org.apache.log4j.Logger; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; -import org.broadinstitute.sting.gatk.walkers.Multiplex; -import org.broadinstitute.sting.gatk.walkers.Multiplexer; -import org.broadinstitute.sting.utils.classloader.JVMUtils; -import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.File; -import java.lang.annotation.Annotation; -import java.lang.reflect.*; -import java.util.*; - -/** - * An descriptor capable of providing parsers that can parse any type - * of supported command-line argument. - * - * @author mhanna - * @version 0.1 - */ -public abstract class ArgumentTypeDescriptor { - private static Class[] ARGUMENT_ANNOTATIONS = {Input.class, Output.class, Argument.class}; - - /** - * our log, which we want to capture anything from org.broadinstitute.sting - */ - protected static final Logger logger = Logger.getLogger(ArgumentTypeDescriptor.class); - - /** - * Fetch the given descriptor from the descriptor repository. - * @param descriptors the descriptors from which to select a good match. - * @param type Class for which to specify a descriptor. - * @return descriptor for the given type. 
- */ - public static ArgumentTypeDescriptor selectBest( Collection descriptors, Class type ) { - for( ArgumentTypeDescriptor descriptor: descriptors ) { - if( descriptor.supports(type) ) - return descriptor; - } - throw new ReviewedStingException("Can't process command-line arguments of type: " + type.getName()); - } - - /** - * Does this descriptor support classes of the given type? - * @param type The type to check. - * @return true if this descriptor supports the given type, false otherwise. - */ - public abstract boolean supports( Class type ); - - /** - * Returns false if a type-specific default can be employed. - * @param source Source of the command-line argument. - * @return True to throw in a type specific default. False otherwise. - */ - public boolean createsTypeDefault(ArgumentSource source) { return false; } - - /** - * Returns a documentation-friendly value for the default of a type descriptor. - * Must be overridden if createsTypeDefault return true. cannot be called otherwise - * @param source Source of the command-line argument. - * @return Friendly string of the default value, for documentation. If doesn't create a default, throws - * and UnsupportedOperationException - */ - public String typeDefaultDocString(ArgumentSource source) { - throw new UnsupportedOperationException(); - } - - /** - * Generates a default for the given type. - * - * @param parsingEngine the parsing engine used to validate this argument type descriptor. - * @param source Source of the command-line argument. - * @param type Type of value to create, in case the command-line argument system wants influence. - * @return A default value for the given type. - */ - public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { throw new UnsupportedOperationException("Unable to create default for type " + getClass()); } - - /** - * Given the given argument source and attributes, synthesize argument definitions for command-line arguments. 
- * @param source Source class and field for the given argument. - * @return A list of command-line argument definitions supporting this field. - */ - public List createArgumentDefinitions( ArgumentSource source ) { - return Collections.singletonList(createDefaultArgumentDefinition(source)); - } - - /** - * Parses an argument source to an object. - * WARNING! Mandatory side effect of parsing! Each parse routine should register the tags it finds with the proper CommandLineProgram. - * TODO: Fix this, perhaps with an event model indicating that a new argument has been created. - * - * @param parsingEngine The engine responsible for parsing. - * @param source The source used to find the matches. - * @param matches The matches for the source. - * @return The parsed object. - */ - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, ArgumentMatches matches) { - return parse(parsingEngine, source, source.field.getGenericType(), matches); - } - - /** - * Returns true if the field is a collection or an array. - * @param source The argument source to check. - * @return true if the field is a collection or an array. - */ - public boolean isMultiValued( ArgumentSource source ) { - Class argumentType = source.field.getType(); - return Collection.class.isAssignableFrom(argumentType) || argumentType.isArray(); - } - - /** - * By default, argument sources create argument definitions with a set of default values. - * Use this method to create the one simple argument definition. - * @param source argument source for which to create a default definition. - * @return The default definition for this argument source. 
- */ - protected ArgumentDefinition createDefaultArgumentDefinition( ArgumentSource source ) { - Annotation argumentAnnotation = getArgumentAnnotation(source); - return new ArgumentDefinition( ArgumentIOType.getIOType(argumentAnnotation), - source.field.getType(), - ArgumentDefinition.getFullName(argumentAnnotation, source.field.getName()), - ArgumentDefinition.getShortName(argumentAnnotation), - ArgumentDefinition.getDoc(argumentAnnotation), - source.isRequired() && !createsTypeDefault(source) && !source.isFlag() && !source.isDeprecated(), - source.isFlag(), - source.isMultiValued(), - source.isHidden(), - makeRawTypeIfNecessary(getCollectionComponentType(source.field)), - ArgumentDefinition.getExclusiveOf(argumentAnnotation), - ArgumentDefinition.getValidationRegex(argumentAnnotation), - getValidOptions(source) ); - } - - /** - * Return the component type of a field, or String.class if the type cannot be found. - * @param field The reflected field to inspect. - * @return The parameterized component type, or String.class if the parameterized type could not be found. - * @throws IllegalArgumentException If more than one parameterized type is found on the field. - */ - protected Type getCollectionComponentType( Field field ) { - return null; - } - - /** - * Parses the argument matches for a class type into an object. - * @param source The original argument source used to find the matches. - * @param type The current class type being inspected. May not match the argument source.field.getType() if this as a collection for example. - * @param matches The argument matches for the argument source, or the individual argument match for a scalar if this is being called to help parse a collection. - * @return The individual parsed object matching the argument match with Class type. 
- */ - public abstract Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ); - - /** - * If the argument source only accepts a small set of options, populate the returned list with - * those options. Otherwise, leave the list empty. - * @param source Original field specifying command-line arguments. - * @return A list of valid options. - */ - protected List getValidOptions( ArgumentSource source ) { - if(!source.field.getType().isEnum()) - return null; - List validOptions = new ArrayList(); - for(Object constant: source.field.getType().getEnumConstants()) - validOptions.add(constant.toString()); - return validOptions; - } - - /** - * Returns true if the argument with the given full name exists in the collection of ArgumentMatches. - * @param definition Definition of the argument for which to find matches. - * @param matches The matches for the given argument. - * @return true if the argument is present, or false if not present. - */ - protected boolean argumentIsPresent( ArgumentDefinition definition, ArgumentMatches matches ) { - for( ArgumentMatch match: matches ) { - if( match.definition.equals(definition) ) - return true; - } - return false; - } - - /** - * Gets the value of an argument with the given full name, from the collection of ArgumentMatches. - * If the argument matches multiple values, an exception will be thrown. - * @param definition Definition of the argument for which to find matches. - * @param matches The matches for the given argument. - * @return The value of the argument if available, or null if not present. 
- */ - protected ArgumentMatchValue getArgumentValue( ArgumentDefinition definition, ArgumentMatches matches ) { - Collection argumentValues = getArgumentValues( definition, matches ); - if( argumentValues.size() > 1 ) - throw new UserException.CommandLineException("Multiple values associated with given definition, but this argument expects only one: " + definition.fullName); - return argumentValues.size() > 0 ? argumentValues.iterator().next() : null; - } - - /** - * Gets the tags associated with a given command-line argument. - * If the argument matches multiple values, an exception will be thrown. - * @param matches The matches for the given argument. - * @return The value of the argument if available, or null if not present. - */ - protected Tags getArgumentTags(ArgumentMatches matches) { - Tags tags = new Tags(); - for(ArgumentMatch match: matches) { - if(!tags.isEmpty() && !match.tags.isEmpty()) - throw new ReviewedStingException("BUG: multiple conflicting sets of tags are available, and the type descriptor specifies no way of resolving the conflict."); - tags = match.tags; - } - return tags; - } - - /** - * Gets the values of an argument with the given full name, from the collection of ArgumentMatches. - * @param definition Definition of the argument for which to find matches. - * @param matches The matches for the given argument. - * @return The value of the argument if available, or an empty collection if not present. - */ - protected Collection getArgumentValues( ArgumentDefinition definition, ArgumentMatches matches ) { - Collection values = new ArrayList(); - for( ArgumentMatch match: matches ) { - if( match.definition.equals(definition) ) - values.addAll(match.values()); - } - return values; - } - - /** - * Retrieves the argument description from the given argument source. Will throw an exception if - * the given ArgumentSource - * @param source source of the argument. - * @return Argument description annotation associated with the given field. 
- */ - @SuppressWarnings("unchecked") - protected static Annotation getArgumentAnnotation( ArgumentSource source ) { - for (Class annotation: ARGUMENT_ANNOTATIONS) - if (source.field.isAnnotationPresent(annotation)) - return source.field.getAnnotation(annotation); - throw new ReviewedStingException("ArgumentAnnotation is not present for the argument field: " + source.field.getName()); - } - - /** - * Returns true if an argument annotation is present - * @param field The field to check for an annotation. - * @return True if an argument annotation is present on the field. - */ - @SuppressWarnings("unchecked") - public static boolean isArgumentAnnotationPresent(Field field) { - for (Class annotation: ARGUMENT_ANNOTATIONS) - if (field.isAnnotationPresent(annotation)) - return true; - return false; - } - - /** - * Returns true if the given annotation is hidden from the help system. - * @param field Field to test. - * @return True if argument should be hidden. False otherwise. - */ - public static boolean isArgumentHidden(Field field) { - return field.isAnnotationPresent(Hidden.class); - } - - public static Class makeRawTypeIfNecessary(Type t) { - if ( t == null ) - return null; - else if ( t instanceof ParameterizedType ) - return (Class)((ParameterizedType) t).getRawType(); - else if ( t instanceof Class ) { - return (Class)t; - } else { - throw new IllegalArgumentException("Unable to determine Class-derived component type of field: " + t); - } - } - - /** - * The actual argument parsing method. - * @param source source - * @param type type to check - * @param matches matches - * @return the RodBinding/IntervalBinding object depending on the value of createIntervalBinding. 
- */ - protected Object parseBinding(ArgumentSource source, Type type, ArgumentMatches matches, Tags tags) { - ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); - ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches); - @SuppressWarnings("unchecked") - Class parameterType = JVMUtils.getParameterizedTypeClass(type); - String name = defaultDefinition.fullName; - - return parseBinding(value, parameterType, type, name, tags, source.field.getName()); - } - - /** - * - * @param value The source of the binding - * @param parameterType The Tribble Feature parameter type - * @param bindingClass The class type for the binding (ex: RodBinding, IntervalBinding, etc.) Must have the correct constructor for creating the binding. - * @param bindingName The name of the binding passed to the constructor. - * @param tags Tags for the binding used for parsing and passed to the constructor. - * @param fieldName The name of the field that was parsed. Used for error reporting. - * @return The newly created binding object of type bindingClass. - */ - public static Object parseBinding(ArgumentMatchValue value, Class parameterType, Type bindingClass, - String bindingName, Tags tags, String fieldName) { - try { - String tribbleType = null; - // must have one or two tag values here - if ( tags.getPositionalTags().size() > 2 ) { - throw new UserException.CommandLineException( - String.format("Unexpected number of positional tags for argument %s : %s. 
" + - "Rod bindings only support -X:type and -X:name,type argument styles", - value.asString(), fieldName)); - } else if ( tags.getPositionalTags().size() == 2 ) { - // -X:name,type style - bindingName = tags.getPositionalTags().get(0); - tribbleType = tags.getPositionalTags().get(1); - - FeatureManager manager = new FeatureManager(); - if ( manager.getByName(tribbleType) == null ) - throw new UserException.UnknownTribbleType( - tribbleType, - String.format("Unable to find tribble type '%s' provided on the command line. " + - "Please select a correct type from among the supported types:%n%s", - tribbleType, manager.userFriendlyListOfAvailableFeatures(parameterType))); - - } else { - // case with 0 or 1 positional tags - FeatureManager manager = new FeatureManager(); - - // -X:type style is a type when we cannot determine the type dynamically - String tag1 = tags.getPositionalTags().size() == 1 ? tags.getPositionalTags().get(0) : null; - if ( tag1 != null ) { - if ( manager.getByName(tag1) != null ) // this a type - tribbleType = tag1; - else - bindingName = tag1; - } - - if ( tribbleType == null ) { - // try to determine the file type dynamically - File file = value.asFile(); - if ( file.canRead() && file.isFile() ) { - FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file); - if ( featureDescriptor != null ) { - tribbleType = featureDescriptor.getName(); - logger.info("Dynamically determined type of " + file + " to be " + tribbleType); - } - } - - if ( tribbleType == null ) { - // IntervalBinding can be created from a normal String - Class rawType = (makeRawTypeIfNecessary(bindingClass)); - try { - return rawType.getConstructor(String.class).newInstance(value.asString()); - } catch (NoSuchMethodException e) { - /* ignore */ - } - - if ( ! file.exists() ) { - throw new UserException.CouldNotReadInputFile(file, "file does not exist"); - } else if ( ! file.canRead() || ! 
file.isFile() ) { - throw new UserException.CouldNotReadInputFile(file, "file could not be read"); - } else { - throw new UserException.CommandLineException( - String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " + - "Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s", - manager.userFriendlyListOfAvailableFeatures(parameterType))); - } - } - } - } - - Constructor ctor = (makeRawTypeIfNecessary(bindingClass)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class); - return ctor.newInstance(parameterType, bindingName, value.asString(), tribbleType, tags); - } catch (Exception e) { - if ( e instanceof UserException ) - throw ((UserException)e); - else - throw new UserException.CommandLineException( - String.format("Failed to parse value %s for argument %s. Message: %s", - value, fieldName, e.getMessage())); - } - } -} - -/** - * Parser for RodBinding objects - */ -class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { - /** - * We only want RodBinding class objects - * @param type The type to check. - * @return true if the provided class is a RodBinding.class - */ - @Override - public boolean supports( Class type ) { - return isRodBinding(type); - } - - public static boolean isRodBinding( Class type ) { - return RodBinding.class.isAssignableFrom(type); - } - - @Override - public boolean createsTypeDefault(ArgumentSource source) { return ! 
source.isRequired(); } - - @Override - @SuppressWarnings("unchecked") - public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) { - Class parameterType = JVMUtils.getParameterizedTypeClass(type); - return RodBinding.makeUnbound((Class)parameterType); - } - - @Override - public String typeDefaultDocString(ArgumentSource source) { - return "none"; - } - - @Override - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { - Tags tags = getArgumentTags(matches); - RodBinding rbind = (RodBinding)parseBinding(source, type, matches, tags); - parsingEngine.addTags(rbind, tags); - parsingEngine.addRodBinding(rbind); - return rbind; - } -} - -/** - * Parser for IntervalBinding objects - */ -class IntervalBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { - /** - * We only want IntervalBinding class objects - * @param type The type to check. - * @return true if the provided class is an IntervalBinding.class - */ - @Override - public boolean supports( Class type ) { - return isIntervalBinding(type); - } - - public static boolean isIntervalBinding( Class type ) { - return IntervalBinding.class.isAssignableFrom(type); - } - - /** - * See note from RodBindingArgumentTypeDescriptor.parse(). - * - * @param parsingEngine parsing engine - * @param source source - * @param type type to check - * @param matches matches - * @return the IntervalBinding object. - */ - @Override - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { - return parseBinding(source, type, matches, getArgumentTags(matches)); - } -} - -/** - * Parse simple argument types: java primitives, wrapper classes, and anything that has - * a simple String constructor. 
- */ -class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor { - @Override - public boolean supports( Class type ) { - if ( RodBindingArgumentTypeDescriptor.isRodBinding(type) || IntervalBindingArgumentTypeDescriptor.isIntervalBinding(type) ) return false; - if ( type.isPrimitive() ) return true; - if ( type.isEnum() ) return true; - if ( primitiveToWrapperMap.containsValue(type) ) return true; - - try { - type.getConstructor(String.class); - return true; - } - catch( Exception ex ) { - // An exception thrown above means that the String constructor either doesn't - // exist or can't be accessed. In either case, this descriptor doesn't support this type. - return false; - } - } - - @Override - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type fulltype, ArgumentMatches matches) { - Class type = makeRawTypeIfNecessary(fulltype); - if (source.isFlag()) - return true; - - ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); - ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches); - Object result; - Tags tags = getArgumentTags(matches); - - // lets go through the types we support - try { - if (type.isPrimitive()) { - Method valueOf = primitiveToWrapperMap.get(type).getMethod("valueOf",String.class); - if(value == null) - throw new MissingArgumentValueException(createDefaultArgumentDefinition(source)); - result = valueOf.invoke(null,value.asString().trim()); - } else if (type.isEnum()) { - Object[] vals = type.getEnumConstants(); - Object defaultEnumeration = null; // as we look at options, record the default option if it exists - for (Object val : vals) { - if (String.valueOf(val).equalsIgnoreCase(value == null ? 
null : value.asString())) return val; - try { if (type.getField(val.toString()).isAnnotationPresent(EnumerationArgumentDefault.class)) defaultEnumeration = val; } - catch (NoSuchFieldException e) { throw new ReviewedStingException("parsing " + type.toString() + "doesn't contain the field " + val.toString()); } - } - // if their argument has no value (null), and there's a default, return that default for the enum value - if (defaultEnumeration != null && value == null) - result = defaultEnumeration; - // if their argument has no value and there's no default, throw a missing argument value exception. - // TODO: Clean this up so that null values never make it to this point. To fix this, we'll have to clean up the implementation of -U. - else if (value == null) - throw new MissingArgumentValueException(createDefaultArgumentDefinition(source)); - else - throw new UnknownEnumeratedValueException(createDefaultArgumentDefinition(source),value.asString()); - } else if (type.equals(File.class)) { - result = value == null ? null : value.asFile(); - } else { - Constructor ctor = type.getConstructor(String.class); - result = ctor.newInstance(value == null ? null : value.asString()); - } - } catch (UserException e) { - throw e; - } catch (InvocationTargetException e) { - throw new UserException.CommandLineException(String.format("Failed to parse value %s for argument %s. This is most commonly caused by providing an incorrect data type (e.g. a double when an int is required)", - value, source.field.getName())); - } catch (Exception e) { - throw new DynamicClassResolutionException(String.class, e); - } - - // TODO FIXME! - - // WARNING: Side effect! - parsingEngine.addTags(result,tags); - - return result; - } - - - /** - * A mapping of the primitive types to their associated wrapper classes. Is there really no way to infer - * this association available in the JRE? 
- */ - private static Map primitiveToWrapperMap = new HashMap() { - { - put( Boolean.TYPE, Boolean.class ); - put( Character.TYPE, Character.class ); - put( Byte.TYPE, Byte.class ); - put( Short.TYPE, Short.class ); - put( Integer.TYPE, Integer.class ); - put( Long.TYPE, Long.class ); - put( Float.TYPE, Float.class ); - put( Double.TYPE, Double.class ); - } - }; -} - -/** - * Process compound argument types: arrays, and typed and untyped collections. - */ -class CompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { - @Override - public boolean supports( Class type ) { - return ( Collection.class.isAssignableFrom(type) || type.isArray() ); - } - - @Override - @SuppressWarnings("unchecked") - public Object parse(ParsingEngine parsingEngine,ArgumentSource source, Type fulltype, ArgumentMatches matches) { - Class type = makeRawTypeIfNecessary(fulltype); - Type componentType; - Object result; - - if( Collection.class.isAssignableFrom(type) ) { - - // If this is a generic interface, pick a concrete implementation to create and pass back. - // Because of type erasure, don't worry about creating one of exactly the correct type. 
- if( Modifier.isInterface(type.getModifiers()) || Modifier.isAbstract(type.getModifiers()) ) - { - if( java.util.List.class.isAssignableFrom(type) ) type = ArrayList.class; - else if( java.util.Queue.class.isAssignableFrom(type) ) type = java.util.ArrayDeque.class; - else if( java.util.Set.class.isAssignableFrom(type) ) type = java.util.TreeSet.class; - } - - componentType = getCollectionComponentType( source.field ); - ArgumentTypeDescriptor componentArgumentParser = parsingEngine.selectBestTypeDescriptor(makeRawTypeIfNecessary(componentType)); - - Collection collection; - try { - collection = (Collection)type.newInstance(); - } - catch (InstantiationException e) { - logger.fatal("ArgumentParser: InstantiationException: cannot convert field " + source.field.getName()); - throw new ReviewedStingException("constructFromString:InstantiationException: Failed conversion " + e.getMessage()); - } - catch (IllegalAccessException e) { - logger.fatal("ArgumentParser: IllegalAccessException: cannot convert field " + source.field.getName()); - throw new ReviewedStingException("constructFromString:IllegalAccessException: Failed conversion " + e.getMessage()); - } - - for( ArgumentMatch match: matches ) { - for( ArgumentMatch value: match ) { - Object object = componentArgumentParser.parse(parsingEngine,source,componentType,new ArgumentMatches(value)); - collection.add( object ); - // WARNING: Side effect! - parsingEngine.addTags(object,value.tags); - } - } - - result = collection; - - } - else if( type.isArray() ) { - componentType = type.getComponentType(); - ArgumentTypeDescriptor componentArgumentParser = parsingEngine.selectBestTypeDescriptor(makeRawTypeIfNecessary(componentType)); - - // Assemble a collection of individual values used in this computation. 
- Collection values = new ArrayList(); - for( ArgumentMatch match: matches ) - for( ArgumentMatch value: match ) - values.add(value); - - result = Array.newInstance(makeRawTypeIfNecessary(componentType),values.size()); - - int i = 0; - for( ArgumentMatch value: values ) { - Object object = componentArgumentParser.parse(parsingEngine,source,componentType,new ArgumentMatches(value)); - Array.set(result,i++,object); - // WARNING: Side effect! - parsingEngine.addTags(object,value.tags); - } - } - else - throw new ReviewedStingException("Unsupported compound argument type: " + type); - - return result; - } - - /** - * Return the component type of a field, or String.class if the type cannot be found. - * @param field The reflected field to inspect. - * @return The parameterized component type, or String.class if the parameterized type could not be found. - * @throws IllegalArgumentException If more than one parameterized type is found on the field. - */ - @Override - protected Type getCollectionComponentType( Field field ) { - // If this is a parameterized collection, find the contained type. If blow up if more than one type exists. - if( field.getGenericType() instanceof ParameterizedType) { - ParameterizedType parameterizedType = (ParameterizedType)field.getGenericType(); - if( parameterizedType.getActualTypeArguments().length > 1 ) - throw new IllegalArgumentException("Unable to determine collection type of field: " + field.toString()); - return parameterizedType.getActualTypeArguments()[0]; - } - else - return String.class; - } -} - -class MultiplexArgumentTypeDescriptor extends ArgumentTypeDescriptor { - /** - * The multiplexer controlling how data is split. - */ - private final Multiplexer multiplexer; - - /** - * The set of identifiers for the multiplexed entries. 
- */ - private final Collection multiplexedIds; - - public MultiplexArgumentTypeDescriptor() { - this.multiplexer = null; - this.multiplexedIds = null; - } - - /** - * Private constructor to use in creating a closure of the MultiplexArgumentTypeDescriptor specific to the - * given set of multiplexed ids. - * @param multiplexedIds The collection of multiplexed entries - */ - private MultiplexArgumentTypeDescriptor(final Multiplexer multiplexer, final Collection multiplexedIds) { - this.multiplexer = multiplexer; - this.multiplexedIds = multiplexedIds; - } - - @Override - public boolean supports( Class type ) { - return ( Map.class.isAssignableFrom(type) ); - } - - @Override - public boolean createsTypeDefault(ArgumentSource source) { - // Multiplexing always creates a type default. - return true; - } - - @Override - public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { - if(multiplexer == null || multiplexedIds == null) - throw new ReviewedStingException("No multiplexed ids available"); - - Map multiplexedMapping = new HashMap(); - Class componentType = makeRawTypeIfNecessary(getCollectionComponentType(source.field)); - ArgumentTypeDescriptor componentTypeDescriptor = parsingEngine.selectBestTypeDescriptor(componentType); - - for(Object id: multiplexedIds) { - Object value = null; - if(componentTypeDescriptor.createsTypeDefault(source)) - value = componentTypeDescriptor.createTypeDefault(parsingEngine,source,componentType); - multiplexedMapping.put(id,value); - } - return multiplexedMapping; - } - - @Override - public String typeDefaultDocString(ArgumentSource source) { - return "None"; - } - - @Override - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { - if(multiplexedIds == null) - throw new ReviewedStingException("Cannot directly parse a MultiplexArgumentTypeDescriptor; must create a derivative type descriptor first."); - - Map multiplexedMapping = new 
HashMap(); - - Class componentType = makeRawTypeIfNecessary(getCollectionComponentType(source.field)); - - - for(Object id: multiplexedIds) { - Object value = parsingEngine.selectBestTypeDescriptor(componentType).parse(parsingEngine,source,componentType,matches.transform(multiplexer,id)); - multiplexedMapping.put(id,value); - } - - parsingEngine.addTags(multiplexedMapping,getArgumentTags(matches)); - - return multiplexedMapping; - } - - public MultiplexArgumentTypeDescriptor createCustomTypeDescriptor(ParsingEngine parsingEngine,ArgumentSource dependentArgument,Object containingObject) { - String[] sourceFields = dependentArgument.field.getAnnotation(Multiplex.class).arguments(); - - List allSources = parsingEngine.extractArgumentSources(containingObject.getClass()); - Class[] sourceTypes = new Class[sourceFields.length]; - Object[] sourceValues = new Object[sourceFields.length]; - int currentField = 0; - - for(String sourceField: sourceFields) { - boolean fieldFound = false; - for(ArgumentSource source: allSources) { - if(!source.field.getName().equals(sourceField)) - continue; - if(source.field.isAnnotationPresent(Multiplex.class)) - throw new ReviewedStingException("Command-line arguments can only depend on independent fields"); - sourceTypes[currentField] = source.field.getType(); - sourceValues[currentField] = JVMUtils.getFieldValue(source.field,containingObject); - currentField++; - fieldFound = true; - } - if(!fieldFound) - throw new ReviewedStingException(String.format("Unable to find source field %s, referred to by dependent field %s",sourceField,dependentArgument.field.getName())); - } - - Class multiplexerType = dependentArgument.field.getAnnotation(Multiplex.class).value(); - Constructor multiplexerConstructor; - try { - multiplexerConstructor = multiplexerType.getConstructor(sourceTypes); - multiplexerConstructor.setAccessible(true); - } - catch(NoSuchMethodException ex) { - throw new ReviewedStingException(String.format("Unable to find constructor for 
class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); - } - - Multiplexer multiplexer; - try { - multiplexer = multiplexerConstructor.newInstance(sourceValues); - } - catch(IllegalAccessException ex) { - throw new ReviewedStingException(String.format("Constructor for class %s with parameters %s is inaccessible",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); - } - catch(InstantiationException ex) { - throw new ReviewedStingException(String.format("Can't create class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); - } - catch(InvocationTargetException ex) { - throw new ReviewedStingException(String.format("Can't invoke constructor of class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); - } - - return new MultiplexArgumentTypeDescriptor(multiplexer,multiplexer.multiplex()); - } - - /** - * Return the component type of a field, or String.class if the type cannot be found. - * @param field The reflected field to inspect. - * @return The parameterized component type, or String.class if the parameterized type could not be found. - * @throws IllegalArgumentException If more than one parameterized type is found on the field. - */ - @Override - protected Type getCollectionComponentType( Field field ) { - // Multiplex arguments must resolve to maps from which the clp should extract the second type. 
- if( field.getGenericType() instanceof ParameterizedType) { - ParameterizedType parameterizedType = (ParameterizedType)field.getGenericType(); - if( parameterizedType.getActualTypeArguments().length != 2 ) - throw new IllegalArgumentException("Unable to determine collection type of field: " + field.toString()); - return (Class)parameterizedType.getActualTypeArguments()[1]; - } - else - return String.class; - } -} diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java deleted file mode 100644 index f00bd0ad6..000000000 --- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java +++ /dev/null @@ -1,444 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.commandline; - -import org.apache.log4j.FileAppender; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.apache.log4j.PatternLayout; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.help.ApplicationDetails; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.help.HelpFormatter; - -import java.io.IOException; -import java.util.*; - -public abstract class CommandLineProgram { - - /** The command-line program and the arguments it returned. */ - public ParsingEngine parser = null; - - /** the default log level */ - @Argument(fullName = "logging_level", - shortName = "l", - doc = "Set the minimum level of logging, i.e. setting INFO get's you INFO up to FATAL, setting ERROR gets you ERROR and FATAL level logging.", - required = false) - protected String logging_level = "INFO"; - - - /** where to send the output of our logger */ - @Output(fullName = "log_to_file", - shortName = "log", - doc = "Set the logging location", - required = false) - protected String toFile = null; - - /** this is used to indicate if they've asked for help */ - @Argument(fullName = "help", shortName = "h", doc = "Generate this help message", required = false) - public Boolean help = false; - - /** This is used to indicate if they've asked for the version information */ - @Argument(fullName = "version", shortName = "version", doc ="Output version information", required = false) - public Boolean version = false; - - - /** our logging output patterns */ - private static final String patternString = "%-5p %d{HH:mm:ss,SSS} %C{1} - %m %n"; - - static { - /** - * The very first thing that any Sting application does is forces the JVM locale into US English, so that we don't have - * to think about number formatting issues. 
- */ - forceJVMLocaleToUSEnglish(); - // setup a basic log configuration - CommandLineUtils.configureConsoleLogging(); - } - - - /** - * Allows a given application to return a brief description of itself. - * - * @return An ApplicationDetails object describing the current application. Should not be null. - */ - protected ApplicationDetails getApplicationDetails() { - return new ApplicationDetails(ApplicationDetails.createDefaultHeader(getClass()), - Collections.emptyList(), - ApplicationDetails.createDefaultRunningInstructions(getClass()), - null); - } - - /** - * Subclasses of CommandLinePrograms can provide their own types of command-line arguments. - * @return A collection of type descriptors generating implementation-dependent placeholders. - */ - protected Collection getArgumentTypeDescriptors() { - return Collections.emptyList(); - } - - /** - * Will this application want to vary its argument list dynamically? - * If so, parse the command-line options and then prompt the subclass to return - * a list of argument providers. - * - * @return Whether the application should vary command-line arguments dynamically. - */ - protected boolean canAddArgumentsDynamically() { return false; } - - /** - * Provide a list of object to inspect, looking for additional command-line arguments. - * - * @return A list of objects to inspect. - */ - protected Class[] getArgumentSources() { - return new Class[]{}; - } - - /** - * Name this argument source. Provides the (full) class name as a default. - * - * @param source The argument source. - * - * @return a name for the argument source. - */ - protected String getArgumentSourceName( Class source ) { return source.toString(); } - - /** - * Sets the command-line parsing engine. Necessary for unit testing purposes. 
- * @param parser the new command-line parsing engine - */ - public void setParser( ParsingEngine parser ) { - this.parser = parser; - } - - /** - * this is the function that the inheriting class can expect to have called - * when all the argument processing is done - * - * @return the return code to exit the program with - * @throws Exception when an exception occurs - */ - protected abstract int execute() throws Exception; - - public static int result = -1; - - @SuppressWarnings("unchecked") - public static void start(CommandLineProgram clp, String[] args) throws Exception { - start(clp, args, false); - } - - /** - * This function is called to start processing the command line, and kick - * off the execute message of the program. - * - * @param clp the command line program to execute - * @param args the command line arguments passed in - * @param dryRun dry run - * @throws Exception when an exception occurs - */ - @SuppressWarnings("unchecked") - public static void start(CommandLineProgram clp, String[] args, boolean dryRun) throws Exception { - - try { - // setup our log layout - PatternLayout layout = new PatternLayout(); - - Logger logger = CommandLineUtils.getStingLogger(); - - // now set the layout of all the loggers to our layout - CommandLineUtils.setLayout(logger, layout); - - // Initialize the logger using the defaults. - clp.setupLoggerLevel(layout); - - // setup the parser - ParsingEngine parser = clp.parser = new ParsingEngine(clp); - parser.addArgumentSource(clp.getClass()); - - Map parsedArgs; - - // process the args - if (clp.canAddArgumentsDynamically()) { - // if the command-line program can toss in extra args, fetch them and reparse the arguments. - parser.parse(args); - - // Allow invalid and missing required arguments to pass this validation step. - // - InvalidArgument in case these arguments are specified by plugins. - // - MissingRequiredArgument in case the user requested help. 
Handle that later, once we've - // determined the full complement of arguments. - if ( ! dryRun ) - parser.validate(EnumSet.of(ParsingEngine.ValidationType.MissingRequiredArgument, - ParsingEngine.ValidationType.InvalidArgument)); - parser.loadArgumentsIntoObject(clp); - - // Initialize the logger using the loaded command line. - clp.setupLoggerLevel(layout); - - Class[] argumentSources = clp.getArgumentSources(); - for (Class argumentSource : argumentSources) - parser.addArgumentSource(clp.getArgumentSourceName(argumentSource), argumentSource); - parsedArgs = parser.parse(args); - - if (isVersionPresent(parser)) - printVersionAndExit(); - - if (isHelpPresent(parser)) - printHelpAndExit(clp, parser); - - if ( ! dryRun ) parser.validate(); - } else { - parsedArgs = parser.parse(args); - - if ( ! dryRun ) { - if (isHelpPresent(parser)) - printHelpAndExit(clp, parser); - - parser.validate(); - } - parser.loadArgumentsIntoObject(clp); - - // Initialize the logger using the loaded command line. - clp.setupLoggerLevel(layout); - } - - if ( ! dryRun ) { - // if they specify a log location, output our data there - if (clp.toFile != null) { - FileAppender appender; - try { - appender = new FileAppender(layout, clp.toFile, false); - logger.addAppender(appender); - } catch (IOException e) { - throw new RuntimeException("Unable to re-route log output to " + clp.toFile + " make sure the destination exists"); - } - } - - // regardless of what happens next, generate the header information - HelpFormatter.generateHeaderInformation(clp.getApplicationDetails(), parsedArgs); - - // call the execute - CommandLineProgram.result = clp.execute(); - } - } - catch (ArgumentException e) { - //clp.parser.printHelp(clp.getApplicationDetails()); - // Rethrow the exception to exit with an error. - throw e; - } - } - - /** - * Find fields in the object obj that look like command-line arguments, and put command-line - * arguments into them. 
- * - * @param obj Object to inspect for command line arguments. - */ - public void loadArgumentsIntoObject(Object obj) { - parser.loadArgumentsIntoObject(obj); - } - - /** - * this function checks the logger level passed in on the command line, taking the lowest - * level that was provided. - * @param layout Pattern layout to format based on the logger level. - */ - private void setupLoggerLevel(PatternLayout layout) { - layout.setConversionPattern(patternString); - - // set the default logger level - Level par; - if (logging_level.toUpperCase().equals("DEBUG")) { - par = Level.DEBUG; - } else if (logging_level.toUpperCase().equals("ERROR")) { - par = Level.ERROR; - } else if (logging_level.toUpperCase().equals("FATAL")) { - par = Level.FATAL; - } else if (logging_level.toUpperCase().equals("INFO")) { - par = Level.INFO; - } else if (logging_level.toUpperCase().equals("WARN")) { - par = Level.WARN; - } else if (logging_level.toUpperCase().equals("OFF")) { - par = Level.OFF; - } else { - // we don't understand the logging level, let's get out of here - throw new ArgumentException("Unable to match: " + logging_level + " to a logging level, make sure it's a valid level (INFO, DEBUG, ERROR, FATAL, OFF)"); - } - - Logger.getRootLogger().setLevel(par); - } - - /** - * a function used to indicate an error occurred in the command line tool - */ - private static void printDocumentationReference() { - errorPrintf("Visit our website and forum for extensive documentation and answers to %n"); - errorPrintf("commonly asked questions " + HelpConstants.BASE_GATK_URL + "%n"); - } - - - /** - * Do a cursory search for the given argument. - * - * @param parser Parser - * - * @return True if help is present; false otherwise. - */ - private static boolean isHelpPresent(ParsingEngine parser) { - return parser.isArgumentPresent("help"); - } - - /** - * Print help and exit. - * - * @param clp Instance of the command-line program. 
- * @param parser True if help is present; false otherwise. - */ - private static void printHelpAndExit(CommandLineProgram clp, ParsingEngine parser) { - parser.printHelp(clp.getApplicationDetails()); - System.exit(0); - } - - /** - * Do a cursory search for the argument "version". - * - * @param parser Parser - * - * @return True if version is present; false otherwise. - */ - private static boolean isVersionPresent(ParsingEngine parser) { - return parser.isArgumentPresent("version"); - } - - /** - * Print help and exit. - */ - private static void printVersionAndExit() { - System.out.println(CommandLineGATK.getVersionNumber().toString()); - System.exit(0); - } - - - private static void errorPrintf(String format, Object... s) { - String formatted = String.format(format, s); - - if ( formatted.trim().equals("") ) - System.err.println("##### ERROR"); - else { - for ( String part : formatted.split("\n") ) { - System.err.println("##### ERROR " + part); - } - } - } - - - /** - * used to indicate an error occured - * - * @param msg the message - * @param t the error - */ - public static void exitSystemWithError(String msg, final Throwable t) { - errorPrintf("------------------------------------------------------------------------------------------%n"); - errorPrintf("stack trace %n"); - t.printStackTrace(); - - errorPrintf("------------------------------------------------------------------------------------------%n"); - errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber()); - errorPrintf("%n"); - errorPrintf("This might be a bug. 
Please check the documentation guide to see if this is a known problem.%n"); - errorPrintf("If not, please post the error message, with stack trace, to the GATK forum.%n"); - printDocumentationReference(); - if ( msg == null ) // some exceptions don't have detailed messages - msg = "Code exception (see stack trace for error itself)"; - errorPrintf("%n"); - errorPrintf("MESSAGE: %s%n", msg.trim()); - errorPrintf("------------------------------------------------------------------------------------------%n"); - System.exit(1); - } - - public static void exitSystemWithUserError(final Exception e) { - if ( e.getMessage() == null ) - throw new ReviewedStingException("UserException found with no message!", e); - - errorPrintf("------------------------------------------------------------------------------------------%n"); - errorPrintf("A USER ERROR has occurred (version %s): %n", CommandLineGATK.getVersionNumber()); - errorPrintf("%n"); - errorPrintf("This means that one or more arguments or inputs in your command are incorrect.%n"); - errorPrintf("The error message below tells you what is the problem.%n"); - errorPrintf("%n"); - errorPrintf("If the problem is an invalid argument, please check the online documentation guide%n"); - errorPrintf("(or rerun your command with --help) to view allowable command-line arguments for this tool.%n"); - errorPrintf("%n"); - printDocumentationReference(); - errorPrintf("%n"); - errorPrintf("Please do NOT post this error to the GATK forum unless you have really tried to fix it yourself.%n"); - errorPrintf("%n"); - errorPrintf("MESSAGE: %s%n", e.getMessage().trim()); - errorPrintf("------------------------------------------------------------------------------------------%n"); - System.exit(1); - } - - public static void exitSystemWithSamError(final Throwable t) { - if ( t.getMessage() == null ) - throw new ReviewedStingException("SamException found with no message!", t); - - 
errorPrintf("------------------------------------------------------------------------------------------%n"); - errorPrintf("A BAM ERROR has occurred (version %s): %n", CommandLineGATK.getVersionNumber()); - errorPrintf("%n"); - errorPrintf("This means that there is something wrong with the BAM file(s) you provided.%n"); - errorPrintf("The error message below tells you what is the problem.%n"); - errorPrintf("%n"); - printDocumentationReference(); - errorPrintf("%n"); - errorPrintf("Please do NOT post this error to the GATK forum until you have followed these instructions:%n"); - errorPrintf("- Make sure that your BAM file is well-formed by running Picard's validator on it%n"); - errorPrintf("(see http://picard.sourceforge.net/command-line-overview.shtml#ValidateSamFile for details)%n"); - errorPrintf("- Ensure that your BAM index is not corrupted: delete the current one and regenerate it with 'samtools index'%n"); - errorPrintf("%n"); - errorPrintf("MESSAGE: %s%n", t.getMessage().trim()); - errorPrintf("------------------------------------------------------------------------------------------%n"); - System.exit(1); - } - - - /** - * used to indicate an error occured - * - * @param t the exception that occurred - */ - public static void exitSystemWithError(Throwable t) { - exitSystemWithError(t.getMessage(), t); - } - - /** - * A hack to ensure that numbers are always formatted in the US style. 
- */ - protected static void forceJVMLocaleToUSEnglish() { - Locale.setDefault(Locale.US); - } -} diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineUtils.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineUtils.java deleted file mode 100644 index ddedda054..000000000 --- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineUtils.java +++ /dev/null @@ -1,192 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.commandline; - -import org.apache.log4j.Appender; -import org.apache.log4j.ConsoleAppender; -import org.apache.log4j.Logger; -import org.apache.log4j.PatternLayout; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.lang.annotation.Annotation; -import java.util.Collections; -import java.util.Enumeration; -import java.util.LinkedHashMap; -import java.util.Map; - -/** - * Static utility methods for working with command-line arguments. - * - * @author mhanna - * @version 0.1 - */ -public class CommandLineUtils { - - /** - * Returns a key-value mapping of the command-line arguments passed into the GATK. - * Will be approximate; this class doesn't have all the required data to completely - * reconstruct the list of command-line arguments from the given objects. - * - * @param parsingEngine The parsing engine - * @param argumentProviders The providers of command-line arguments. - * @return A key-value mapping of argument full names to argument values. Produces best string representation - * possible given the information available. - */ - public static Map getApproximateCommandLineArguments(ParsingEngine parsingEngine, Object... argumentProviders) { - return getApproximateCommandLineArguments(parsingEngine, false, argumentProviders); - } - - /** - * Returns a key-value mapping of the command-line arguments passed into the GATK. - * Will be approximate; this class doesn't have all the required data to completely - * reconstruct the list of command-line arguments from the given objects. - * - * @param parsingEngine The parsing engine - * @param skipObjectPointers Should we skip arguments whose values are pointers (and don't print nicely)? - * @param argumentProviders The providers of command-line arguments. - * @return A key-value mapping of argument full names to argument values. Produces best string representation - * possible given the information available. 
- */ - public static Map getApproximateCommandLineArguments(ParsingEngine parsingEngine, boolean skipObjectPointers, Object... argumentProviders) { - Map commandLineArguments = new LinkedHashMap(); - - for(Object argumentProvider: argumentProviders) { - Map argBindings = parsingEngine.extractArgumentBindings(argumentProvider); - for(Map.Entry elt: argBindings.entrySet()) { - Object argumentValue = elt.getValue(); - - String argumentValueString = argumentValue != null ? argumentValue.toString() : null; - if ( skipObjectPointers && isObjectPointer(argumentValueString) ) - continue; - - for(ArgumentDefinition definition: elt.getKey().createArgumentDefinitions()) { - String argumentName = definition.fullName; - commandLineArguments.put(argumentName,argumentValueString); - } - } - } - - return commandLineArguments; - } - - /** - * Create an approximate list of command-line arguments based on the given argument providers. - * @param parsingEngine The parsing engine - * @param argumentProviders Argument providers to inspect. - * @return A string representing the given command-line arguments. - */ - public static String createApproximateCommandLineArgumentString(ParsingEngine parsingEngine, Object... argumentProviders) { - return createApproximateCommandLineArgumentString(parsingEngine, true, argumentProviders); - } - - /** - * Create an approximate list of command-line arguments based on the given argument providers. - * @param parsingEngine The parsing engine - * @param skipObjectPointers Should we skip arguments whose values are pointers (and don't print nicely)? - * @param argumentProviders Argument providers to inspect. - * @return A string representing the given command-line arguments. - */ - public static String createApproximateCommandLineArgumentString(ParsingEngine parsingEngine, boolean skipObjectPointers, Object... 
argumentProviders) { - Map commandLineArgs = getApproximateCommandLineArguments(parsingEngine, skipObjectPointers, argumentProviders); - StringBuffer sb = new StringBuffer(); - - boolean first = true; - for ( Map.Entry commandLineArg : commandLineArgs.entrySet() ) { - if ( !first ) - sb.append(" "); - sb.append(commandLineArg.getKey()); - sb.append("="); - sb.append(commandLineArg.getValue()); - first = false; - } - - return sb.toString(); - } - - /** - * A hack to get around the fact that Java doesn't like inheritance in Annotations. - * @param annotation to run the method on - * @param method the method to invoke - * @return the return value of the method - */ - public static Object getValue(Annotation annotation, String method) { - try { - return annotation.getClass().getMethod(method).invoke(annotation); - } catch (Exception e) { - throw new ReviewedStingException("Unable to access method " + method + " on annotation " + annotation.getClass(), e); - } - } - - // The problem here is that some of the fields being output are Objects - and those - // Objects don't overload toString() so that the output is just the memory pointer - // to the Object. Because those values are non-deterministic, they don't merge well - // into BAM/VCF headers (plus, it's just damn ugly). Perhaps there's a better way to - // do this, but at least this one works for the moment. - private static final String pointerRegexp = ".+@[0-9a-fA-F]+$"; - private static boolean isObjectPointer(String s) { - return s != null && s.matches(pointerRegexp); - } - - /** - * Returns the root logger for all Sting code. - * @return the root logger for all Sting code. - */ - public static Logger getStingLogger() { - return Logger.getLogger("org.broadinstitute.sting"); - } - - /** - * Enables console logging. - */ - @SuppressWarnings("unchecked") - public static void configureConsoleLogging() { - // Check to see if a console logger has already been enabled. 
- for (Logger logger = getStingLogger(); logger != null; logger = (Logger)logger.getParent()) { - Enumeration e = (Enumeration) logger.getAllAppenders(); - for (Appender appender: Collections.list(e)) { - if (appender instanceof ConsoleAppender) - return; - } - } - // Extracted from BasicConfigurator.configure(), but only applied to the Sting logger. - Logger.getRootLogger().addAppender(new ConsoleAppender( - new PatternLayout(PatternLayout.TTCC_CONVERSION_PATTERN))); - } - - /** - * Sets the layout of the logger. - * @param logger The logger. - * @param layout The layout. - */ - @SuppressWarnings("unchecked") - public static void setLayout(Logger logger, PatternLayout layout) { - for (; logger != null; logger = (Logger)logger.getParent()) { - Enumeration e = (Enumeration) logger.getAllAppenders(); - for (Appender appender: Collections.list(e)) - appender.setLayout(layout); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java deleted file mode 100644 index b491c9f3d..000000000 --- a/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java +++ /dev/null @@ -1,70 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.commandline; - -import org.broad.tribble.Feature; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalSetRule; - -import java.util.List; - -public class IntervalArgumentCollection { - /** - * Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times. - * One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals). - * Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf). - * To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped. - */ - @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) - public List> intervals = null; - - /** - * Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times. - * One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals). 
- * Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf). - */ - @Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) - public List> excludeIntervals = null; - - /** - * How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions - * for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION). - */ - @Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false) - public IntervalSetRule intervalSetRule = IntervalSetRule.UNION; - - /** - * Should abutting (but not overlapping) intervals be treated as separate intervals? - */ - @Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false) - public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL; - - /** - * For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'. 
- */ - @Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false) - public int intervalPadding = 0; -} diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java deleted file mode 100644 index 9253e1ee5..000000000 --- a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java +++ /dev/null @@ -1,108 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.commandline; - -import com.google.java.contract.Requires; -import org.broad.tribble.AbstractFeatureReader; -import org.broad.tribble.Feature; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.FeatureReader; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.interval.IntervalUtils; - -import java.util.*; - -/** - * An IntervalBinding representing a walker argument that gets bound to either a ROD track or interval string. - * - * The IntervalBinding is a formal GATK argument that bridges between a walker and - * the engine to construct intervals for traversal at runtime. The IntervalBinding can - * either be a RodBinding, a string of one interval, or a file with interval strings. - * The GATK Engine takes care of initializing the binding when appropriate and determining intervals from it. - * - * Note that this class is immutable. 
- */ -public final class IntervalBinding { - - private RodBinding featureIntervals; - private String stringIntervals; - - @Requires({"type != null", "rawName != null", "source != null", "tribbleType != null", "tags != null"}) - public IntervalBinding(Class type, final String rawName, final String source, final String tribbleType, final Tags tags) { - featureIntervals = new RodBinding(type, rawName, source, tribbleType, tags); - } - - @Requires({"intervalArgument != null"}) - public IntervalBinding(String intervalArgument) { - stringIntervals = intervalArgument; - } - - public String getSource() { - if ( featureIntervals != null ) - return featureIntervals.getSource(); - return stringIntervals; - } - - public List getIntervals(final GenomeAnalysisEngine toolkit) { - return getIntervals(toolkit.getGenomeLocParser()); - } - - public List getIntervals(final GenomeLocParser genomeLocParser) { - List intervals; - - if ( featureIntervals != null ) { - intervals = new ArrayList(); - - // TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files - - final FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec(); - if ( codec instanceof ReferenceDependentFeatureCodec ) - ((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(genomeLocParser); - try { - FeatureReader reader = AbstractFeatureReader.getFeatureReader(featureIntervals.getSource(), codec, false); - for ( Feature feature : reader.iterator() ) - intervals.add(genomeLocParser.createGenomeLoc(feature)); - } catch (Exception e) { - throw new UserException.MalformedFile(featureIntervals.getSource(), "Problem reading the interval file", e); - } - - } else { - intervals = IntervalUtils.parseIntervalArguments(genomeLocParser, stringIntervals); - } - - Collections.sort(intervals); - return intervals; - } - - public String toString() { - return getSource(); - } -} diff --git 
a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java deleted file mode 100644 index 5e863f4f7..000000000 --- a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java +++ /dev/null @@ -1,735 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.commandline; - -import com.google.java.contract.Requires; -import org.apache.commons.io.FileUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.classloader.JVMUtils; -import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.ApplicationDetails; -import org.broadinstitute.sting.utils.help.HelpFormatter; - -import java.io.File; -import java.io.IOException; -import java.lang.reflect.Field; -import java.util.*; - -/** - * A parser for Sting command-line arguments. - */ -public class ParsingEngine { - /** - * The loaded argument sources along with their back definitions. - */ - private Map argumentSourcesByDefinition = new HashMap(); - - /** - * A list of defined arguments against which command lines are matched. - * Package protected for testing access. - */ - public ArgumentDefinitions argumentDefinitions = new ArgumentDefinitions(); - - /** - * A list of matches from defined arguments to command-line text. - * Indicates as best as possible where command-line text remains unmatched - * to existing arguments. - */ - private ArgumentMatches argumentMatches = null; - - /** - * Techniques for parsing and for argument lookup. - */ - private List parsingMethods = new ArrayList(); - - /** - * All of the RodBinding objects we've seen while parsing - */ - private List rodBindings = new ArrayList(); - - /** - * Class reference to the different types of descriptors that the create method can create. - * The type of set used must be ordered (but not necessarily sorted). 
- */ - private static final Set STANDARD_ARGUMENT_TYPE_DESCRIPTORS = new LinkedHashSet( Arrays.asList(new SimpleArgumentTypeDescriptor(), - new IntervalBindingArgumentTypeDescriptor(), - new RodBindingArgumentTypeDescriptor(), - new CompoundArgumentTypeDescriptor(), - new MultiplexArgumentTypeDescriptor()) ); - - private Set argumentTypeDescriptors = new LinkedHashSet(); - - /** - * List of tags associated with the given instantiation of the command-line argument. - */ - private final Map tags = new IdentityHashMap(); - - private PluginManager argumentProviderPluginManager = - new PluginManager(ParsingEngineArgumentProvider.class); - - /** - * our log, which we want to capture anything from org.broadinstitute.sting - */ - protected static Logger logger = Logger.getLogger(ParsingEngine.class); - - public ParsingEngine( CommandLineProgram clp ) { - RodBinding.resetNameCounter(); - parsingMethods.add( ParsingMethod.FullNameParsingMethod ); - parsingMethods.add( ParsingMethod.ShortNameParsingMethod ); - - // Order matters here! Make sure the clp's new type descriptors go in before the original type descriptors. - if(clp != null) - argumentTypeDescriptors.addAll(clp.getArgumentTypeDescriptors()); - argumentTypeDescriptors.addAll(STANDARD_ARGUMENT_TYPE_DESCRIPTORS); - - List> providers = argumentProviderPluginManager.getPlugins(); - for (Class provider: providers) { - addArgumentSource(provider); - } - } - - /** - * Add a main argument source. Argument sources are expected to have - * any number of fields with an @Argument annotation attached. - * @param source An argument source from which to extract command-line arguments. - */ - public void addArgumentSource( Class source ) { - addArgumentSource(null, source); - } - - public ArgumentMatches getArgumentMatches() { - return argumentMatches; - } - - /** - * Add an argument source. Argument sources are expected to have - * any number of fields with an @Argument annotation attached. 
- * @param sourceName name for this argument source. 'Null' indicates that this source should be treated - * as the main module. - * @param sourceClass A class containing argument sources from which to extract command-line arguments. - */ - public void addArgumentSource( String sourceName, Class sourceClass ) { - List argumentsFromSource = new ArrayList(); - for( ArgumentSource argumentSource: extractArgumentSources(sourceClass) ) { - List argumentDefinitions = argumentSource.createArgumentDefinitions(); - for(ArgumentDefinition argumentDefinition: argumentDefinitions) { - argumentSourcesByDefinition.put(argumentDefinition,argumentSource); - argumentsFromSource.add( argumentDefinition ); - } - } - argumentDefinitions.add( new ArgumentDefinitionGroup(sourceName, argumentsFromSource) ); - } - - /** - * Do a cursory search to see if an argument with the given name is present. - * @param argumentFullName full name of the argument. - * @return True if the argument is present. False otherwise. - */ - public boolean isArgumentPresent( String argumentFullName ) { - ArgumentDefinition definition = - argumentDefinitions.findArgumentDefinition(argumentFullName,ArgumentDefinitions.FullNameDefinitionMatcher); - return argumentMatches.hasMatch(definition); - - } - - /** - * Parse the given set of command-line arguments, returning - * an ArgumentMatches object describing the best fit of these - * command-line arguments to the arguments that are actually - * required. - * @param tokens Tokens passed on the command line. - * @return The parsed arguments by file. 
- */ - public SortedMap parse( String[] tokens ) { - argumentMatches = new ArgumentMatches(); - SortedMap parsedArgs = new TreeMap(); - - List cmdLineTokens = Arrays.asList(tokens); - parse(ArgumentMatchSource.COMMAND_LINE, cmdLineTokens, argumentMatches, parsedArgs); - - List providers = argumentProviderPluginManager.createAllTypes(); - - for (ParsingEngineArgumentProvider provider: providers) { - // Load the arguments ONLY into the provider. - // Validation may optionally run on the rest of the arguments. - loadArgumentsIntoObject(provider); - } - - for (ParsingEngineArgumentProvider provider: providers) { - provider.parse(this, parsedArgs); - } - - return parsedArgs; - } - - public void parse(ArgumentMatchSource matchSource, List tokens, - ArgumentMatches argumentMatches, SortedMap parsedArgs) { - ArgumentMatchSite lastArgumentMatchSite = new ArgumentMatchSite(matchSource, -1); - - int i = 0; - for (String token: tokens) { - // If the token is of argument form, parse it into its own argument match. - // Otherwise, pair it with the most recently used argument discovered. 
- ArgumentMatchSite site = new ArgumentMatchSite(matchSource, i); - if( isArgumentForm(token) ) { - ArgumentMatch argumentMatch = parseArgument( token, site ); - if( argumentMatch != null ) { - argumentMatches.mergeInto( argumentMatch ); - lastArgumentMatchSite = site; - } - } - else { - if( argumentMatches.hasMatch(lastArgumentMatchSite) && - !argumentMatches.getMatch(lastArgumentMatchSite).hasValueAtSite(lastArgumentMatchSite)) - argumentMatches.getMatch(lastArgumentMatchSite).addValue( lastArgumentMatchSite, new ArgumentMatchStringValue(token) ); - else - argumentMatches.MissingArgument.addValue( site, new ArgumentMatchStringValue(token) ); - - } - i++; - } - - parsedArgs.put(matchSource, new ParsedListArgs(tokens)); - } - - public void parsePairs(ArgumentMatchSource matchSource, List> tokens, - ArgumentMatches argumentMatches, ParsedArgs matchSourceArgs, - SortedMap parsedArgs) { - int i = 0; - for (Pair pair: tokens) { - - ArgumentMatchSite site = new ArgumentMatchSite(matchSource, i); - List matchers = Arrays.asList(ArgumentDefinitions.FullNameDefinitionMatcher, ArgumentDefinitions.ShortNameDefinitionMatcher); - ArgumentDefinition definition = null; - for (DefinitionMatcher matcher: matchers) { - definition = argumentDefinitions.findArgumentDefinition( pair.getFirst(), matcher ); - if (definition != null) - break; - } - if (definition == null) - continue; - ArgumentMatch argumentMatch = new ArgumentMatch(pair.getFirst(), definition, site, new Tags()); - argumentMatches.mergeInto(argumentMatch); - argumentMatch.addValue(site, pair.getSecond()); - i++; - } - - parsedArgs.put(matchSource, matchSourceArgs); - } - - protected List getArguments(File file) { - try { - if (file.getAbsolutePath().endsWith(".list")) { - return getListArguments(file); - } - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(file, e); - } - throw new UserException.CouldNotReadInputFile(file, "file extension is not .list"); - } - - private List getListArguments(File 
file) throws IOException { - ArrayList argsList = new ArrayList(); - for (String line: FileUtils.readLines(file)) - argsList.addAll(Arrays.asList(Utils.escapeExpressions(line))); - return argsList; - } - - public enum ValidationType { MissingRequiredArgument, - InvalidArgument, - InvalidArgumentValue, - ValueMissingArgument, - TooManyValuesForArgument, - MutuallyExclusive } - - /** - * Validates the list of command-line argument matches. - */ - public void validate() { - validate( EnumSet.noneOf(ValidationType.class) ); - } - - /** - * Validates the list of command-line argument matches. On failure throws an exception with detailed info about the - * particular failures. Takes an EnumSet indicating which validation checks to skip. - * @param skipValidationOf List of validation checks to skip. - */ - public void validate( EnumSet skipValidationOf ) { - // Find missing required arguments. - if( !skipValidationOf.contains(ValidationType.MissingRequiredArgument) ) { - Collection requiredArguments = - argumentDefinitions.findArgumentDefinitions( true, ArgumentDefinitions.RequiredDefinitionMatcher ); - Collection missingArguments = new ArrayList(); - for( ArgumentDefinition requiredArgument: requiredArguments ) { - if( !argumentMatches.hasMatch(requiredArgument) ) - missingArguments.add( requiredArgument ); - } - - if( missingArguments.size() > 0 ) - throw new MissingArgumentException( missingArguments ); - } - - // Find invalid arguments. Invalid arguments will have a null argument definition. - if( !skipValidationOf.contains(ValidationType.InvalidArgument) ) { - ArgumentMatches invalidArguments = argumentMatches.findUnmatched(); - if( invalidArguments.size() > 0 ) - throw new InvalidArgumentException( invalidArguments ); - } - - // Find invalid argument values -- invalid arguments are either completely missing or fail the specified 'validation' regular expression. 
- if( !skipValidationOf.contains(ValidationType.InvalidArgumentValue) ) { - Collection verifiableArguments = - argumentDefinitions.findArgumentDefinitions( null, ArgumentDefinitions.VerifiableDefinitionMatcher ); - Collection> invalidValues = new ArrayList>(); - for( ArgumentDefinition verifiableArgument: verifiableArguments ) { - ArgumentMatches verifiableMatches = argumentMatches.findMatches( verifiableArgument ); - // Check to see whether an argument value was specified. Argument values must be provided - // when the argument name is specified and the argument is not a flag type. - for(ArgumentMatch verifiableMatch: verifiableMatches) { - ArgumentSource argumentSource = argumentSourcesByDefinition.get(verifiableArgument); - if(verifiableMatch.values().size() == 0 && !verifiableArgument.isFlag && argumentSource.createsTypeDefault()) - invalidValues.add(new Pair(verifiableArgument,null)); - } - - // Ensure that the field contents meet the validation criteria specified by the regular expression. - for( ArgumentMatch verifiableMatch: verifiableMatches ) { - for( ArgumentMatchValue value: verifiableMatch.values() ) { - if( verifiableArgument.validation != null && !value.asString().matches(verifiableArgument.validation) ) - invalidValues.add( new Pair(verifiableArgument, value.asString()) ); - } - } - } - - if( invalidValues.size() > 0 ) - throw new InvalidArgumentValueException( invalidValues ); - } - - // Find values without an associated mate. - if( !skipValidationOf.contains(ValidationType.ValueMissingArgument) ) { - if( argumentMatches.MissingArgument.values().size() > 0 ) - throw new UnmatchedArgumentException( argumentMatches.MissingArgument ); - } - - // Find arguments with too many values. 
- if( !skipValidationOf.contains(ValidationType.TooManyValuesForArgument)) { - Collection overvaluedArguments = new ArrayList(); - for( ArgumentMatch argumentMatch: argumentMatches.findSuccessfulMatches() ) { - // Warning: assumes that definition is not null (asserted by checks above). - if( !argumentMatch.definition.isMultiValued && argumentMatch.values().size() > 1 ) - overvaluedArguments.add(argumentMatch); - } - - if( !overvaluedArguments.isEmpty() ) - throw new TooManyValuesForArgumentException(overvaluedArguments); - } - - // Find sets of options that are supposed to be mutually exclusive. - if( !skipValidationOf.contains(ValidationType.MutuallyExclusive)) { - Collection> invalidPairs = new ArrayList>(); - for( ArgumentMatch argumentMatch: argumentMatches.findSuccessfulMatches() ) { - if( argumentMatch.definition.exclusiveOf != null ) { - for( ArgumentMatch conflictingMatch: argumentMatches.findSuccessfulMatches() ) { - // Skip over the current element. - if( argumentMatch == conflictingMatch ) - continue; - if( argumentMatch.definition.exclusiveOf.equals(conflictingMatch.definition.fullName) || - argumentMatch.definition.exclusiveOf.equals(conflictingMatch.definition.shortName)) - invalidPairs.add( new Pair(argumentMatch, conflictingMatch) ); - } - } - } - - if( !invalidPairs.isEmpty() ) - throw new ArgumentsAreMutuallyExclusiveException( invalidPairs ); - } - } - - /** - * Loads a set of matched command-line arguments into the given object. - * @param object Object into which to add arguments. 
- */ - public void loadArgumentsIntoObject( Object object ) { - List argumentSources = extractArgumentSources(object.getClass()); - - List dependentArguments = new ArrayList(); - - for( ArgumentSource argumentSource: argumentSources ) { - if(argumentSource.isDeprecated() && argumentMatches.findMatches(this,argumentSource).size() > 0) - notifyDeprecatedCommandLineArgument(argumentSource); - - // If this argument source depends on other command-line arguments, skip it and make a note to process it later. - if(argumentSource.isDependent()) { - dependentArguments.add(argumentSource); - continue; - } - loadValueIntoObject( argumentSource, object, argumentMatches.findMatches(this,argumentSource) ); - } - - for(ArgumentSource dependentArgument: dependentArguments) { - MultiplexArgumentTypeDescriptor dependentDescriptor = dependentArgument.createDependentTypeDescriptor(this,object); - ArgumentSource dependentSource = dependentArgument.copyWithCustomTypeDescriptor(dependentDescriptor); - loadValueIntoObject(dependentSource,object,argumentMatches.findMatches(this,dependentSource)); - } - } - - /** - * Notify the user that tags have been created. - * @param key The key created. - * @param tags List of tags, or empty list if no tags are present. - */ - public void addTags(Object key, final Tags tags) { - this.tags.put(key,tags); - } - - /** - * Gets the tags associated with a given object. - * @param key Key for which to find a tag. - * @return List of tags associated with this key. - */ - public Tags getTags(Object key) { - if(!tags.containsKey(key)) - return new Tags(); - return tags.get(key); - } - - /** - * Add a RodBinding type argument to this parser. Called during parsing to allow - * us to track all of the RodBindings discovered in the command line. - * @param rodBinding the rodbinding to add. 
Must not be added twice - */ - @Requires("rodBinding != null") - public void addRodBinding(final RodBinding rodBinding) { - rodBindings.add(rodBinding); - } - - /** - * Notify the user that a deprecated command-line argument has been used. - * @param argumentSource Deprecated argument source specified by user. - */ - private void notifyDeprecatedCommandLineArgument(ArgumentSource argumentSource) { - // Grab the first argument definition and report that one as the failure. Theoretically, we should notify of all failures. - List definitions = argumentSource.createArgumentDefinitions(); - if(definitions.size() < 1) - throw new ReviewedStingException("Internal error. Argument source creates no definitions."); - ArgumentDefinition definition = definitions.get(0); - throw new UserException.DeprecatedArgument(definition.fullName,definition.doc); - } - - /** - * Loads a single argument into the object and that objects children. - * @param argumentMatches Argument matches to load into the object. - * @param source Argument source to load into the object. - * @param instance Object into which to inject the value. The target might be in a container within the instance. - */ - private void loadValueIntoObject( ArgumentSource source, Object instance, ArgumentMatches argumentMatches ) { - // Nothing to load - if( argumentMatches.size() == 0 && ! source.createsTypeDefault() ) - return; - - // Target instance into which to inject the value. - Collection targets = findTargets( source, instance ); - - // Abort if no home is found for the object. - if( targets.size() == 0 ) - throw new ReviewedStingException("Internal command-line parser error: unable to find a home for argument matches " + argumentMatches); - - for( Object target: targets ) { - Object value = (argumentMatches.size() != 0) ? 
source.parse(this,argumentMatches) : source.createTypeDefault(this); - - JVMUtils.setFieldValue(source.field,target,value); - } - } - - public Collection getRodBindings() { - return Collections.unmodifiableCollection(rodBindings); - } - - /** - * Gets a collection of the container instances of the given type stored within the given target. - * @param source Argument source. - * @param instance Container. - * @return A collection of containers matching the given argument source. - */ - private Collection findTargets(ArgumentSource source, Object instance) { - LinkedHashSet targets = new LinkedHashSet(); - for( Class clazz = instance.getClass(); clazz != null; clazz = clazz.getSuperclass() ) { - for( Field field: clazz.getDeclaredFields() ) { - if( field.equals(source.field) ) { - targets.add(instance); - } else if( field.isAnnotationPresent(ArgumentCollection.class) ) { - targets.addAll(findTargets(source, JVMUtils.getFieldValue(field, instance))); - } - } - } - return targets; - } - - /** - * Prints out the help associated with these command-line argument definitions. - * @param applicationDetails Details about the specific GATK-based application being run. - */ - public void printHelp( ApplicationDetails applicationDetails ) { - new HelpFormatter().printHelp(applicationDetails,argumentDefinitions); - } - - /** - * Extract all the argument sources from a given object. - * @param sourceClass class to act as sources for other arguments. - * @return A list of sources associated with this object and its aggregated objects. - */ - public List extractArgumentSources(Class sourceClass) { - return extractArgumentSources(sourceClass, new Field[0]); - } - - /** - * Fetch the best command-line argument descriptor for the given class. - * @param type Class for which to specify a descriptor. - * @return descriptor for the given type. 
- */ - public ArgumentTypeDescriptor selectBestTypeDescriptor(Class type) { - return ArgumentTypeDescriptor.selectBest(argumentTypeDescriptors,type); - } - - private List extractArgumentSources(Class sourceClass, Field[] parentFields) { - // now simply call into the truly general routine extract argument bindings but with a null - // object so bindings aren't computed - Map bindings = extractArgumentBindings(null, sourceClass, parentFields); - return new ArrayList(bindings.keySet()); - } - - public Map extractArgumentBindings(Object obj) { - if ( obj == null ) throw new IllegalArgumentException("Incoming object cannot be null"); - return extractArgumentBindings(obj, obj.getClass(), new Field[0]); - } - - /** - * Extract all the argument sources from a given object, along with their bindings if obj != null . - * @param obj the object corresponding to the sourceClass - * @param sourceClass class to act as sources for other arguments. - * @param parentFields Parent Fields - * @return A map of sources associated with this object and its aggregated objects and bindings to their bindings values - */ - private Map extractArgumentBindings(Object obj, Class sourceClass, Field[] parentFields) { - Map bindings = new LinkedHashMap(); - - while( sourceClass != null ) { - Field[] fields = sourceClass.getDeclaredFields(); - for( Field field: fields ) { - if( ArgumentTypeDescriptor.isArgumentAnnotationPresent(field) ) { - Object val = obj != null ? JVMUtils.getFieldValue(field, obj) : null; - bindings.put( new ArgumentSource(parentFields, field, selectBestTypeDescriptor(field.getType())), val ); - } - if( field.isAnnotationPresent(ArgumentCollection.class) ) { - Object val = obj != null ? 
JVMUtils.getFieldValue(field, obj) : null; - Field[] newParentFields = Arrays.copyOf(parentFields, parentFields.length + 1); - newParentFields[parentFields.length] = field; - bindings.putAll( extractArgumentBindings(val, field.getType(), newParentFields) ); - } - } - - sourceClass = sourceClass.getSuperclass(); - } - - return bindings; - } - - /** - * Determines whether a token looks like the name of an argument. - * @param token Token to inspect. Can be surrounded by whitespace. - * @return True if token is of short name form. - */ - private boolean isArgumentForm( String token ) { - for( ParsingMethod parsingMethod: parsingMethods ) { - if( parsingMethod.matches(token) ) - return true; - } - - return false; - } - - /** - * Parse a short name into an ArgumentMatch. - * @param token The token to parse. The token should pass the isLongArgumentForm test. - * @param position The position of the token in question. - * @return ArgumentMatch associated with this token, or null if no match exists. - */ - private ArgumentMatch parseArgument( String token, ArgumentMatchSite position ) { - if( !isArgumentForm(token) ) - throw new IllegalArgumentException( "Token is not recognizable as an argument: " + token ); - - for( ParsingMethod parsingMethod: parsingMethods ) { - if( parsingMethod.matches( token ) ) - return parsingMethod.match( argumentDefinitions, token, position ); - } - - // No parse results found. - return null; - } -} - -/** - * An exception indicating that some required arguments are missing. 
- */ -class MissingArgumentException extends ArgumentException { - public MissingArgumentException( Collection missingArguments ) { - super( formatArguments(missingArguments) ); - } - - private static String formatArguments( Collection missingArguments ) { - StringBuilder sb = new StringBuilder(); - for( ArgumentDefinition missingArgument: missingArguments ) { - if( missingArgument.shortName != null ) - sb.append( String.format("%nArgument with name '--%s' (-%s) is missing.", missingArgument.fullName, missingArgument.shortName) ); - else - sb.append( String.format("%nArgument with name '--%s' is missing.", missingArgument.fullName) ); - } - return sb.toString(); - } -} - -/** - * An exception for undefined arguments. - */ -class InvalidArgumentException extends ArgumentException { - public InvalidArgumentException( ArgumentMatches invalidArguments ) { - super( formatArguments(invalidArguments) ); - } - - private static String formatArguments( ArgumentMatches invalidArguments ) { - StringBuilder sb = new StringBuilder(); - for( ArgumentMatch invalidArgument: invalidArguments ) - sb.append( String.format("%nArgument with name '%s' isn't defined.", invalidArgument.label) ); - return sb.toString(); - } -} - -/** - * An exception for values whose format is invalid. 
- */ -class InvalidArgumentValueException extends ArgumentException { - public InvalidArgumentValueException( Collection> invalidArgumentValues ) { - super( formatArguments(invalidArgumentValues) ); - } - - private static String formatArguments( Collection> invalidArgumentValues ) { - StringBuilder sb = new StringBuilder(); - for( Pair invalidValue: invalidArgumentValues ) { - if(invalidValue.getSecond() == null) - sb.append( String.format("%nArgument '--%s' requires a value but none was provided", - invalidValue.first.fullName) ); - else - sb.append( String.format("%nArgument '--%s' has value of incorrect format: %s (should match %s)", - invalidValue.first.fullName, - invalidValue.second, - invalidValue.first.validation) ); - } - return sb.toString(); - } -} - - -/** - * An exception for values that can't be mated with any argument. - */ -class UnmatchedArgumentException extends ArgumentException { - public UnmatchedArgumentException( ArgumentMatch invalidValues ) { - super( formatArguments(invalidValues) ); - } - - private static String formatArguments( ArgumentMatch invalidValues ) { - StringBuilder sb = new StringBuilder(); - for( ArgumentMatchSite site: invalidValues.sites.keySet() ) - for( ArgumentMatchValue value: invalidValues.sites.get(site) ) { - switch (site.getSource().getType()) { - case CommandLine: - sb.append( String.format("%nInvalid argument value '%s' at position %d.", - value.asString(), site.getIndex()) ); - break; - case Provider: - sb.append( String.format("%nInvalid argument value '%s' in %s at position %d.", - value.asString(), site.getSource().getDescription(), site.getIndex()) ); - break; - default: - throw new RuntimeException( String.format("Unexpected argument match source type: %s", - site.getSource().getType())); - } - if(value.asString() != null && Utils.dupString(' ',value.asString().length()).equals(value.asString())) - sb.append(" Please make sure any line continuation backslashes on your command line are not followed by 
whitespace."); - } - return sb.toString(); - } -} - -/** - * An exception indicating that too many values have been provided for the given argument. - */ -class TooManyValuesForArgumentException extends ArgumentException { - public TooManyValuesForArgumentException( Collection arguments ) { - super( formatArguments(arguments) ); - } - - private static String formatArguments( Collection arguments ) { - StringBuilder sb = new StringBuilder(); - for( ArgumentMatch argument: arguments ) - sb.append( String.format("%nArgument '%s' has too many values: %s.", argument.label, Arrays.deepToString(argument.values().toArray())) ); - return sb.toString(); - } -} - -/** - * An exception indicating that mutually exclusive options have been passed in the same command line. - */ -class ArgumentsAreMutuallyExclusiveException extends ArgumentException { - public ArgumentsAreMutuallyExclusiveException( Collection> arguments ) { - super( formatArguments(arguments) ); - } - - private static String formatArguments( Collection> arguments ) { - StringBuilder sb = new StringBuilder(); - for( Pair argument: arguments ) - sb.append( String.format("%nArguments '%s' and '%s' are mutually exclusive.", argument.first.definition.fullName, argument.second.definition.fullName ) ); - return sb.toString(); - } - -} - - -/** - * An exception for when an argument doesn't match an of the enumerated options for that var type - */ -class UnknownEnumeratedValueException extends ArgumentException { - public UnknownEnumeratedValueException(ArgumentDefinition definition, String argumentPassed) { - super( formatArguments(definition,argumentPassed) ); - } - - private static String formatArguments(ArgumentDefinition definition, String argumentPassed) { - return String.format("Invalid value %s specified for argument %s; valid options are (%s).", argumentPassed, definition.fullName, Utils.join(",",definition.validOptions)); - } -} diff --git a/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java 
b/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java deleted file mode 100644 index ef8e01df4..000000000 --- a/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java +++ /dev/null @@ -1,197 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.commandline; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broad.tribble.Feature; - -import java.util.*; - -/** - * A RodBinding representing a walker argument that gets bound to a ROD track. - * - * The RodBinding is a formal GATK argument that bridges between a walker and - * the RefMetaDataTracker to obtain data about this rod track at runtime. The RodBinding - * is explicitly typed with type of the Tribble.Feature expected to be produced by this - * argument. 
The GATK Engine takes care of initializing the binding and connecting it - * to the RMD system. - * - * It is recommended that optional RodBindings be initialized to the value returned - * by the static method makeUnbound(). - * - * Note that this class is immutable. - */ -public final class RodBinding { - protected final static String UNBOUND_VARIABLE_NAME = ""; - protected final static String UNBOUND_SOURCE = "UNBOUND"; - protected final static String UNBOUND_TRIBBLE_TYPE = ""; - - /** - * Create an unbound Rodbinding of type. This is the correct programming - * style for an optional RodBinding - * - * At Input() - * RodBinding x = RodBinding.makeUnbound(T.class) - * - * The unbound binding is guaranteed to never match any binding. It uniquely - * returns false to isBound(). - * - * @param type the Class type produced by this unbound object - * @param any class extending Tribble Feature - * @return the UNBOUND RodBinding producing objects of type T - */ - @Requires("type != null") - protected final static RodBinding makeUnbound(Class type) { - return new RodBinding(type); - } - - /** The name of this binding. Often the name of the field itself, but can be overridden on cmdline */ - final private String name; - /** where the data for this ROD is coming from. A file or special value if coming from stdin */ - final private String source; - /** the string name of the tribble type, such as vcf, bed, etc. */ - final private String tribbleType; - /** The command line tags associated with this RodBinding */ - final private Tags tags; - /** The Java class expected for this RodBinding. Must correspond to the type emited by Tribble */ - final private Class type; - /** True for all RodBindings except the special UNBOUND binding, which is the default for optional arguments */ - final private boolean bound; - - /** - * The name counter. This is how we create unique names for collections of RodBindings - * on the command line. 
If you have provide the GATK with -X file1 and -X file2 to a - * RodBinding argument as List> then each binding will receive automatically - * the name of X and X2. - */ - final private static Map nameCounter = new HashMap(); - - /** for UnitTests */ - final public static void resetNameCounter() { - nameCounter.clear(); - } - - @Requires("rawName != null") - @Ensures("result != null") - final private static synchronized String countedVariableName(final String rawName) { - Integer count = nameCounter.get(rawName); - if ( count == null ) { - nameCounter.put(rawName, 1); - return rawName; - } else { - nameCounter.put(rawName, count + 1); - return rawName + (count + 1); - } - } - - @Requires({"type != null", "rawName != null", "source != null", "tribbleType != null", "tags != null"}) - public RodBinding(Class type, final String rawName, final String source, final String tribbleType, final Tags tags) { - this.type = type; - this.name = countedVariableName(rawName); - this.source = source; - this.tribbleType = tribbleType; - this.tags = tags; - this.bound = true; - } - - /** - * For testing purposes only. Creates a RodBinding sufficient for looking up associations to rawName - * @param type - * @param rawName - */ - public RodBinding(Class type, final String rawName) { - this(type, rawName, "missing", type.getSimpleName(), new Tags()); - } - - /** - * Make an unbound RodBinding. 
Only available for creating the globally unique UNBOUND object - * @param type class this unbound RodBinding creates - */ - @Requires({"type != null"}) - private RodBinding(Class type) { - this.type = type; - this.name = UNBOUND_VARIABLE_NAME; // special value can never be found in RefMetaDataTracker - this.source = UNBOUND_SOURCE; - this.tribbleType = UNBOUND_TRIBBLE_TYPE; - this.tags = new Tags(); - this.bound = false; - } - - - /** - * @return True for all RodBindings except the special UNBOUND binding, which is the default for optional arguments - */ - final public boolean isBound() { - return bound; - } - - /** - * @return The name of this binding. Often the name of the field itself, but can be overridden on cmdline - */ - @Ensures({"result != null"}) - final public String getName() { - return name; - } - - /** - * @return the string name of the tribble type, such as vcf, bed, etc. - */ - @Ensures({"result != null"}) - final public Class getType() { - return type; - } - - /** - * @return where the data for this ROD is coming from. A file or special value if coming from stdin - */ - @Ensures({"result != null"}) - final public String getSource() { - return source; - } - - /** - * @return The command line tags associated with this RodBinding. Will include the tags used to - * determine the name and type of this RodBinding - */ - @Ensures({"result != null"}) - final public Tags getTags() { - return tags; - } - - /** - * @return The Java class expected for this RodBinding. 
Must correspond to the type emited by Tribble - */ - @Ensures({"result != null"}) - final public String getTribbleType() { - return tribbleType; - } - - @Override - public String toString() { - return String.format("(RodBinding name=%s source=%s)", getName(), getSource()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java deleted file mode 100644 index 111786e63..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java +++ /dev/null @@ -1,221 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; -import org.broadinstitute.sting.gatk.filters.ReadFilter; -import org.broadinstitute.sting.gatk.io.stubs.OutputStreamArgumentTypeDescriptor; -import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor; -import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor; -import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; -import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.crypt.CryptUtils; -import org.broadinstitute.sting.utils.crypt.GATKKey; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.ListFileUtils; - -import java.security.PublicKey; -import java.util.*; - -/** - * @author aaron - */ -public abstract class CommandLineExecutable extends CommandLineProgram { - /** - * The actual engine which performs the analysis. - */ - protected GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - - // get the analysis name - public abstract String getAnalysisName(); - - /** - * Gets the GATK argument bundle. - * @return A structure consisting of whatever arguments should be used to initialize the GATK engine. - */ - protected abstract GATKArgumentCollection getArgumentCollection(); - - /** - * A list of all the arguments initially used as sources. - */ - private final Collection argumentSources = new ArrayList(); - - protected static Logger logger = Logger.getLogger(CommandLineExecutable.class); - - /** - * this is the function that the inheriting class can expect to have called - * when the command line system has initialized. 
- * - * @return the return code to exit the program with - */ - protected int execute() throws Exception { - engine.setParser(parser); - argumentSources.add(this); - - Walker walker = engine.getWalkerByName(getAnalysisName()); - - try { - // Make sure a valid GATK user key is present, if required. - authorizeGATKRun(); - - engine.setArguments(getArgumentCollection()); - - // File lists can require a bit of additional expansion. Set these explicitly by the engine. - engine.setSAMFileIDs(ListFileUtils.unpackBAMFileList(getArgumentCollection().samFiles,parser)); - - engine.setWalker(walker); - walker.setToolkit(engine); - - Collection filters = engine.createFilters(); - engine.setFilters(filters); - - // load the arguments into the walker / filters. - // TODO: The fact that this extra load call exists here when all the parsing happens at the engine - // TODO: level indicates that we're doing something wrong. Turn this around so that the GATK can drive - // TODO: argument processing. - loadArgumentsIntoObject(walker); - argumentSources.add(walker); - - Collection rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser); - engine.setReferenceMetaDataFiles(rodBindings); - - for (ReadFilter filter: filters) { - loadArgumentsIntoObject(filter); - argumentSources.add(filter); - } - - engine.execute(); - generateGATKRunReport(walker); - } catch ( Exception e ) { - generateGATKRunReport(walker, e); - throw e; - } - - // always return 0 - return 0; - } - - /** - * Authorizes this run of the GATK by checking for a valid GATK user key, if required. - * Currently, a key is required only if running with the -et NO_ET or -et STDOUT options. 
- */ - private void authorizeGATKRun() { - if ( getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.NO_ET || - getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.STDOUT ) { - if ( getArgumentCollection().gatkKeyFile == null ) { - throw new UserException("Running with the -et NO_ET or -et STDOUT option requires a GATK Key file. " + - "Please see " + UserException.PHONE_HOME_DOCS_URL + - " for more information and instructions on how to obtain a key."); - } - else { - PublicKey gatkPublicKey = CryptUtils.loadGATKDistributedPublicKey(); - GATKKey gatkUserKey = new GATKKey(gatkPublicKey, getArgumentCollection().gatkKeyFile); - - if ( ! gatkUserKey.isValid() ) { - throw new UserException.KeySignatureVerificationException(getArgumentCollection().gatkKeyFile); - } - } - } - } - - /** - * Generate the GATK run report for this walker using the current GATKEngine, if -et is enabled. - * This report will be written to either STDOUT or to the run repository, depending on the options - * for -et. - * - * @param e the exception, can be null if no exception occurred - */ - private void generateGATKRunReport(Walker walker, Exception e) { - if ( getArgumentCollection().phoneHomeType != GATKRunReport.PhoneHomeOption.NO_ET ) { - GATKRunReport report = new GATKRunReport(walker, e, engine, getArgumentCollection().phoneHomeType ); - report.postReport(getArgumentCollection().phoneHomeType); - } - } - - /** - * Convenience method for fully parameterized generateGATKRunReport when an exception has - * not occurred - * - * @param walker - */ - private void generateGATKRunReport(Walker walker) { - generateGATKRunReport(walker, null); - } - - /** - * Subclasses of CommandLinePrograms can provide their own types of command-line arguments. - * @return A collection of type descriptors generating implementation-dependent placeholders. 
- */ - protected Collection getArgumentTypeDescriptors() { - return Arrays.asList( new VCFWriterArgumentTypeDescriptor(engine,System.out,argumentSources), - new SAMFileWriterArgumentTypeDescriptor(engine,System.out), - new OutputStreamArgumentTypeDescriptor(engine,System.out) ); - } - - /** - * GATK can add arguments dynamically based on analysis type. - * - * @return true - */ - @Override - protected boolean canAddArgumentsDynamically() { - return true; - } - - /** - * GATK provides the walker as an argument source. - * @return List of walkers to load dynamically. - */ - @Override - protected Class[] getArgumentSources() { - // No walker info? No plugins. - if (getAnalysisName() == null) return new Class[] {}; - - Collection argumentSources = new ArrayList(); - - Walker walker = engine.getWalkerByName(getAnalysisName()); - engine.setArguments(getArgumentCollection()); - engine.setWalker(walker); - walker.setToolkit(engine); - argumentSources.add(walker.getClass()); - - Collection filters = engine.createFilters(); - for(ReadFilter filter: filters) - argumentSources.add(filter.getClass()); - - Class[] argumentSourcesAsArray = new Class[argumentSources.size()]; - return argumentSources.toArray(argumentSourcesAsArray); - } - - @Override - protected String getArgumentSourceName( Class argumentSource ) { - return engine.getWalkerName((Class)argumentSource); - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java deleted file mode 100644 index 5fc0ccd3e..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ /dev/null @@ -1,369 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, 
modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk; - -import net.sf.picard.PicardException; -import net.sf.samtools.SAMException; -import org.broad.tribble.TribbleException; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.CommandLineProgram; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; -import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; -import org.broadinstitute.sting.gatk.walkers.Attribution; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.*; -import org.broadinstitute.sting.utils.text.TextFormattingUtils; - -import java.util.*; - -/** - * All command line parameters accepted by all tools in the GATK. - * - * The GATK engine itself. Manages map/reduce data access and runs walkers. - * - * We run command line GATK programs using this class. It gets the command line args, parses them, and hands the - * gatk all the parsed out information. 
Pretty much anything dealing with the underlying system should go here, - * the gatk engine should deal with any data related information. - */ -@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_ENGINE) -public class CommandLineGATK extends CommandLineExecutable { - @Argument(fullName = "analysis_type", shortName = "T", doc = "Type of analysis to run") - private String analysisName = null; - - // our argument collection, the collection of command line args we accept - @ArgumentCollection - private GATKArgumentCollection argCollection = new GATKArgumentCollection(); - - /** - * Get pleasing info about the GATK. - * - * @return A list of Strings that contain pleasant info about the GATK. - */ - @Override - protected ApplicationDetails getApplicationDetails() { - return new ApplicationDetails(createApplicationHeader(), - getAttribution(), - ApplicationDetails.createDefaultRunningInstructions(getClass()), - getAdditionalHelp()); - } - - @Override - public String getAnalysisName() { - return analysisName; - } - - @Override - protected GATKArgumentCollection getArgumentCollection() { - return argCollection; - } - - /** - * Required main method implementation. - */ - public static void main(String[] argv) { - try { - CommandLineGATK instance = new CommandLineGATK(); - start(instance, argv); - System.exit(CommandLineProgram.result); // todo -- this is a painful hack - } catch (UserException e) { - exitSystemWithUserError(e); - } catch (TribbleException e) { - // We can generate Tribble Exceptions in weird places when e.g. VCF genotype fields are - // lazy loaded, so they aren't caught elsewhere and made into User Exceptions - exitSystemWithUserError(e); - } catch(PicardException e) { - // TODO: Should Picard exceptions be, in general, UserExceptions or ReviewedStingExceptions? 
- exitSystemWithError(e); - } catch (SAMException e) { - checkForMaskedUserErrors(e); - exitSystemWithSamError(e); - } catch (OutOfMemoryError e) { - exitSystemWithUserError(new UserException.NotEnoughMemory()); - } catch (Throwable t) { - checkForMaskedUserErrors(t); - exitSystemWithError(t); - } - } - - public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file"; - public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files"; - public static final String NO_SPACE_LEFT_ON_DEVICE_ERROR = "No space left on device"; - public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded"; - - private static void checkForMaskedUserErrors(final Throwable t) { - // masked out of memory error - if ( t instanceof OutOfMemoryError ) - exitSystemWithUserError(new UserException.NotEnoughMemory()); - // masked user error - if ( t instanceof UserException || t instanceof TribbleException ) - exitSystemWithUserError(new UserException(t.getMessage())); - - // no message means no masked error - final String message = t.getMessage(); - if ( message == null ) - return; - - // too many open files error - if ( message.contains("Too many open files") ) - exitSystemWithUserError(new UserException.TooManyOpenFiles()); - - // malformed BAM looks like a SAM file - if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) - exitSystemWithSamError(t); - - // can't close tribble index when writing - if ( message.contains("Unable to close index for") ) - exitSystemWithUserError(new UserException(t.getCause() == null ? 
message : t.getCause().getMessage())); - - // disk is full - if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) ) - exitSystemWithUserError(new UserException.NoSpaceOnDevice()); - - // masked error wrapped in another one - if ( t.getCause() != null ) - checkForMaskedUserErrors(t.getCause()); - } - - /** - * Creates the a short blurb about the GATK, copyright info, and where to get documentation. - * - * @return The application header. - */ - public static List createApplicationHeader() { - List header = new ArrayList(); - header.add(String.format("The Genome Analysis Toolkit (GATK) v%s, Compiled %s",getVersionNumber(), getBuildTime())); - header.add("Copyright (c) 2010 The Broad Institute"); - header.add("For support and documentation go to " + HelpConstants.BASE_GATK_URL); - return header; - } - - public static String getVersionNumber() { - ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); - return headerInfo.containsKey("org.broadinstitute.sting.gatk.version") ? headerInfo.getString("org.broadinstitute.sting.gatk.version") : ""; - } - - public static String getBuildTime() { - ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); - return headerInfo.containsKey("build.timestamp") ? headerInfo.getString("build.timestamp") : ""; - } - - /** - * If the user supplied any additional attribution, return it here. - * @return Additional attribution if supplied by the user. Empty (non-null) list otherwise. - */ - private List getAttribution() { - List attributionLines = new ArrayList(); - - // If no analysis name is present, fill in extra help on the walkers. 
- WalkerManager walkerManager = engine.getWalkerManager(); - String analysisName = getAnalysisName(); - if(analysisName != null && walkerManager.exists(analysisName)) { - Class walkerType = walkerManager.getWalkerClassByName(analysisName); - if(walkerType.isAnnotationPresent(Attribution.class)) - attributionLines.addAll(Arrays.asList(walkerType.getAnnotation(Attribution.class).value())); - } - return attributionLines; - } - - /** - * Retrieves additional information about GATK walkers. - * the code in HelpFormatter and supply it as a helper to this method. - * - * @return A string summarizing the walkers available in this distribution. - */ - private String getAdditionalHelp() { - String additionalHelp; - - // If no analysis name is present, fill in extra help on the walkers. - WalkerManager walkerManager = engine.getWalkerManager(); - String analysisName = getAnalysisName(); - if(analysisName != null && walkerManager.exists(getAnalysisName())) - additionalHelp = getWalkerHelp(walkerManager.getWalkerClassByName(getAnalysisName())); - else - additionalHelp = getAllWalkerHelp(); - - return additionalHelp; - } - - private static final int PACKAGE_INDENT = 1; - private static final int WALKER_INDENT = 3; - private static final String FIELD_SEPARATOR = " "; - - private String getWalkerHelp(Class walkerType) { - // Construct a help string to output details on this walker. - StringBuilder additionalHelp = new StringBuilder(); - Formatter formatter = new Formatter(additionalHelp); - - formatter.format("Available Reference Ordered Data types:%n"); - formatter.format(new FeatureManager().userFriendlyListOfAvailableFeatures()); - formatter.format("%n"); - - formatter.format("For a full description of this walker, see its GATKdocs at:%n"); - formatter.format("%s%n", GATKDocUtils.helpLinksToGATKDocs(walkerType)); - - return additionalHelp.toString(); - } - - /** - * Load in additional help information about all available walkers. 
- * @return A string representation of the additional help. - */ - private String getAllWalkerHelp() { - // Construct a help string to output available walkers. - StringBuilder additionalHelp = new StringBuilder(); - Formatter formatter = new Formatter(additionalHelp); - - // Get the list of walker names from the walker manager. - WalkerManager walkerManager = engine.getWalkerManager(); - - // Build a list sorted by walker display name. As this information is collected, keep track of the longest - // package / walker name for later formatting. - SortedSet helpText = new TreeSet(new HelpEntryComparator()); - - int longestPackageName = 0; - int longestWalkerName = 0; - for(Map.Entry>> walkersByPackage: walkerManager.getWalkerNamesByPackage(true).entrySet()) { - // Get the display name. - String packageName = walkersByPackage.getKey(); - String packageDisplayName = walkerManager.getPackageDisplayName(walkersByPackage.getKey()); - String packageHelpText = walkerManager.getPackageSummaryText(packageName); - - // Compute statistics about which names is longest. - longestPackageName = Math.max(longestPackageName,packageDisplayName.length()); - - SortedSet walkersInPackage = new TreeSet(new HelpEntryComparator()); - for(Class walkerType: walkersByPackage.getValue()) { - String walkerName = walkerType.getName(); - String walkerDisplayName = walkerManager.getName(walkerType); - String walkerHelpText = walkerManager.getWalkerSummaryText(walkerType); - - longestWalkerName = Math.max(longestWalkerName,walkerManager.getName(walkerType).length()); - - walkersInPackage.add(new HelpEntry(walkerName,walkerDisplayName,walkerHelpText)); - } - - // Dump the walkers into the sorted set. 
- helpText.add(new HelpEntry(packageName,packageDisplayName,packageHelpText,Collections.unmodifiableSortedSet(walkersInPackage))); - } - - final int headerWidth = Math.max(longestPackageName+PACKAGE_INDENT,longestWalkerName+WALKER_INDENT); - - - for(HelpEntry packageHelp: helpText) { - printDescriptorLine(formatter,PACKAGE_INDENT,packageHelp.displayName,headerWidth,FIELD_SEPARATOR,packageHelp.summary,TextFormattingUtils.DEFAULT_LINE_WIDTH); - - for(HelpEntry walkerHelp: packageHelp.children) - printDescriptorLine(formatter,WALKER_INDENT,walkerHelp.displayName,headerWidth,FIELD_SEPARATOR,walkerHelp.summary,TextFormattingUtils.DEFAULT_LINE_WIDTH); - - // Print a blank line between sets of walkers. - printDescriptorLine(formatter,0,"",headerWidth,FIELD_SEPARATOR,"", TextFormattingUtils.DEFAULT_LINE_WIDTH); - } - - return additionalHelp.toString(); - } - - private void printDescriptorLine(Formatter formatter, - int headerIndentWidth, - String header, - int headerWidth, - String fieldSeparator, - String description, - int lineWidth) { - final int headerPaddingWidth = headerWidth - header.length() - headerIndentWidth; - final int descriptionWidth = lineWidth - fieldSeparator.length() - headerWidth; - List wordWrappedText = TextFormattingUtils.wordWrap(description,descriptionWidth); - - String headerIndentFormatString = headerIndentWidth > 0 ? "%" + headerIndentWidth + "s" : "%s"; - String headerPaddingFormatString = headerPaddingWidth > 0 ? "%" + headerPaddingWidth + "s" : "%s"; - String headerWidthFormatString = headerWidth > 0 ? "%" + headerWidth + "s" : "%s"; - - // Output description line. 
- formatter.format(headerIndentFormatString + "%s" + headerPaddingFormatString + "%s%s%n", - "", header, "", fieldSeparator, wordWrappedText.size()>0?wordWrappedText.get(0):""); - for(int i = 1; i < wordWrappedText.size(); i++) - formatter.format(headerWidthFormatString + "%s%s%n", "", fieldSeparator, wordWrappedText.get(i)); - } - -} - -/** - * Represents a given help entry; contains a display name, a summary and optionally some children. - */ -class HelpEntry { - public final String uid; - public final String displayName; - public final String summary; - public final SortedSet children; - - /** - * Create a new help entry with the given display name, summary and children. - * @param uid a unique identifier. Usually, the java package. - * @param displayName display name for this help entry. - * @param summary summary for this help entry. - * @param children children for this help entry. - */ - public HelpEntry(String uid, String displayName, String summary, SortedSet children) { - this.uid = uid; - this.displayName = displayName; - this.summary = summary; - this.children = children; - } - - /** - * Create a new help entry with the given display name, summary and children. - * @param uid a unique identifier. Usually, the java package. - * @param displayName display name for this help entry. - * @param summary summary for this help entry. - */ - public HelpEntry(String uid, String displayName, String summary) { - this(uid,displayName,summary,null); - } - -} - -/** - * Compare two help entries by display name. - */ -class HelpEntryComparator implements Comparator { - private static TextFormattingUtils.CaseInsensitiveComparator textComparator = new TextFormattingUtils.CaseInsensitiveComparator(); - - /** - * Compares the order of lhs to rhs, not taking case into account. - * @param lhs First object to compare. - * @param rhs Second object to compare. - * @return 0 if objects are identical; -1 if lhs is before rhs, 1 if rhs is before lhs. 
Nulls are treated as after everything else. - */ - public int compare(HelpEntry lhs, HelpEntry rhs) { - if(lhs == null && rhs == null) return 0; - if(lhs == null || lhs.displayName.equals("")) return 1; - if(rhs == null || rhs.displayName.equals("")) return -1; - return lhs.displayName.equals(rhs.displayName) ? textComparator.compare(lhs.uid,rhs.uid) : textComparator.compare(lhs.displayName,rhs.displayName); - } - - -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java deleted file mode 100644 index 27b030060..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ /dev/null @@ -1,1232 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk; - -import com.google.java.contract.Ensures; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMSequenceDictionary; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; -import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.datasources.reads.*; -import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; -import org.broadinstitute.sting.gatk.executive.MicroScheduler; -import org.broadinstitute.sting.gatk.filters.FilterManager; -import org.broadinstitute.sting.gatk.filters.ReadFilter; -import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; -import org.broadinstitute.sting.gatk.io.OutputTracker; -import org.broadinstitute.sting.gatk.io.stubs.Stub; -import org.broadinstitute.sting.gatk.iterators.ReadTransformer; -import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode; -import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; -import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; -import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.gatk.samples.SampleDB; -import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.classloader.PluginManager; -import 
org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; -import org.broadinstitute.sting.utils.recalibration.BQSRArgumentSet; -import org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; -import java.util.concurrent.TimeUnit; - -import static org.broadinstitute.sting.utils.DeprecatedToolChecks.getWalkerDeprecationInfo; -import static org.broadinstitute.sting.utils.DeprecatedToolChecks.isDeprecatedWalker; - -/** - * A GenomeAnalysisEngine that runs a specified walker. - */ -public class GenomeAnalysisEngine { - /** - * our log, which we want to capture anything from this class - */ - private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class); - public static final long NO_RUNTIME_LIMIT = -1; - - /** - * The GATK command-line argument parsing code. - */ - private ParsingEngine parsingEngine; - - /** - * The genomeLocParser can create and parse GenomeLocs. - */ - private GenomeLocParser genomeLocParser; - - /** - * Accessor for sharded read data. - */ - private SAMDataSource readsDataSource = null; - - /** - * Accessor for sharded reference data. - */ - private ReferenceDataSource referenceDataSource = null; - - /** - * Accessor for sample metadata - */ - private SampleDB sampleDB = null; - - /** - * Accessor for sharded reference-ordered data. - */ - private List rodDataSources; - - // our argument collection - private GATKArgumentCollection argCollection; - - /** - * Collection of intervals used by the engine. 
- */ - private GenomeLocSortedSet intervals = null; - - /** - * Explicitly assign the interval set to use for this traversal (for unit testing purposes) - * @param intervals set of intervals to use for this traversal - */ - public void setIntervals( GenomeLocSortedSet intervals ) { - this.intervals = intervals; - } - - /** - * Collection of inputs used by the engine. - */ - private Map inputs = new HashMap(); - - /** - * Collection of outputs used by the engine. - */ - private Collection> outputs = new ArrayList>(); - - /** - * Collection of the filters applied to the input data. - */ - private Collection filters; - - /** - * Collection of the read transformers applied to the reads - */ - private List readTransformers; - - /** - * Controls the allocation of threads between CPU vs IO. - */ - private ThreadAllocation threadAllocation; - - private ReadMetrics cumulativeMetrics = null; - - /** - * A currently hacky unique name for this GATK instance - */ - private String myName = "GATK_" + Math.abs(getRandomGenerator().nextInt()); - - /** - * our walker manager - */ - private final WalkerManager walkerManager = new WalkerManager(); - - private Walker walker; - - public void setWalker(Walker walker) { - this.walker = walker; - } - - /** - * The short name of the current GATK walker as a string - * @return a non-null String - */ - public String getWalkerName() { - return getWalkerName(walker.getClass()); - } - - /** - * A processed collection of SAM reader identifiers. - */ - private Collection samReaderIDs = Collections.emptyList(); - - /** - * Set the SAM/BAM files over which to traverse. - * @param samReaderIDs Collection of ids to use during this traversal. - */ - public void setSAMFileIDs(Collection samReaderIDs) { - this.samReaderIDs = samReaderIDs; - } - - /** - * Collection of reference metadata files over which to traverse. - */ - private Collection referenceMetaDataFiles; - - /** - * The threading efficiency monitor we use in the GATK to monitor our efficiency. 
- * - * May be null if one isn't active, or hasn't be initialized yet - */ - private ThreadEfficiencyMonitor threadEfficiencyMonitor = null; - - /** - * The global progress meter we are using to track our progress through the genome - */ - private ProgressMeter progressMeter = null; - - /** - * Set the reference metadata files to use for this traversal. - * @param referenceMetaDataFiles Collection of files and descriptors over which to traverse. - */ - public void setReferenceMetaDataFiles(Collection referenceMetaDataFiles) { - this.referenceMetaDataFiles = referenceMetaDataFiles; - } - - /** - * The maximum runtime of this engine, in nanoseconds, set during engine initialization - * from the GATKArgumentCollection command line value - */ - private long runtimeLimitInNanoseconds = -1; - - /** - * Static random number generator and seed. - */ - private static final long GATK_RANDOM_SEED = 47382911L; - private static Random randomGenerator = new Random(GATK_RANDOM_SEED); - public static Random getRandomGenerator() { return randomGenerator; } - public static void resetRandomGenerator() { randomGenerator.setSeed(GATK_RANDOM_SEED); } - public static void resetRandomGenerator(long seed) { randomGenerator.setSeed(seed); } - - /** - * Base Quality Score Recalibration helper object - */ - private BQSRArgumentSet bqsrArgumentSet = null; - public BQSRArgumentSet getBQSRArgumentSet() { return bqsrArgumentSet; } - public boolean hasBQSRArgumentSet() { return bqsrArgumentSet != null; } - public void setBaseRecalibration(final GATKArgumentCollection args) { - bqsrArgumentSet = new BQSRArgumentSet(args); - } - - /** - * Actually run the GATK with the specified walker. - * - * @return the value of this traversal. 
- */ - public Object execute() { - // first thing is to make sure the AWS keys can be decrypted - GATKRunReport.checkAWSAreValid(); - - //HeapSizeMonitor monitor = new HeapSizeMonitor(); - //monitor.start(); - setStartTime(new java.util.Date()); - - final GATKArgumentCollection args = this.getArguments(); - - // validate our parameters - if (args == null) { - throw new ReviewedStingException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null."); - } - - // validate our parameters - if (this.walker == null) - throw new ReviewedStingException("The walker passed to GenomeAnalysisEngine can not be null."); - - if (args.nonDeterministicRandomSeed) - resetRandomGenerator(System.currentTimeMillis()); - - // if the use specified an input BQSR recalibration table then enable on the fly recalibration - if (args.BQSR_RECAL_FILE != null) - setBaseRecalibration(args); - - // setup the runtime limits - setupRuntimeLimits(args); - - // Determine how the threads should be divided between CPU vs. IO. - determineThreadAllocation(); - - // Prepare the data for traversal. 
- initializeDataSources(); - - // initialize and validate the interval list - initializeIntervals(); - validateSuppliedIntervals(); - - // check to make sure that all sequence dictionaries are compatible with the reference's sequence dictionary - validateDataSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), rodDataSources); - - // initialize sampleDB - initializeSampleDB(); - - // our microscheduler, which is in charge of running everything - MicroScheduler microScheduler = createMicroscheduler(); - threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor(); - - // create temp directories as necessary - initializeTempDirectory(); - - // create the output streams - initializeOutputStreams(microScheduler.getOutputTracker()); - - // Initializing the shard iterator / BAM schedule might take some time, so let the user know vaguely what's going on - logger.info("Preparing for traversal" + - (readsDataSource.getReaderIDs().size() > 0 ? String.format(" over %d BAM files", readsDataSource.getReaderIDs().size()) : "")); - Iterable shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); - logger.info("Done preparing for traversal"); - - // execute the microscheduler, storing the results - return microScheduler.execute(this.walker, shardStrategy); - - //monitor.stop(); - //logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed())); - - //return result; - } - - /** - * Retrieves an instance of the walker based on the walker name. - * - * @param walkerName Name of the walker. Must not be null. If the walker cannot be instantiated, an exception will be thrown. - * @return An instance of the walker. 
- */ - public Walker getWalkerByName(String walkerName) { - try { - return walkerManager.createByName(walkerName); - } catch ( UserException e ) { - if ( isDeprecatedWalker(walkerName) ) { - e = new UserException.DeprecatedWalker(walkerName, getWalkerDeprecationInfo(walkerName)); - } - throw e; - } - } - - /** - * Gets the name of a given walker type. - * @param walkerType Type of walker. - * @return Name of the walker. - */ - public String getWalkerName(Class walkerType) { - return walkerManager.getName(walkerType); - } - - public String getName() { - return myName; - } - - /** - * Gets a list of the filters to associate with the given walker. Will NOT initialize the engine with this filters; - * the caller must handle that directly. - * @return A collection of available filters. - */ - public Collection createFilters() { - final List filters = new LinkedList<>(); - - // First add the user requested filters - if (this.getArguments().readGroupBlackList != null && this.getArguments().readGroupBlackList.size() > 0) - filters.add(new ReadGroupBlackListFilter(this.getArguments().readGroupBlackList)); - for(final String filterName: this.getArguments().readFilters) - filters.add(this.getFilterManager().createByName(filterName)); - - // now add the walker default filters. 
This ordering is critical important if - // users need to apply filters that fix up reads that would be removed by default walker filters - filters.addAll(WalkerManager.getReadFilters(walker,this.getFilterManager())); - - return Collections.unmodifiableList(filters); - } - - /** - * Returns a list of active, initialized read transformers - * - * @param walker the walker we need to apply read transformers too - */ - public void initializeReadTransformers(final Walker walker) { - // keep a list of the active read transformers sorted based on priority ordering - List activeTransformers = new ArrayList(); - - final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class); - final ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null; - - final PluginManager pluginManager = new PluginManager(ReadTransformer.class); - - for ( final ReadTransformer transformer : pluginManager.createAllTypes() ) { - transformer.initialize(overrideTime, this, walker); - if ( transformer.enabled() ) - activeTransformers.add(transformer); - } - - setReadTransformers(activeTransformers); - } - - public List getReadTransformers() { - return readTransformers; - } - - /* - * Sanity checks that incompatible read transformers are not active together (and throws an exception if they are). 
- * - * @param readTransformers the active read transformers - */ - protected void checkActiveReadTransformers(final List readTransformers) { - if ( readTransformers == null ) - throw new IllegalArgumentException("read transformers cannot be null"); - - ReadTransformer sawMustBeFirst = null; - ReadTransformer sawMustBeLast = null; - - for ( final ReadTransformer r : readTransformers ) { - if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_FIRST ) { - if ( sawMustBeFirst != null ) - throw new UserException.IncompatibleReadFiltersException(sawMustBeFirst.toString(), r.toString()); - sawMustBeFirst = r; - } else if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_LAST ) { - if ( sawMustBeLast != null ) - throw new UserException.IncompatibleReadFiltersException(sawMustBeLast.toString(), r.toString()); - sawMustBeLast = r; - } - } - } - - protected void setReadTransformers(final List readTransformers) { - if ( readTransformers == null ) - throw new ReviewedStingException("read transformers cannot be null"); - - // sort them in priority order - Collections.sort(readTransformers, new ReadTransformer.ReadTransformerComparator()); - - // make sure we don't have an invalid set of active read transformers - checkActiveReadTransformers(readTransformers); - - this.readTransformers = readTransformers; - } - - /** - * Parse out the thread allocation from the given command-line argument. 
- */ - private void determineThreadAllocation() { - if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads); - if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread); - if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads); - - this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads, - argCollection.numberOfCPUThreadsPerDataThread, - argCollection.numberOfIOThreads, - argCollection.monitorThreadEfficiency); - } - - public int getTotalNumberOfThreads() { - return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads(); - } - - - - /** - * Allow subclasses and others within this package direct access to the walker manager. - * @return The walker manager used by this package. - */ - protected WalkerManager getWalkerManager() { - return walkerManager; - } - - /** - * setup a microscheduler - * - * @return a new microscheduler - */ - private MicroScheduler createMicroscheduler() { - // Temporarily require all walkers to have a reference, even if that reference is not conceptually necessary. 
- if ((walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker) && - this.getArguments().referenceFile == null) { - throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given"); - } - - return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation); - } - - protected DownsamplingMethod getDownsamplingMethod() { - GATKArgumentCollection argCollection = this.getArguments(); - - DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod(); - DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker); - - DownsamplingMethod method = commandLineMethod != null ? commandLineMethod : walkerMethod; - method.checkCompatibilityWithWalker(walker); - return method; - } - - protected void setDownsamplingMethod(DownsamplingMethod method) { - argCollection.setDownsamplingMethod(method); - } - - protected boolean includeReadsWithDeletionAtLoci() { - return walker.includeReadsWithDeletionAtLoci(); - } - - /** - * Verifies that the supplied set of reads files mesh with what the walker says it requires, - * and also makes sure that there were no duplicate SAM files specified on the command line. - */ - protected void validateSuppliedReads() { - GATKArgumentCollection arguments = this.getArguments(); - // Check what the walker says is required against what was provided on the command line. - if (WalkerManager.isRequired(walker, DataSource.READS) && (arguments.samFiles == null || arguments.samFiles.size() == 0)) - throw new ArgumentException("Walker requires reads but none were provided."); - - // Check what the walker says is allowed against what was provided on the command line. 
- if ((arguments.samFiles != null && arguments.samFiles.size() > 0) && !WalkerManager.isAllowed(walker, DataSource.READS)) - throw new ArgumentException("Walker does not allow reads but reads were provided."); - - // Make sure no SAM files were specified multiple times by the user. - checkForDuplicateSamFiles(); - } - - /** - * Checks whether there are SAM files that appear multiple times in the fully unpacked list of - * SAM files (samReaderIDs). If there are, throws an ArgumentException listing the files in question. - */ - protected void checkForDuplicateSamFiles() { - Set encounteredSamFiles = new HashSet(); - Set duplicateSamFiles = new LinkedHashSet(); - - for ( SAMReaderID samFile : samReaderIDs ) { - if ( encounteredSamFiles.contains(samFile) ) { - duplicateSamFiles.add(samFile.getSamFilePath()); - } - else { - encounteredSamFiles.add(samFile); - } - } - - if ( duplicateSamFiles.size() > 0 ) { - throw new UserException("The following BAM files appear multiple times in the list of input files: " + - duplicateSamFiles + " BAM files may be specified at most once."); - } - } - - /** - * Verifies that the supplied reference file mesh with what the walker says it requires. - */ - protected void validateSuppliedReference() { - GATKArgumentCollection arguments = this.getArguments(); - // Check what the walker says is required against what was provided on the command line. - // TODO: Temporarily disabling WalkerManager.isRequired check on the reference because the reference is always required. - if (/*WalkerManager.isRequired(walker, DataSource.REFERENCE) &&*/ arguments.referenceFile == null) - throw new ArgumentException("Walker requires a reference but none was provided."); - - // Check what the walker says is allowed against what was provided on the command line. 
- if (arguments.referenceFile != null && !WalkerManager.isAllowed(walker, DataSource.REFERENCE)) - throw new ArgumentException("Walker does not allow a reference but one was provided."); - } - - protected void validateSuppliedIntervals() { - // Only read walkers support '-L unmapped' intervals. Trap and validate any other instances of -L unmapped. - if(!(walker instanceof ReadWalker)) { - GenomeLocSortedSet intervals = getIntervals(); - if(intervals != null && getIntervals().contains(GenomeLoc.UNMAPPED)) - throw new ArgumentException("Interval list specifies unmapped region. Only read walkers may include the unmapped region."); - } - - // If intervals is non-null and empty at this point, it means that the list of intervals to process - // was filtered down to an empty set (eg., the user specified something like -L chr1 -XL chr1). Since - // this was very likely unintentional, the user should be informed of this. Note that this is different - // from the case where intervals == null, which indicates that there were no interval arguments. - if ( intervals != null && intervals.isEmpty() ) { - logger.warn("The given combination of -L and -XL options results in an empty set. No intervals to process."); - } - - // TODO: add a check for ActiveRegion walkers to prevent users from passing an entire contig/chromosome - } - - /** - * Get the sharding strategy given a driving data source. - * - * @param readsDataSource readsDataSource - * @param drivingDataSource Data on which to shard. - * @param intervals intervals - * @return the sharding strategy - */ - protected Iterable getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { - ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); - DownsamplingMethod downsamplingMethod = readsDataSource != null ? 
readsDataSource.getReadsInfo().getDownsamplingMethod() : null; - ReferenceDataSource referenceDataSource = this.getReferenceDataSource(); - - // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition. - if(!readsDataSource.isEmpty()) { - if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM)) - throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported."); - if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) - throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available."); - - if(walker instanceof LocusWalker) { - if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer()); - else - return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); - } - else if(walker instanceof ActiveRegionWalker) { - if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. 
Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(new ActiveRegionShardBalancer()); - else - return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new ActiveRegionShardBalancer()); - } - else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { - // Apply special validation to read pair walkers. - if(walker instanceof ReadPairWalker) { - if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker."); - if(intervals != null && !intervals.isEmpty()) - throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); - } - - if(intervals == null) - return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); - else - return readsDataSource.createShardIteratorOverIntervals(intervals, new ReadShardBalancer()); - } - else - throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName()); - } - else { - // TODO -- Determine what the ideal shard size should be here. Matt suggested that a multiple of 16K might work well - // TODO -- (because of how VCF indexes work), but my empirical experience has been simply that the larger the shard - // TODO -- size the more efficient the traversal (at least for RODWalkers). Keeping the previous values for now. [EB] - final int SHARD_SIZE = walker instanceof RodWalker ? 
1000000 : 100000; - if(intervals == null) - return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE); - else - return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE); - } - } - - protected boolean flashbackData() { - return walker instanceof ReadWalker; - } - - /** - * Create the temp directory if it doesn't exist. - */ - private void initializeTempDirectory() { - File tempDir = new File(System.getProperty("java.io.tmpdir")); - if (!tempDir.exists() && !tempDir.mkdirs()) - throw new UserException.BadTmpDir("Unable to create directory"); - } - - /** - * Initialize the output streams as specified by the user. - * - * @param outputTracker the tracker supplying the initialization data. - */ - private void initializeOutputStreams(OutputTracker outputTracker) { - for (Map.Entry input : getInputs().entrySet()) - outputTracker.addInput(input.getKey(), input.getValue()); - for (Stub stub : getOutputs()) - outputTracker.addOutput(stub); - - outputTracker.prepareWalker(walker, getArguments().strictnessLevel); - } - - public ReferenceDataSource getReferenceDataSource() { - return referenceDataSource; - } - - public GenomeLocParser getGenomeLocParser() { - return genomeLocParser; - } - - /** - * Manage lists of filters. - */ - private final FilterManager filterManager = new FilterManager(); - - private Date startTime = null; // the start time for execution - - public void setParser(ParsingEngine parsingEngine) { - this.parsingEngine = parsingEngine; - } - - /** - * Explicitly set the GenomeLocParser, for unit testing. - * @param genomeLocParser GenomeLocParser to use. 
- */ - public void setGenomeLocParser(GenomeLocParser genomeLocParser) { - this.genomeLocParser = genomeLocParser; - } - - /** - * Sets the start time when the execute() function was last called - * @param startTime the start time when the execute() function was last called - */ - protected void setStartTime(Date startTime) { - this.startTime = startTime; - } - - /** - * @return the start time when the execute() function was last called - */ - public Date getStartTime() { - return startTime; - } - - /** - * Setup the intervals to be processed - */ - protected void initializeIntervals() { - intervals = IntervalUtils.parseIntervalArguments(this.referenceDataSource, argCollection.intervalArguments); - } - - /** - * Add additional, externally managed IO streams for inputs. - * - * @param argumentSource Field into which to inject the value. - * @param value Instance to inject. - */ - public void addInput(ArgumentSource argumentSource, Object value) { - inputs.put(argumentSource, value); - } - - /** - * Add additional, externally managed IO streams for output. - * - * @param stub Instance to inject. - */ - public void addOutput(Stub stub) { - outputs.add(stub); - } - - /** - * Returns the tag associated with a given command-line argument. - * @param key Object for which to inspect the tag. - * @return Tags object associated with the given key, or an empty Tag structure if none are present. 
- */ - public Tags getTags(Object key) { - return parsingEngine.getTags(key); - } - - protected void initializeDataSources() { - logger.info("Strictness is " + argCollection.strictnessLevel); - - validateSuppliedReference(); - setReferenceDataSource(argCollection.referenceFile); - - validateSuppliedReads(); - initializeReadTransformers(walker); - - readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference()); - - for (ReadFilter filter : filters) - filter.initialize(this); - - // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference - rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe); - } - - /** - * Purely for testing purposes. Do not use unless you absolutely positively know what you are doing (or - * need to absolutely positively kill everyone in the room) - * @param dataSource - */ - public void setReadsDataSource(final SAMDataSource dataSource) { - this.readsDataSource = dataSource; - } - - /** - * Entry-point function to initialize the samples database from input data and pedigree arguments - */ - private void initializeSampleDB() { - SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType); - sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader()); - sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this)); - sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles); - sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings); - sampleDB = sampleDBBuilder.getFinalSampleDB(); - } - - /** - * Gets a unique identifier for the reader sourcing this read. - * @param read Read to examine. - * @return A unique identifier for the source file of this read. Exception if not found. 
- */ - public SAMReaderID getReaderIDForRead(final SAMRecord read) { - return getReadsDataSource().getReaderID(read); - } - - /** - * Gets the source file for this read. - * @param id Unique identifier determining which input file to use. - * @return The source filename for this read. - */ - public File getSourceFileForReaderID(final SAMReaderID id) { - return getReadsDataSource().getSAMFile(id); - } - - /** - * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available). - * - * @param reads Reads data source. - * @param reference Reference data source. - * @param rods a collection of the reference ordered data tracks - */ - private void validateDataSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection rods) { - if ((reads.isEmpty() && (rods == null || rods.isEmpty())) || reference == null ) - return; - - // Compile a set of sequence names that exist in the reference file. - SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary(); - - if (!reads.isEmpty()) { - // Compile a set of sequence names that exist in the BAM files. - SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); - - if (readsDictionary.size() == 0) { - logger.info("Reads file is unmapped. Skipping validation against reference."); - return; - } - - // compare the reads to the reference - SequenceDictionaryUtils.validateDictionaries(logger, getArguments().unsafe, "reads", readsDictionary, - "reference", referenceDictionary, true, intervals); - } - - for (ReferenceOrderedDataSource rod : rods) - IndexDictionaryUtils.validateTrackSequenceDictionary(rod.getName(), rod.getSequenceDictionary(), referenceDictionary, getArguments().unsafe); - } - - /** - * Gets a data source for the given set of reads. 
- * - * @param argCollection arguments - * @param genomeLocParser parser - * @param refReader reader - * @return A data source for the given set of reads. - */ - private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) { - DownsamplingMethod downsamplingMethod = getDownsamplingMethod(); - - // Synchronize the method back into the collection so that it shows up when - // interrogating for the downsampling method during command line recreation. - setDownsamplingMethod(downsamplingMethod); - - logger.info(downsamplingMethod); - - if (argCollection.removeProgramRecords && argCollection.keepProgramRecords) - throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options"); - - boolean removeProgramRecords = argCollection.removeProgramRecords || walker.getClass().isAnnotationPresent(RemoveProgramRecords.class); - - if (argCollection.keepProgramRecords) - removeProgramRecords = false; - - final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker; - - final Map sampleRenameMap = argCollection.sampleRenameMappingFile != null ? - loadSampleRenameMap(argCollection.sampleRenameMappingFile) : - null; - - return new SAMDataSource( - samReaderIDs, - threadAllocation, - argCollection.numberOfBAMFileHandles, - genomeLocParser, - argCollection.useOriginalBaseQualities, - argCollection.strictnessLevel, - argCollection.readBufferSize, - downsamplingMethod, - new ValidationExclusion(Arrays.asList(argCollection.unsafe)), - filters, - readTransformers, - includeReadsWithDeletionAtLoci(), - argCollection.defaultBaseQualities, - removeProgramRecords, - keepReadsInLIBS, - sampleRenameMap); - } - - /** - * Loads a user-provided sample rename map file for use in on-the-fly sample renaming into an in-memory - * HashMap. 
This file must consist of lines with two whitespace-separated fields: - * - * absolute_path_to_bam_file new_sample_name - * - * The engine will verify that each bam file contains reads from only one sample when the on-the-fly sample - * renaming feature is being used. - * - * @param sampleRenameMapFile sample rename map file from which to load data - * @return a HashMap containing the contents of the map file, with the keys being the bam file paths and - * the values being the new sample names. - */ - protected Map loadSampleRenameMap( final File sampleRenameMapFile ) { - logger.info("Renaming samples from BAM files on-the-fly using mapping file " + sampleRenameMapFile.getAbsolutePath()); - - final Map sampleRenameMap = new HashMap<>((int)sampleRenameMapFile.length() / 50); - - try { - for ( final String line : new XReadLines(sampleRenameMapFile) ) { - final String[] tokens = line.split("\\s+"); - - if ( tokens.length != 2 ) { - throw new UserException.MalformedFile(sampleRenameMapFile, - String.format("Encountered a line with %s fields instead of the required 2 fields. Line was: %s", - tokens.length, line)); - } - - final File bamFile = new File(tokens[0]); - final String newSampleName = tokens[1]; - - if ( ! bamFile.isAbsolute() ) { - throw new UserException.MalformedFile(sampleRenameMapFile, "Bam file path not absolute at line: " + line); - } - - final SAMReaderID bamID = new SAMReaderID(bamFile, new Tags()); - - if ( sampleRenameMap.containsKey(bamID) ) { - throw new UserException.MalformedFile(sampleRenameMapFile, - String.format("Bam file %s appears more than once", bamFile.getAbsolutePath())); - } - - sampleRenameMap.put(bamID, newSampleName); - } - } - catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(sampleRenameMapFile, e); - } - - return sampleRenameMap; - } - - - /** - * Opens a reference sequence file paired with an index. Only public for testing purposes - * - * @param refFile Handle to a reference sequence file. 
Non-null. - */ - public void setReferenceDataSource(File refFile) { - this.referenceDataSource = new ReferenceDataSource(refFile); - genomeLocParser = new GenomeLocParser(referenceDataSource.getReference()); - } - - /** - * Open the reference-ordered data sources. - * - * @param referenceMetaDataFiles collection of RMD descriptors to load and validate. - * @param sequenceDictionary GATK-wide sequnce dictionary to use for validation. - * @param genomeLocParser to use when creating and validating GenomeLocs. - * @param validationExclusionType potentially indicate which validations to include / exclude. - * - * @return A list of reference-ordered data sources. - */ - private List getReferenceOrderedDataSources(Collection referenceMetaDataFiles, - SAMSequenceDictionary sequenceDictionary, - GenomeLocParser genomeLocParser, - ValidationExclusion.TYPE validationExclusionType) { - final RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, validationExclusionType, - getArguments().disableAutoIndexCreationAndLockingWhenReadingRods); - - final List dataSources = new ArrayList(); - for (RMDTriplet fileDescriptor : referenceMetaDataFiles) - dataSources.add(new ReferenceOrderedDataSource(fileDescriptor, - builder, - sequenceDictionary, - genomeLocParser, - flashbackData())); - - return dataSources; - } - - /** - * Returns the SAM File Header from the input reads' data source file - * @return the SAM File Header from the input reads' data source file - */ - public SAMFileHeader getSAMFileHeader() { - return readsDataSource.getHeader(); - } - - public boolean lenientVCFProcessing() { - return lenientVCFProcessing(argCollection.unsafe); - } - - public static boolean lenientVCFProcessing(final ValidationExclusion.TYPE val) { - return val == ValidationExclusion.TYPE.ALL - || val == ValidationExclusion.TYPE.LENIENT_VCF_PROCESSING; - } - - /** - * Returns the unmerged SAM file header for an individual reader. - * @param reader The reader. 
- * @return Header for that reader or null if not available. - */ - public SAMFileHeader getSAMFileHeader(SAMReaderID reader) { - return readsDataSource == null ? null : readsDataSource.getHeader(reader); - } - - /** - * Returns an ordered list of the unmerged SAM file headers known to this engine. - * @return list of header for each input SAM file, in command line order - */ - public List getSAMFileHeaders() { - final List headers = new ArrayList(); - for ( final SAMReaderID id : getReadsDataSource().getReaderIDs() ) { - headers.add(getReadsDataSource().getHeader(id)); - } - return headers; - } - - /** - * Gets the master sequence dictionary for this GATK engine instance - * @return a never-null dictionary listing all of the contigs known to this engine instance - */ - public SAMSequenceDictionary getMasterSequenceDictionary() { - return getReferenceDataSource().getReference().getSequenceDictionary(); - } - - /** - * Returns data source object encapsulating all essential info and handlers used to traverse - * reads; header merger, individual file readers etc can be accessed through the returned data source object. - * - * @return the reads data source - */ - public SAMDataSource getReadsDataSource() { - return this.readsDataSource; - } - - /** - * Sets the collection of GATK main application arguments. - * - * @param argCollection the GATK argument collection - */ - public void setArguments(GATKArgumentCollection argCollection) { - this.argCollection = argCollection; - } - - /** - * Gets the collection of GATK main application arguments. - * - * @return the GATK argument collection - */ - public GATKArgumentCollection getArguments() { - return this.argCollection; - } - - /** - * Get the list of intervals passed to the engine. - * @return List of intervals, or null if no intervals are in use - */ - public GenomeLocSortedSet getIntervals() { - return this.intervals; - } - - /** - * Get the list of regions of the genome being processed. 
If the user - * requested specific intervals, return those, otherwise return regions - * corresponding to the entire genome. Never returns null. - * - * @return a non-null set of intervals being processed - */ - @Ensures("result != null") - public GenomeLocSortedSet getRegionsOfGenomeBeingProcessed() { - if ( getIntervals() == null ) - // if we don't have any intervals defined, create intervals from the reference itself - return GenomeLocSortedSet.createSetFromSequenceDictionary(getReferenceDataSource().getReference().getSequenceDictionary()); - else - return getIntervals(); - } - - /** - * Gets the list of filters employed by this engine. - * @return Collection of filters (actual instances) used by this engine. - */ - public Collection getFilters() { - return this.filters; - } - - /** - * Sets the list of filters employed by this engine. - * @param filters Collection of filters (actual instances) used by this engine. - */ - public void setFilters(Collection filters) { - this.filters = filters; - } - - /** - * Gets the filter manager for this engine. - * @return filter manager for this engine. - */ - protected FilterManager getFilterManager() { - return filterManager; - } - - /** - * Gets the input sources for this engine. - * @return input sources for this engine. - */ - protected Map getInputs() { - return inputs; - } - - /** - * Gets the output stubs for this engine. - * @return output stubs for this engine. - */ - protected Collection> getOutputs() { - return outputs; - } - - /** - * Returns data source objects encapsulating all rod data; - * individual rods can be accessed through the returned data source objects. - * - * @return the rods data sources - */ - public List getRodDataSources() { - return this.rodDataSources; - } - - /** - * Gets cumulative metrics about the entire run to this point. - * Returns a clone of this snapshot in time. - * @return cumulative metrics about the entire run at this point. 
ReadMetrics object is a unique instance and is - * owned by the caller; the caller can do with the object what they wish. - */ - public ReadMetrics getCumulativeMetrics() { - // todo -- probably shouldn't be lazy - if ( cumulativeMetrics == null ) - cumulativeMetrics = readsDataSource == null ? new ReadMetrics() : readsDataSource.getCumulativeReadMetrics(); - return cumulativeMetrics; - } - - /** - * Return the global ThreadEfficiencyMonitor, if there is one - * - * @return the monitor, or null if none is active - */ - public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { - return threadEfficiencyMonitor; - } - - // ------------------------------------------------------------------------------------- - // - // code for working with Samples database - // - // ------------------------------------------------------------------------------------- - - public SampleDB getSampleDB() { - return this.sampleDB; - } - - public Map getApproximateCommandLineArguments(Object... argumentProviders) { - return CommandLineUtils.getApproximateCommandLineArguments(parsingEngine,argumentProviders); - } - - public String createApproximateCommandLineArgumentString(Object... 
argumentProviders) { - return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine,argumentProviders); - } - - // ------------------------------------------------------------------------------------- - // - // code for working with progress meter - // - // ------------------------------------------------------------------------------------- - - /** - * Register the global progress meter with this engine - * - * Calling this function more than once will result in an IllegalStateException - * - * @param meter a non-null progress meter - */ - public void registerProgressMeter(final ProgressMeter meter) { - if ( meter == null ) throw new IllegalArgumentException("Meter cannot be null"); - if ( progressMeter != null ) throw new IllegalStateException("Progress meter already set"); - - progressMeter = meter; - } - - /** - * Get the progress meter being used by this engine. May be null if no meter has been registered yet - * @return a potentially null pointer to the progress meter - */ - public ProgressMeter getProgressMeter() { - return progressMeter; - } - - /** - * Does the current runtime in unit exceed the runtime limit, if one has been provided? 
- * - * @return false if not limit was requested or if runtime <= the limit, true otherwise - */ - public boolean exceedsRuntimeLimit() { - if ( progressMeter == null ) - // not yet initialized or not set because of testing - return false; - - final long runtime = progressMeter.getRuntimeInNanosecondsUpdatedPeriodically(); - if ( runtime < 0 ) throw new IllegalArgumentException("runtime must be >= 0 but got " + runtime); - - if ( getArguments().maxRuntime == NO_RUNTIME_LIMIT ) - return false; - else { - final long maxRuntimeNano = getRuntimeLimitInNanoseconds(); - return runtime > maxRuntimeNano; - } - } - - /** - * @return the runtime limit in nanoseconds, or -1 if no limit was specified - */ - public long getRuntimeLimitInNanoseconds() { - return runtimeLimitInNanoseconds; - } - - /** - * Setup the runtime limits for this engine, updating the runtimeLimitInNanoseconds - * as appropriate - * - * @param args the GATKArgumentCollection to retrieve our runtime limits from - */ - private void setupRuntimeLimits(final GATKArgumentCollection args) { - if ( args.maxRuntime == NO_RUNTIME_LIMIT ) - runtimeLimitInNanoseconds = -1; - else if (args.maxRuntime < 0 ) - throw new UserException.BadArgumentValue("maxRuntime", "must be >= 0 or == -1 (meaning no limit) but received negative value " + args.maxRuntime); - else { - runtimeLimitInNanoseconds = TimeUnit.NANOSECONDS.convert(args.maxRuntime, args.maxRuntimeUnits); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java deleted file mode 100644 index 08f892f97..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ /dev/null @@ -1,483 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the 
"Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.arguments; - -import net.sf.samtools.SAMFileReader; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; -import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; -import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; - -import java.io.File; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.TimeUnit; - -/** - * @author aaron - * @version 1.0 - */ -public class GATKArgumentCollection { - - /* our version number */ - private float versionNumber = 
1; - private String description = "GATK Arguments"; - - /** the constructor */ - public GATKArgumentCollection() { - } - - // parameters and their defaults - @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false) - public List samFiles = new ArrayList(); - - @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false) - public Integer readBufferSize = null; - - // -------------------------------------------------------------------------------------------------------------- - // - // GATKRunReport options - // - // -------------------------------------------------------------------------------------------------------------- - - @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? AWS is the default, can be NO_ET so nothing is posted to the run repository. Please see " + UserException.PHONE_HOME_DOCS_URL + " for details.", required = false) - public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.AWS; - - @Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see " + UserException.PHONE_HOME_DOCS_URL + " for details.", required = false) - public File gatkKeyFile = null; - - /** - * The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary String tag that can be - * used to group together runs during later analysis. One use of this capability is to tag runs as GATK - * performance tests, so that the performance of the GATK over time can be assessed from the logs directly. - * - * Note that the tags do not conform to any ontology, so you are free to use any tags that you might find - * meaningful. 
- */ - @Argument(fullName = "tag", shortName = "tag", doc="Arbitrary tag string to identify this GATK run as part of a group of runs, for later analysis", required = false) - public String tag = "NA"; - - // -------------------------------------------------------------------------------------------------------------- - // - // General features - // - // -------------------------------------------------------------------------------------------------------------- - - @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false) - public List readFilters = new ArrayList(); - - @ArgumentCollection - public IntervalArgumentCollection intervalArguments = new IntervalArgumentCollection(); - - @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) - public File referenceFile = null; - - @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false) - public boolean nonDeterministicRandomSeed = false; - - @Hidden - @Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests. To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator.") - public boolean disableDithering = false; - - @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="If provided, that GATK will stop execution cleanly as soon after maxRuntime has been exceeded, truncating the run but not exiting with a failure. 
By default the value is interpreted in minutes, but this can be changed by maxRuntimeUnits", required = false) - public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT; - - @Argument(fullName = "maxRuntimeUnits", shortName = "maxRuntimeUnits", doc="The TimeUnit for maxRuntime", required = false) - public TimeUnit maxRuntimeUnits = TimeUnit.MINUTES; - - // -------------------------------------------------------------------------------------------------------------- - // - // Downsampling Arguments - // - // -------------------------------------------------------------------------------------------------------------- - /** - * Reads will be selected randomly to be removed from the pile based on the method described here. - */ - @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus", required = false) - public DownsampleType downsamplingType = null; - - @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false) - public Double downsampleFraction = null; - - /** - * For locus-based traversals (LocusWalkers and ActiveRegionWalkers), downsample_to_coverage controls the - * maximum depth of coverage at each locus. For read-based traversals (ReadWalkers), it controls the - * maximum number of reads sharing the same alignment start position. For ReadWalkers you will typically need to use - * much lower dcov values than you would with LocusWalkers to see an effect. Note that this downsampling option does - * not produce an unbiased random sampling from all available reads at each locus: instead, the primary goal of the - * to-coverage downsampler is to maintain an even representation of reads from all alignment start positions when - * removing excess coverage. For a truly unbiased random sampling of reads, use -dfrac instead. 
Also note - * that the coverage target is an approximate goal that is not guaranteed to be met exactly: the downsampling - * algorithm will under some circumstances retain slightly more or less coverage than requested. - */ - @Argument(fullName = "downsample_to_coverage", shortName = "dcov", - doc = "Coverage [integer] to downsample to per locus (for locus walkers) or per alignment start position (for read walkers)", - required = false) - public Integer downsampleCoverage = null; - - /** - * Gets the downsampling method explicitly specified by the user. If the user didn't specify - * a default downsampling mechanism, return the default. - * @return The explicitly specified downsampling mechanism, or the default if none exists. - */ - public DownsamplingMethod getDownsamplingMethod() { - if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null ) - return null; - - return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction); - } - - /** - * Set the downsampling method stored in the argument collection so that it is read back out when interrogating the command line arguments. - * @param method The downsampling mechanism. 
- */ - public void setDownsamplingMethod(DownsamplingMethod method) { - if (method == null) - throw new IllegalArgumentException("method is null"); - - downsamplingType = method.type; - downsampleCoverage = method.toCoverage; - downsampleFraction = method.toFraction; - } - - // -------------------------------------------------------------------------------------------------------------- - // - // BAQ arguments - // - // -------------------------------------------------------------------------------------------------------------- - @Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false) - public BAQ.CalculationMode BAQMode = BAQ.CalculationMode.OFF; - - @Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false) - public double BAQGOP = BAQ.DEFAULT_GOP; - - // -------------------------------------------------------------------------------------------------------------- - // - // quality encoding checking arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Q0 == ASCII 33 according to the SAM specification, whereas Illumina encoding starts at Q64. The idea here is - * simple: we just iterate over all reads and subtract 31 from every quality score. 
- */ - @Argument(fullName = "fix_misencoded_quality_scores", shortName="fixMisencodedQuals", doc="Fix mis-encoded base quality scores", required = false) - public boolean FIX_MISENCODED_QUALS = false; - - @Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Do not fail when encountering base qualities that are too high and that seemingly indicate a problem with the base quality encoding of the BAM file", required = false) - public boolean ALLOW_POTENTIALLY_MISENCODED_QUALS = false; - - @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false) - public Boolean useOriginalBaseQualities = false; - - @Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false) - public byte defaultBaseQualities = -1; - - // -------------------------------------------------------------------------------------------------------------- - // - // performance log arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * The file name for the GATK performance log output, or null if you don't want to generate the - * detailed performance logging table. 
This table is suitable for importing into R or any - * other analysis software that can read tsv files - */ - @Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false) - public File performanceLog = null; - - // -------------------------------------------------------------------------------------------------------------- - // - // BQSR arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Enables on-the-fly recalibrate of base qualities. The covariates tables are produced by the BaseQualityScoreRecalibrator tool. - * Please be aware that one should only run recalibration with the covariates file created on the same input bam(s). - */ - @Input(fullName="BQSR", shortName="BQSR", required=false, doc="The input covariates table file which enables on-the-fly base quality score recalibration (intended for use with BaseRecalibrator and PrintReads)") - public File BQSR_RECAL_FILE = null; - - /** - * Turns on the base quantization module. It requires a recalibration report (-BQSR). - * - * A value of 0 here means "do not quantize". - * Any value greater than zero will be used to recalculate the quantization using that many levels. - * Negative values mean that we should quantize using the recalibration report's quantization level. - */ - @Hidden - @Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels (with -BQSR)", required=false) - public int quantizationLevels = 0; - - /** - * Turns off printing of the base insertion and base deletion tags when using the -BQSR argument and only the base substitution qualities will be produced. 
- */ - @Argument(fullName="disable_indel_quals", shortName = "DIQ", doc = "If true, disables printing of base insertion and base deletion tags (with -BQSR)", required=false) - public boolean disableIndelQuals = false; - - /** - * By default, the OQ tag in not emitted when using the -BQSR argument. - */ - @Argument(fullName="emit_original_quals", shortName = "EOQ", doc = "If true, enables printing of the OQ tag with the original base qualities (with -BQSR)", required=false) - public boolean emitOriginalQuals = false; - - /** - * Do not modify quality scores less than this value but rather just write them out unmodified in the recalibrated BAM file. - * In general it's unsafe to change qualities scores below < 6, since base callers use these values to indicate random or bad bases. - * For example, Illumina writes Q2 bases when the machine has really gone wrong. This would be fine in and of itself, - * but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect, - * your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream. - */ - @Argument(fullName = "preserve_qscores_less_than", shortName = "preserveQ", doc = "Bases with quality scores less than this threshold won't be recalibrated (with -BQSR)", required = false) - public int PRESERVE_QSCORES_LESS_THAN = QualityUtils.MIN_USABLE_Q_SCORE; - - @Argument(fullName = "globalQScorePrior", shortName = "globalQScorePrior", doc = "The global Qscore Bayesian prior to use in the BQSR. If specified, this value will be used as the prior for all mismatch quality scores instead of the actual reported quality score", required = false) - public double globalQScorePrior = -1.0; - - /** - * For the sake of your data, please only use this option if you know what you are doing. It is absolutely not recommended practice - * to run base quality score recalibration on reduced BAM files. 
- */ - @Advanced - @Argument(fullName = "allow_bqsr_on_reduced_bams_despite_repeated_warnings", shortName="allowBqsrOnReducedBams", doc="Do not fail when running base quality score recalibration on a reduced BAM file even though we highly recommend against it", required = false) - public boolean ALLOW_BQSR_ON_REDUCED_BAMS = false; - - // -------------------------------------------------------------------------------------------------------------- - // - // Other utility arguments - // - // -------------------------------------------------------------------------------------------------------------- - - @Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false) - public SAMFileReader.ValidationStringency strictnessLevel = SAMFileReader.ValidationStringency.SILENT; - - @Argument(fullName = "remove_program_records", shortName = "rpr", doc = "Should we override the Walker's default and remove program records from the SAM header", required = false) - public boolean removeProgramRecords = false; - - @Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Should we override the Walker's default and keep program records from the SAM header", required = false) - public boolean keepProgramRecords = false; - - @Advanced - @Argument(fullName = "sample_rename_mapping_file", shortName = "sample_rename_mapping_file", - doc = "Rename sample IDs on-the-fly at runtime using the provided mapping file. This option requires that " + - "each BAM file listed in the mapping file have only a single sample specified in its header (though there " + - "may be multiple read groups for that sample). 
Each line of the mapping file must contain the absolute path " + - "to a BAM file, followed by whitespace, followed by the new sample name for that BAM file.", - required = false) - public File sampleRenameMappingFile = null; - - @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false) - public ValidationExclusion.TYPE unsafe; - - @Hidden - @Advanced - @Argument(fullName = "disable_auto_index_creation_and_locking_when_reading_rods", shortName = "disable_auto_index_creation_and_locking_when_reading_rods", - doc = "UNSAFE FOR GENERAL USE (FOR TEST SUITE USE ONLY). Disable both auto-generation of index files and index file locking " + - "when reading VCFs and other rods and an index isn't present or is out-of-date. The file locking necessary for auto index " + - "generation to work safely is prone to random failures/hangs on certain platforms, which makes it desirable to disable it " + - "for situations like test suite runs where the indices are already known to exist, however this option is unsafe in general " + - "because it allows reading from index files without first acquiring a lock.", - required = false) - public boolean disableAutoIndexCreationAndLockingWhenReadingRods = false; - - // -------------------------------------------------------------------------------------------------------------- - // - // Multi-threading arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * How many data threads should be allocated to this analysis? Data threads contains N cpu threads per - * data thread, and act as completely data parallel processing, increasing the memory usage of GATK - * by M data threads. 
Data threads generally scale extremely effectively, up to 24 cores - */ - @Argument(fullName = "num_threads", shortName = "nt", doc = "How many data threads should be allocated to running this analysis.", required = false) - public Integer numberOfDataThreads = 1; - - /** - * How many CPU threads should be allocated per data thread? Each CPU thread operates the map - * cycle independently, but may run into earlier scaling problems with IO than data threads. Has - * the benefit of not requiring X times as much memory per thread as data threads do, but rather - * only a constant overhead. - */ - @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="How many CPU threads should be allocated per data thread to running this analysis?", required = false) - public int numberOfCPUThreadsPerDataThread = 1; - - @Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false) - @Hidden - public int numberOfIOThreads = 0; - - /** - * Enable GATK to monitor its own threading efficiency, at an itsy-bitsy tiny - * cost (< 0.1%) in runtime because of turning on the JavaBean. This is largely for - * debugging purposes. Note that this argument is not compatible with -nt, it only works with -nct. 
- */ - @Argument(fullName = "monitorThreadEfficiency", shortName = "mte", doc = "Enable GATK threading efficiency monitoring", required = false) - public Boolean monitorThreadEfficiency = false; - - @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false) - public Integer numberOfBAMFileHandles = null; - - @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false) - public List readGroupBlackList = null; - - // -------------------------------------------------------------------------------------------------------------- - // - // PED (pedigree) support - // - // -------------------------------------------------------------------------------------------------------------- - - /** - *

Reads PED file-formatted tabular text files describing meta-data about the samples being - * processed in the GATK.

- * - * - * - *

The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:

- * - *
    - *
  • Family ID
  • - *
  • Individual ID
  • - *
  • Paternal ID
  • - *
  • Maternal ID
  • - *
  • Sex (1=male; 2=female; other=unknown)
  • - *
  • Phenotype
  • - *
- * - *

The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. - * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a - * quantitative trait or an affection status column: GATK will automatically detect which type - * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).

- * - *

If an individual's sex is unknown, then any character other than 1 or 2 can be used.

- * - *

You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that - * line will be ignored. Do not start any family IDs with this character therefore.

- * - *

Affection status should be coded:

- * - *
    - *
  • -9 missing
  • - *
  • 0 missing
  • - *
  • 1 unaffected
  • - *
  • 2 affected
  • - *
- * - *

If any value outside of -9,0,1,2 is detected than the samples are assumed - * to phenotype values are interpreted as string phenotype values. In this case -9 uniquely - * represents the missing value.

- * - *

Genotypes (column 7 onwards) cannot be specified to the GATK.

- * - *

For example, here are two individuals (one row = one person):

- * - *
-     *   FAM001  1  0 0  1  2
-     *   FAM001  2  0 0  1  2
-     * 
- * - *

Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to - * tell the GATK PED parser that the corresponding fields are missing from the ped file.

- * - *

Note that most GATK walkers do not use pedigree information. Walkers that require pedigree - * data should clearly indicate so in their arguments and will throw errors if required pedigree - * information is missing.

- */ - @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false) - public List pedigreeFiles = Collections.emptyList(); - - /** - * Inline PED records (see -ped argument). Each -pedString STRING can contain one or more - * valid PED records (see -ped) separated by semi-colons. Supports all tags for each pedString - * as -ped supports - */ - @Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false) - public List pedigreeStrings = Collections.emptyList(); - - /** - * How strict should we be in parsing the PED files? - */ - @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false) - public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT; - - // -------------------------------------------------------------------------------------------------------------- - // - // BAM indexing and sharding arguments - // - // -------------------------------------------------------------------------------------------------------------- - - @Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM. NO INTEGRATION TESTS are available. 
Use at your own risk.",required=false) - @Hidden - public boolean allowIntervalsWithUnindexedBAM = false; - - // -------------------------------------------------------------------------------------------------------------- - // - // testing BCF2 - // - // -------------------------------------------------------------------------------------------------------------- - - @Argument(fullName="generateShadowBCF",shortName = "generateShadowBCF",doc="If provided, whenever we create a VCFWriter we will also write out a BCF file alongside it, for testing purposes",required=false) - @Hidden - public boolean generateShadowBCF = false; - // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed - - // -------------------------------------------------------------------------------------------------------------- - // - // VCF/BCF index parameters - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Specify the Tribble indexing strategy to use for VCFs. 
- * - * LINEAR creates a LinearIndex with bins of equal width, specified by the Bin Width parameter - * INTERVAL creates an IntervalTreeIndex with bins with an equal amount of features, specified by the Features Per Bin parameter - * DYNAMIC_SEEK attempts to optimize for minimal seek time by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) - * DYNAMIC_SIZE attempts to optimize for minimal index size by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) - */ - - @Argument(fullName="variant_index_type",shortName = "variant_index_type",doc="which type of IndexCreator to use for VCF/BCF indices",required=false) - @Advanced - public GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE; - - @Argument(fullName="variant_index_parameter",shortName = "variant_index_parameter",doc="the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator",required=false) - @Advanced - public int variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java deleted file mode 100644 index 9dc9734a5..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ /dev/null @@ -1,1170 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* 
included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.sam.MergingSamRecordIterator; -import net.sf.picard.sam.SamFileHeaderMerger; -import net.sf.samtools.*; -import net.sf.samtools.util.CloseableIterator; -import net.sf.samtools.util.RuntimeIOException; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.ReadMetrics; -import org.broadinstitute.sting.gatk.ReadProperties; -import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.downsampling.*; -import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; -import org.broadinstitute.sting.gatk.filters.ReadFilter; -import org.broadinstitute.sting.gatk.iterators.*; -import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.SimpleTimer; -import org.broadinstitute.sting.utils.baq.ReadTransformingIterator; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.File; 
-import java.io.FileNotFoundException; -import java.lang.reflect.InvocationTargetException; -import java.lang.reflect.Method; -import java.util.*; -import java.util.concurrent.Callable; - -/** - * User: aaron - * Date: Mar 26, 2009 - * Time: 2:36:16 PM - *

- * Converts shards to SAM iterators over the specified region - */ -public class SAMDataSource { - final private static GATKSamRecordFactory factory = new GATKSamRecordFactory(); - - /** Backing support for reads. */ - protected final ReadProperties readProperties; - - /** - * Runtime metrics of reads filtered, etc. - */ - private final ReadMetrics readMetrics; - - /** - * Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering. - */ - protected final GenomeLocParser genomeLocParser; - - /** - * Identifiers for the readers driving this data source. - */ - private final Collection readerIDs; - - /** - * How strict are the readers driving this data source. - */ - private final SAMFileReader.ValidationStringency validationStringency; - - /** - * Do we want to remove the program records from this data source? - */ - private final boolean removeProgramRecords; - - /** - * Store BAM indices for each reader present. - */ - private final Map bamIndices = new HashMap(); - - /** - * The merged header. - */ - private final SAMFileHeader mergedHeader; - - /** - * The constituent headers of the unmerged files. - */ - private final Map headers = new HashMap(); - - /** - * The sort order of the BAM files. Files without a sort order tag are assumed to be - * in coordinate order. - */ - private SAMFileHeader.SortOrder sortOrder = null; - - /** - * Whether the read groups in overlapping files collide. - */ - private final boolean hasReadGroupCollisions; - - /** - * Maps the SAM readers' merged read group ids to their original ids. Since merged read group ids - * are always unique, we can simply use a map here, no need to stratify by reader. - */ - private final ReadGroupMapping mergedToOriginalReadGroupMappings = new ReadGroupMapping(); - - /** - * Maps the SAM readers' original read group ids to their revised ids. 
This mapping must be stratified - * by readers, since there can be readgroup id collision: different bam files (readers) can list the - * same read group id, which will be disambiguated when these input streams are merged. - */ - private final Map originalToMergedReadGroupMappings = new HashMap(); - - /** - * Mapping from bam file ID to new sample name. Used only when doing on-the-fly sample renaming. - */ - private Map sampleRenameMap = null; - - /** our log, which we want to capture anything from this class */ - private static Logger logger = Logger.getLogger(SAMDataSource.class); - - /** - * A collection of readers driving the merging process. - */ - private final SAMResourcePool resourcePool; - - /** - * Asynchronously loads BGZF blocks. - */ - private final BGZFBlockLoadingDispatcher dispatcher; - - /** - * How are threads allocated. - */ - private final ThreadAllocation threadAllocation; - - /** - * Create a new SAM data source given the supplied read metadata. - * - * For testing purposes - * - * @param samFiles list of reads files. - */ - public SAMDataSource(Collection samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) { - this( - samFiles, - threadAllocation, - numFileHandles, - genomeLocParser, - false, - SAMFileReader.ValidationStringency.STRICT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - false); - } - - /** - * See complete constructor. Does not enable BAQ by default. 
- * - * For testing purposes - */ - public SAMDataSource( - Collection samFiles, - ThreadAllocation threadAllocation, - Integer numFileHandles, - GenomeLocParser genomeLocParser, - boolean useOriginalBaseQualities, - SAMFileReader.ValidationStringency strictness, - Integer readBufferSize, - DownsamplingMethod downsamplingMethod, - ValidationExclusion exclusionList, - Collection supplementalFilters, - boolean includeReadsWithDeletionAtLoci) { - this( samFiles, - threadAllocation, - numFileHandles, - genomeLocParser, - useOriginalBaseQualities, - strictness, - readBufferSize, - downsamplingMethod, - exclusionList, - supplementalFilters, - Collections.emptyList(), - includeReadsWithDeletionAtLoci, - (byte) -1, - false, - false, - null); - } - - /** - * Create a new SAM data source given the supplied read metadata. - * @param samFiles list of reads files. - * @param useOriginalBaseQualities True if original base qualities should be used. - * @param strictness Stringency of reads file parsing. - * @param readBufferSize Number of reads to hold in memory per BAM. - * @param downsamplingMethod Method for downsampling reads at a given locus. - * @param exclusionList what safety checks we're willing to let slide - * @param supplementalFilters additional filters to dynamically apply. - * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method - * will explicitly list reads with deletion over the current reference base; otherwise, only observed - * bases will be seen in the pileups, and the deletions will be skipped silently. - * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. - * @param keepReadsInLIBS should we keep a unique list of reads in LIBS? - * @param sampleRenameMap Map of BAM file to new sample ID used during on-the-fly runtime sample renaming. - * Will be null if we're not doing sample renaming. 
- */ - public SAMDataSource( - Collection samFiles, - ThreadAllocation threadAllocation, - Integer numFileHandles, - GenomeLocParser genomeLocParser, - boolean useOriginalBaseQualities, - SAMFileReader.ValidationStringency strictness, - Integer readBufferSize, - DownsamplingMethod downsamplingMethod, - ValidationExclusion exclusionList, - Collection supplementalFilters, - List readTransformers, - boolean includeReadsWithDeletionAtLoci, - byte defaultBaseQualities, - boolean removeProgramRecords, - final boolean keepReadsInLIBS, - final Map sampleRenameMap) { - - this.readMetrics = new ReadMetrics(); - this.genomeLocParser = genomeLocParser; - - readerIDs = samFiles; - - this.threadAllocation = threadAllocation; - // TODO: Consider a borrowed-thread dispatcher implementation. - if(this.threadAllocation.getNumIOThreads() > 0) { - logger.info("Running in asynchronous I/O mode; number of threads = " + this.threadAllocation.getNumIOThreads()); - dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1); - } - else - dispatcher = null; - - validationStringency = strictness; - this.removeProgramRecords = removeProgramRecords; - if(readBufferSize != null) - ReadShard.setReadBufferSize(readBufferSize); // TODO: use of non-final static variable here is just awful, especially for parallel tests - else { - // Choose a sensible default for the read buffer size. - // Previously we we're picked 100000 reads per BAM per shard with a max cap of 250K reads in memory at once. - // Now we are simply setting it to 100K reads - ReadShard.setReadBufferSize(100000); - } - - this.sampleRenameMap = sampleRenameMap; - - resourcePool = new SAMResourcePool(Integer.MAX_VALUE); - SAMReaders readers = resourcePool.getAvailableReaders(); - - // Determine the sort order. - for(SAMReaderID readerID: readerIDs) { - if (! 
readerID.samFile.canRead() ) - throw new UserException.CouldNotReadInputFile(readerID.samFile,"file is not present or user does not have appropriate permissions. " + - "Please check that the file is present and readable and try again."); - - // Get the sort order, forcing it to coordinate if unsorted. - SAMFileReader reader = readers.getReader(readerID); - SAMFileHeader header = reader.getFileHeader(); - - headers.put(readerID,header); - - if ( header.getReadGroups().isEmpty() ) { - throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile, - "SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups"); - } - - SAMFileHeader.SortOrder sortOrder = header.getSortOrder() != SAMFileHeader.SortOrder.unsorted ? header.getSortOrder() : SAMFileHeader.SortOrder.coordinate; - - // Validate that all input files are sorted in the same order. - if(this.sortOrder != null && this.sortOrder != sortOrder) - throw new UserException.MissortedBAM(String.format("Attempted to process mixed of files sorted as %s and %s.",this.sortOrder,sortOrder)); - - // Update the sort order. - this.sortOrder = sortOrder; - } - - mergedHeader = readers.getMergedHeader(); - hasReadGroupCollisions = readers.hasReadGroupCollisions(); - - readProperties = new ReadProperties( - samFiles, - mergedHeader, - sortOrder, - useOriginalBaseQualities, - strictness, - downsamplingMethod, - exclusionList, - supplementalFilters, - readTransformers, - includeReadsWithDeletionAtLoci, - defaultBaseQualities, - keepReadsInLIBS); - - // cache the read group id (original) -> read group id (merged) - // and read group id (merged) -> read group id (original) mappings. 
- for(SAMReaderID id: readerIDs) { - SAMFileReader reader = readers.getReader(id); - ReadGroupMapping mappingToMerged = new ReadGroupMapping(); - - List readGroups = reader.getFileHeader().getReadGroups(); - for(SAMReadGroupRecord readGroup: readGroups) { - if(hasReadGroupCollisions) { - mappingToMerged.put(readGroup.getReadGroupId(),readers.getReadGroupId(id,readGroup.getReadGroupId())); - mergedToOriginalReadGroupMappings.put(readers.getReadGroupId(id,readGroup.getReadGroupId()),readGroup.getReadGroupId()); - } else { - mappingToMerged.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); - mergedToOriginalReadGroupMappings.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); - } - } - - originalToMergedReadGroupMappings.put(id,mappingToMerged); - } - - for(SAMReaderID id: readerIDs) { - File indexFile = findIndexFile(id.samFile); - if(indexFile != null) - bamIndices.put(id,new GATKBAMIndex(indexFile)); - } - - resourcePool.releaseReaders(readers); - } - - public void close() { - SAMReaders readers = resourcePool.getAvailableReaders(); - for(SAMReaderID readerID: readerIDs) { - SAMFileReader reader = readers.getReader(readerID); - reader.close(); - } - } - - /** - * Returns Reads data structure containing information about the reads data sources placed in this pool as well as - * information about how they are downsampled, sorted, and filtered - * @return - */ - public ReadProperties getReadsInfo() { return readProperties; } - - /** - * Checks to see whether any reads files are supplying data. - * @return True if no reads files are supplying data to the traversal; false otherwise. - */ - public boolean isEmpty() { - return readProperties.getSAMReaderIDs().size() == 0; - } - - /** - * Gets the SAM file associated with a given reader ID. - * @param id The reader for which to retrieve the source file. - * @return the file actually associated with the id. 
- */ - public File getSAMFile(SAMReaderID id) { - return id.samFile; - } - - /** - * Returns readers used by this data source. - * @return A list of SAM reader IDs. - */ - public Collection getReaderIDs() { - return readerIDs; - } - - /** - * Retrieves the id of the reader which built the given read. - * @param read The read to test. - * @return ID of the reader. - */ - public SAMReaderID getReaderID(SAMRecord read) { - return resourcePool.getReaderID(read.getFileSource().getReader()); - } - - /** - * Gets the merged header from the SAM file. - * @return The merged header. - */ - public SAMFileHeader getHeader() { - return mergedHeader; - } - - public SAMFileHeader getHeader(SAMReaderID id) { - return headers.get(id); - } - - /** - * Gets the revised read group id mapped to this 'original' read group id. - * @param reader for which to grab a read group. - * @param originalReadGroupId ID of the original read group. - * @return Merged read group ID. - */ - public String getReadGroupId(final SAMReaderID reader, final String originalReadGroupId) { - return originalToMergedReadGroupMappings.get(reader).get(originalReadGroupId); - } - - /** - * Gets the original read group id (as it was specified in the original input bam file) that maps onto - * this 'merged' read group id. - * @param mergedReadGroupId 'merged' ID of the read group (as it is presented by the read received from merged input stream). - * @return Merged read group ID. - */ - public String getOriginalReadGroupId(final String mergedReadGroupId) { - return mergedToOriginalReadGroupMappings.get(mergedReadGroupId); - } - - /** - * True if all readers have an index. - * @return True if all readers have an index. - */ - public boolean hasIndex() { - return readerIDs.size() == bamIndices.size(); - } - - /** - * Gets the index for a particular reader. Always preloaded. - * @param id Id of the reader. - * @return The index. Will preload the index if necessary. 
- */ - public GATKBAMIndex getIndex(final SAMReaderID id) { - return bamIndices.get(id); - } - - /** - * Retrieves the sort order of the readers. - * @return Sort order. Can be unsorted, coordinate order, or query name order. - */ - public SAMFileHeader.SortOrder getSortOrder() { - return sortOrder; - } - - /** - * Gets the cumulative read metrics for shards already processed. - * @return Cumulative read metrics. - */ - public ReadMetrics getCumulativeReadMetrics() { - // don't return a clone here because the engine uses a pointer to this object - return readMetrics; - } - - /** - * Incorporate the given read metrics into the cumulative read metrics. - * @param readMetrics The 'incremental' read metrics, to be incorporated into the cumulative metrics. - */ - public void incorporateReadMetrics(final ReadMetrics readMetrics) { - this.readMetrics.incrementMetrics(readMetrics); - } - - public StingSAMIterator seek(Shard shard) { - if(shard.buffersReads()) { - return shard.iterator(); - } - else { - return getIterator(shard); - } - } - - /** - * Gets the reader associated with the given read. - * @param readers Available readers. 
- * @param read - * @return - */ - private SAMReaderID getReaderID(SAMReaders readers, SAMRecord read) { - for(SAMReaderID id: getReaderIDs()) { - if(readers.getReader(id) == read.getFileSource().getReader()) - return id; - } - throw new ReviewedStingException("Unable to find id for reader associated with read " + read.getReadName()); - } - - /** - * Get the initial reader positions across all BAM files - * - * @return the start positions of the first chunk of reads for all BAM files - */ - protected Map getInitialReaderPositions() { - Map initialPositions = new HashMap(); - SAMReaders readers = resourcePool.getAvailableReaders(); - - for ( SAMReaderID id: getReaderIDs() ) { - initialPositions.put(id, new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads())); - } - - resourcePool.releaseReaders(readers); - return initialPositions; - } - - /** - * Get an iterator over the data types specified in the shard. - * - * @param shard The shard specifying the data limits. - * @return An iterator over the selected data. - */ - protected StingSAMIterator getIterator( Shard shard ) { - return getIterator(resourcePool.getAvailableReaders(), shard, shard instanceof ReadShard); - } - - /** - * Get an iterator over the data types specified in the shard. - * @param readers Readers from which to load data. - * @param shard The shard specifying the data limits. - * @param enableVerification True to verify. For compatibility with old sharding strategy. - * @return An iterator over the selected data. - */ - private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) { - // Set up merging to dynamically merge together multiple BAMs. - Map> iteratorMap = new HashMap>(); - - for(SAMReaderID id: getReaderIDs()) { - CloseableIterator iterator = null; - - // TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin. 
- // TODO: Kill this check once we've proven that the design elements are gone. - if(shard.getFileSpans().get(id) == null) - throw new ReviewedStingException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported."); - - try { - if(threadAllocation.getNumIOThreads() > 0) { - BlockInputStream inputStream = readers.getInputStream(id); - inputStream.submitAccessPlan(new BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) shard.getFileSpans().get(id))); - BAMRecordCodec codec = new BAMRecordCodec(getHeader(id),factory); - codec.setInputStream(inputStream); - iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec); - } - else { - iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); - } - } catch ( RuntimeException e ) { // we need to catch RuntimeExceptions here because the Picard code is throwing them (among SAMFormatExceptions) sometimes - throw new UserException.MalformedBAM(id.samFile, e.getMessage()); - } - - iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator); - if(shard.getGenomeLocs().size() > 0) - iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); - - iteratorMap.put(readers.getReader(id), iterator); - } - - MergingSamRecordIterator mergingIterator = readers.createMergingIterator(iteratorMap); - - // The readMetrics object being passed in should be that of this dataSource and NOT the shard: the dataSource's - // metrics is intended to keep track of the reads seen (and hence passed to the CountingFilteringIterator when - // we apply the decorators), whereas the shard's metrics is used to keep track the "records" seen. 
- return applyDecoratingIterators(readMetrics, - enableVerification, - readProperties.useOriginalBaseQualities(), - new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)), - readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), - readProperties.getSupplementalFilters(), - readProperties.getReadTransformers(), - readProperties.defaultBaseQualities(), - shard instanceof LocusShard); - } - - private class BAMCodecIterator implements CloseableIterator { - private final BlockInputStream inputStream; - private final SAMFileReader reader; - private final BAMRecordCodec codec; - private SAMRecord nextRead; - - private BAMCodecIterator(final BlockInputStream inputStream, final SAMFileReader reader, final BAMRecordCodec codec) { - this.inputStream = inputStream; - this.reader = reader; - this.codec = codec; - advance(); - } - - public boolean hasNext() { - return nextRead != null; - } - - public SAMRecord next() { - if(!hasNext()) - throw new NoSuchElementException("Unable to retrieve next record from BAMCodecIterator; input stream is empty"); - SAMRecord currentRead = nextRead; - advance(); - return currentRead; - } - - public void close() { - // NO-OP. - } - - public void remove() { - throw new UnsupportedOperationException("Unable to remove from BAMCodecIterator"); - } - - private void advance() { - final long startCoordinate = inputStream.getFilePointer(); - nextRead = codec.decode(); - final long stopCoordinate = inputStream.getFilePointer(); - - if(reader != null && nextRead != null) - PicardNamespaceUtils.setFileSource(nextRead,new SAMFileSource(reader,new GATKBAMFileSpan(new GATKChunk(startCoordinate,stopCoordinate)))); - } - } - - /** - * Filter reads based on user-specified criteria. - * - * @param readMetrics metrics to track when using this iterator. - * @param enableVerification Verify the order of reads. - * @param useOriginalBaseQualities True if original base qualities should be used. 
- * @param wrappedIterator the raw data source. - * @param noValidationOfReadOrder Another trigger for the verifying iterator? TODO: look into this. - * @param supplementalFilters additional filters to apply to the reads. - * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. - * @param isLocusBasedTraversal true if we're dealing with a read stream from a LocusShard - * @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null. - */ - protected StingSAMIterator applyDecoratingIterators(ReadMetrics readMetrics, - boolean enableVerification, - boolean useOriginalBaseQualities, - StingSAMIterator wrappedIterator, - Boolean noValidationOfReadOrder, - Collection supplementalFilters, - List readTransformers, - byte defaultBaseQualities, - boolean isLocusBasedTraversal ) { - - // Always apply the ReadFormattingIterator before both ReadFilters and ReadTransformers. At a minimum, - // this will consolidate the cigar strings into canonical form. This has to be done before the read - // filtering, because not all read filters will behave correctly with things like zero-length cigar - // elements. If useOriginalBaseQualities is true or defaultBaseQualities >= 0, this iterator will also - // modify the base qualities. - wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities); - - // Read Filters: these are applied BEFORE downsampling, so that we downsample within the set of reads - // that actually survive filtering. Otherwise we could get much less coverage than requested. - wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); - - // Downsampling: - - // For locus traversals where we're downsampling to coverage by sample, assume that the downsamplers - // will be invoked downstream from us in LocusIteratorByState. 
This improves performance by avoiding - // splitting/re-assembly of the read stream at this stage, and also allows for partial downsampling - // of individual reads. - boolean assumeDownstreamLIBSDownsampling = isLocusBasedTraversal && - readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && - readProperties.getDownsamplingMethod().toCoverage != null; - - // Apply downsampling iterators here only in cases where we know that LocusIteratorByState won't be - // doing any downsampling downstream of us - if ( ! assumeDownstreamLIBSDownsampling ) { - wrappedIterator = applyDownsamplingIterator(wrappedIterator); - } - - // unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification, - // verify the read ordering by applying a sort order iterator - if (!noValidationOfReadOrder && enableVerification) - wrappedIterator = new VerifyingSamIterator(wrappedIterator); - - // Read transformers: these are applied last, so that we don't bother transforming reads that get discarded - // by the read filters or downsampler. 
- for ( final ReadTransformer readTransformer : readTransformers ) { - if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT ) - wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer); - } - - return wrappedIterator; - } - - protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) { - if ( readProperties.getDownsamplingMethod() == null || - readProperties.getDownsamplingMethod().type == DownsampleType.NONE ) { - return wrappedIterator; - } - - if ( readProperties.getDownsamplingMethod().toFraction != null ) { - - // If we're downsampling to a fraction of reads, there's no point in paying the cost of - // splitting/re-assembling the read stream by sample to run the FractionalDownsampler on - // reads from each sample separately, since the result would be the same as running the - // FractionalDownsampler on the entire stream. So, ALWAYS use the DownsamplingReadsIterator - // rather than the PerSampleDownsamplingReadsIterator, even if BY_SAMPLE downsampling - // was requested. - - return new DownsamplingReadsIterator(wrappedIterator, - new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction)); - } - else if ( readProperties.getDownsamplingMethod().toCoverage != null ) { - - // If we're downsampling to coverage, we DO need to pay the cost of splitting/re-assembling - // the read stream to run the downsampler on the reads for each individual sample separately if - // BY_SAMPLE downsampling was requested. 
- - if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) { - return new PerSampleDownsamplingReadsIterator(wrappedIterator, - new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage)); - } - else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) { - return new DownsamplingReadsIterator(wrappedIterator, - new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage)); - } - } - - return wrappedIterator; - } - - - private class SAMResourcePool { - /** - * How many entries can be cached in this resource pool? - */ - private final int maxEntries; - - /** - * All iterators of this reference-ordered data. - */ - private List allResources = new ArrayList(); - - /** - * All iterators that are not currently in service. - */ - private List availableResources = new ArrayList(); - - public SAMResourcePool(final int maxEntries) { - this.maxEntries = maxEntries; - } - - /** - * Choose a set of readers from the pool to use for this query. When complete, - * @return - */ - public synchronized SAMReaders getAvailableReaders() { - if(availableResources.size() == 0) - createNewResource(); - SAMReaders readers = availableResources.get(0); - availableResources.remove(readers); - return readers; - } - - public synchronized void releaseReaders(SAMReaders readers) { - if(!allResources.contains(readers)) - throw new ReviewedStingException("Tried to return readers from the pool that didn't originate in the pool."); - availableResources.add(readers); - } - - /** - * Gets the reader id for the given reader. - * @param reader Reader for which to determine the id. - * @return id of the given reader. 
- */ - protected synchronized SAMReaderID getReaderID(SAMFileReader reader) { - for(SAMReaders readers: allResources) { - SAMReaderID id = readers.getReaderID(reader); - if(id != null) - return id; - } - throw new ReviewedStingException("No such reader id is available"); - } - - private synchronized void createNewResource() { - if(allResources.size() > maxEntries) - throw new ReviewedStingException("Cannot create a new resource pool. All resources are in use."); - SAMReaders readers = new SAMReaders(readerIDs, validationStringency, removeProgramRecords); - allResources.add(readers); - availableResources.add(readers); - } - - } - - /** - * A collection of readers derived from a reads metadata structure. - */ - private class SAMReaders implements Iterable { - /** - * Cached representation of the merged header used to generate a merging iterator. - */ - private final SamFileHeaderMerger headerMerger; - - /** - * Internal storage for a map of id -> reader. - */ - private final Map readers = new LinkedHashMap(); - - /** - * The inptu streams backing - */ - private final Map inputStreams = new LinkedHashMap(); - - /** - * Derive a new set of readers from the Reads metadata. - * @param readerIDs reads to load. - * TODO: validationStringency is not used here - * @param validationStringency validation stringency. 
- * @param removeProgramRecords indicate whether to clear program records from the readers - */ - public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency, boolean removeProgramRecords) { - final int totalNumberOfFiles = readerIDs.size(); - int readerNumber = 1; - final SimpleTimer timer = new SimpleTimer().start(); - - if ( totalNumberOfFiles > 0 ) logger.info("Initializing SAMRecords in serial"); - final int tickSize = 50; - int nExecutedTotal = 0; - long lastTick = timer.currentTime(); - for(final SAMReaderID readerID: readerIDs) { - final ReaderInitializer init = new ReaderInitializer(readerID).call(); - - if (removeProgramRecords) { - init.reader.getFileHeader().setProgramRecords(new ArrayList()); - } - - if (threadAllocation.getNumIOThreads() > 0) { - inputStreams.put(init.readerID, init.blockInputStream); // get from initializer - } - - logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.samFile)); - readers.put(init.readerID,init.reader); - if ( ++nExecutedTotal % tickSize == 0) { - double tickInSec = (timer.currentTime() - lastTick) / 1000.0; - printReaderPerformance(nExecutedTotal, tickSize, totalNumberOfFiles, timer, tickInSec); - lastTick = timer.currentTime(); - } - } - - if ( totalNumberOfFiles > 0 ) logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime())); - - Collection headers = new LinkedList(); - - // Examine the bam headers, perform any requested sample renaming on them, and add - // them to the list of headers to pass to the Picard SamFileHeaderMerger: - for ( final Map.Entry readerEntry : readers.entrySet() ) { - final SAMReaderID readerID = readerEntry.getKey(); - final SAMFileReader reader = readerEntry.getValue(); - final SAMFileHeader header = reader.getFileHeader(); - - // The remappedSampleName will be null if either no on-the-fly sample renaming was requested, - // or the user's sample rename map 
file didn't contain an entry for this bam file: - final String remappedSampleName = sampleRenameMap != null ? sampleRenameMap.get(readerID) : null; - - // If we've been asked to rename the sample for this bam file, do so now. We'll check to - // make sure this bam only contains reads from one sample before proceeding. - // - // IMPORTANT: relies on the fact that the Picard SamFileHeaderMerger makes a copy of - // the existing read group attributes (including sample name) when merging - // headers, regardless of whether there are read group collisions or not. - if ( remappedSampleName != null ) { - remapSampleName(readerID, header, remappedSampleName); - } - - headers.add(header); - } - - headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,headers,true); - - // update all read groups to GATKSAMRecordReadGroups - final List gatkReadGroups = new LinkedList(); - for ( final SAMReadGroupRecord rg : headerMerger.getMergedHeader().getReadGroups() ) { - gatkReadGroups.add(new GATKSAMReadGroupRecord(rg)); - } - headerMerger.getMergedHeader().setReadGroups(gatkReadGroups); - } - - /** - * Changes the sample name in the read groups for the provided bam file header to match the - * remappedSampleName. Blows up with a UserException if the header contains more than one - * sample name. - * - * @param readerID ID for the bam file from which the provided header came from - * @param header The bam file header. Will be modified by this call. - * @param remappedSampleName New sample name to replace the existing sample attribute in the - * read groups for the header. 
- */ - private void remapSampleName( final SAMReaderID readerID, final SAMFileHeader header, final String remappedSampleName ) { - String firstEncounteredSample = null; - - for ( final SAMReadGroupRecord readGroup : header.getReadGroups() ) { - final String thisReadGroupSample = readGroup.getSample(); - - if ( thisReadGroupSample == null ) { - throw new UserException(String.format("On-the fly sample renaming was requested for bam file %s, however this " + - "bam file contains a read group (id: %s) with a null sample attribute", - readerID.getSamFilePath(), readGroup.getId())); - } - else if ( firstEncounteredSample == null ) { - firstEncounteredSample = thisReadGroupSample; - } - else if ( ! firstEncounteredSample.equals(thisReadGroupSample) ) { - throw new UserException(String.format("On-the-fly sample renaming was requested for bam file %s, " + - "however this bam file contains reads from more than one sample " + - "(encountered samples %s and %s in the bam header). The GATK requires that " + - "all bams for which on-the-fly sample renaming is requested " + - "contain reads from only a single sample per bam.", - readerID.getSamFilePath(), firstEncounteredSample, thisReadGroupSample)); - } - - readGroup.setSample(remappedSampleName); - } - } - - final private void printReaderPerformance(final int nExecutedTotal, - final int nExecutedInTick, - final int totalNumberOfFiles, - final SimpleTimer timer, - final double tickDurationInSec) { - final int pendingSize = totalNumberOfFiles - nExecutedTotal; - final double totalTimeInSeconds = timer.getElapsedTime(); - final double nTasksPerSecond = nExecutedTotal / (1.0*totalTimeInSeconds); - final int nRemaining = pendingSize; - final double estTimeToComplete = pendingSize / nTasksPerSecond; - logger.info(String.format("Init %d BAMs in last %.2f s, %d of %d in %.2f s / %.2f m (%.2f tasks/s). %d remaining with est. 
completion in %.2f s / %.2f m", - nExecutedInTick, tickDurationInSec, - nExecutedTotal, totalNumberOfFiles, totalTimeInSeconds, totalTimeInSeconds / 60, nTasksPerSecond, - nRemaining, estTimeToComplete, estTimeToComplete / 60)); - } - - /** - * Return the header derived from the merging of these BAM files. - * @return the merged header. - */ - public SAMFileHeader getMergedHeader() { - return headerMerger.getMergedHeader(); - } - - /** - * Do multiple read groups collide in this dataset? - * @return True if multiple read groups collide; false otherwis. - */ - public boolean hasReadGroupCollisions() { - return headerMerger.hasReadGroupCollisions(); - } - - /** - * Get the newly mapped read group ID for the given read group. - * @param readerID Reader for which to discern the transformed ID. - * @param originalReadGroupID Original read group. - * @return Remapped read group. - */ - public String getReadGroupId(final SAMReaderID readerID, final String originalReadGroupID) { - SAMFileHeader header = readers.get(readerID).getFileHeader(); - return headerMerger.getReadGroupId(header,originalReadGroupID); - } - - /** - * Creates a new merging iterator from the given map, with the given header. - * @param iteratorMap A map of readers to iterators. - * @return An iterator which will merge those individual iterators. - */ - public MergingSamRecordIterator createMergingIterator(final Map> iteratorMap) { - return new MergingSamRecordIterator(headerMerger,iteratorMap,true); - } - - /** - * Retrieve the reader from the data structure. - * @param id The ID of the reader to retrieve. - * @return the reader associated with the given id. - */ - public SAMFileReader getReader(SAMReaderID id) { - if(!readers.containsKey(id)) - throw new NoSuchElementException("No reader is associated with id " + id); - return readers.get(id); - } - - /** - * Retrieve the input stream backing a reader. - * @param id The ID of the reader to retrieve. - * @return the reader associated with the given id. 
- */ - public BlockInputStream getInputStream(final SAMReaderID id) { - return inputStreams.get(id); - } - - /** - * Searches for the reader id of this reader. - * @param reader Reader for which to search. - * @return The id associated the given reader, or null if the reader is not present in this collection. - */ - protected SAMReaderID getReaderID(SAMFileReader reader) { - for(Map.Entry entry: readers.entrySet()) { - if(reader == entry.getValue()) - return entry.getKey(); - } - // Not found? return null. - return null; - } - - /** - * Returns an iterator over all readers in this structure. - * @return An iterator over readers. - */ - public Iterator iterator() { - return readers.values().iterator(); - } - - /** - * Returns whether any readers are present in this structure. - * @return - */ - public boolean isEmpty() { - return readers.isEmpty(); - } - } - - class ReaderInitializer implements Callable { - final SAMReaderID readerID; - BlockInputStream blockInputStream = null; - SAMFileReader reader; - - public ReaderInitializer(final SAMReaderID readerID) { - this.readerID = readerID; - } - - public ReaderInitializer call() { - final File indexFile = findIndexFile(readerID.samFile); - try { - if (threadAllocation.getNumIOThreads() > 0) - blockInputStream = new BlockInputStream(dispatcher,readerID,false); - reader = new SAMFileReader(readerID.samFile,indexFile,false); - } catch ( RuntimeIOException e ) { - throw new UserException.CouldNotReadInputFile(readerID.samFile, e); - } catch ( SAMFormatException e ) { - throw new UserException.MalformedBAM(readerID.samFile, e.getMessage()); - } - // Picard is throwing a RuntimeException here when BAMs are malformed with bad headers (and so look like SAM files). - // Let's keep this separate from the SAMFormatException (which ultimately derives from RuntimeException) case, - // just in case we want to change this behavior later. 
- catch ( RuntimeException e ) { - throw new UserException.MalformedBAM(readerID.samFile, e.getMessage()); - } - reader.setSAMRecordFactory(factory); - reader.enableFileSource(true); - reader.setValidationStringency(validationStringency); - return this; - } - } - - private class ReleasingIterator implements StingSAMIterator { - /** - * The resource acting as the source of the data. - */ - private final SAMReaders resource; - - /** - * The iterator to wrap. - */ - private final StingSAMIterator wrappedIterator; - - public ReleasingIterator(SAMReaders resource, StingSAMIterator wrapped) { - this.resource = resource; - this.wrappedIterator = wrapped; - } - - public ReleasingIterator iterator() { - return this; - } - - public void remove() { - throw new UnsupportedOperationException("Can't remove from a StingSAMIterator"); - } - - public void close() { - wrappedIterator.close(); - resourcePool.releaseReaders(resource); - } - - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - public SAMRecord next() { - return wrappedIterator.next(); - } - } - - /** - * Maps read groups in the original SAMFileReaders to read groups in - */ - private class ReadGroupMapping extends HashMap {} - - /** - * Locates the index file alongside the given BAM, if present. - * TODO: This is currently a hachetjob that reaches into Picard and pulls out its index file locator. Replace with something more permanent. - * @param bamFile The data file to use. - * @return A File object if the index file is present; null otherwise. 
- */ - private File findIndexFile(File bamFile) { - File indexFile; - - try { - Class bamFileReaderClass = Class.forName("net.sf.samtools.BAMFileReader"); - Method indexFileLocator = bamFileReaderClass.getDeclaredMethod("findIndexFile",File.class); - indexFileLocator.setAccessible(true); - indexFile = (File)indexFileLocator.invoke(null,bamFile); - } - catch(ClassNotFoundException ex) { - throw new ReviewedStingException("Unable to locate BAMFileReader class, used to check for index files"); - } - catch(NoSuchMethodException ex) { - throw new ReviewedStingException("Unable to locate Picard index file locator."); - } - catch(IllegalAccessException ex) { - throw new ReviewedStingException("Unable to access Picard index file locator."); - } - catch(InvocationTargetException ex) { - throw new ReviewedStingException("Unable to invoke Picard index file locator."); - } - - return indexFile; - } - - /** - * Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped. The outgoing stream - * will be as granular as possible given our current knowledge of the best ways to split up BAM files. - * @return An iterator that spans all reads in all BAM files. - */ - public Iterable createShardIteratorOverAllReads(final ShardBalancer shardBalancer) { - shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser); - return shardBalancer; - } - - /** - * Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any - * read that has been assigned - * - * @param shardBalancer shard balancer object - * @return non-null initialized version of the shard balancer - */ - public Iterable createShardIteratorOverMappedReads(final ShardBalancer shardBalancer) { - shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,genomeLocParser),genomeLocParser); - return shardBalancer; - } - - /** - * Create a schedule for processing the initialized BAM file using the given interval list. 
- * The returned schedule should be as granular as possible. - * @param intervals The list of intervals for which to create the schedule. - * @return A granular iterator over file pointers. - */ - public Iterable createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) { - if(intervals == null) - throw new ReviewedStingException("Unable to create schedule from intervals; no intervals were provided."); - shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals),genomeLocParser); - return shardBalancer; - } -} - - - diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java deleted file mode 100644 index fb7a16bfd..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ /dev/null @@ -1,399 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.downsampling; - -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.DefaultHashMap; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.*; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.variant.variantcontext.Allele; - -import java.io.File; -import java.io.IOException; -import java.util.*; - -import org.apache.log4j.Logger; - -public class AlleleBiasedDownsamplingUtils { - - // define this class so that we can use Java generics below - private final static class PileupElementList extends ArrayList {} - - /** - * Computes an allele biased version of the given pileup - * - * @param pileup the original pileup - * @param downsamplingFraction the fraction of total reads to remove per allele - * @return allele biased pileup - */ - public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) { - // special case removal of all or no reads - if ( downsamplingFraction <= 0.0 ) - return pileup; - if ( downsamplingFraction >= 1.0 ) - return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList()); - - final PileupElementList[] alleleStratifiedElements = new PileupElementList[4]; - for ( int i = 0; i < 4; i++ ) - alleleStratifiedElements[i] = new PileupElementList(); - - // start by stratifying the reads by the alleles they represent at this position - boolean sawReducedRead = false; - for ( 
final PileupElement pe : pileup ) { - if ( pe.getRead().isReducedRead() ) - sawReducedRead = true; - - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); - if ( baseIndex != -1 ) - alleleStratifiedElements[baseIndex].add(pe); - } - - // make a listing of allele counts and calculate the total count - final int[] alleleCounts = calculateAlleleCounts(alleleStratifiedElements, sawReducedRead); - final int totalAlleleCount = (int)MathUtils.sum(alleleCounts); - - // do smart down-sampling - final int numReadsToRemove = (int)(totalAlleleCount * downsamplingFraction); // floor - final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); - - final HashSet readsToRemove = new HashSet(numReadsToRemove); - for ( int i = 0; i < 4; i++ ) { - final PileupElementList alleleList = alleleStratifiedElements[i]; - // if we don't need to remove any reads, then don't - if ( alleleCounts[i] > targetAlleleCounts[i] ) - readsToRemove.addAll(downsampleElements(alleleList, alleleCounts[i], alleleCounts[i] - targetAlleleCounts[i])); - } - - // we need to keep the reads sorted because the FragmentUtils code will expect them in coordinate order and will fail otherwise - final List readsToKeep = new ArrayList(totalAlleleCount - numReadsToRemove); - for ( final PileupElement pe : pileup ) { - if ( !readsToRemove.contains(pe) ) { - readsToKeep.add(pe); - } - } - - return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(readsToKeep)); - } - - /** - * Calculates actual allele counts for each allele (which can be different than the list size when reduced reads are present) - * - * @param alleleStratifiedElements pileup elements stratified by allele - * @param sawReducedRead is at least one read a reduced read? 
- * @return non-null int array representing allele counts - */ - private static int[] calculateAlleleCounts(final PileupElementList[] alleleStratifiedElements, final boolean sawReducedRead) { - final int[] alleleCounts = new int[alleleStratifiedElements.length]; - for ( int i = 0; i < alleleStratifiedElements.length; i++ ) { - if ( !sawReducedRead ) { - alleleCounts[i] = alleleStratifiedElements[i].size(); - } else { - for ( final PileupElement pe : alleleStratifiedElements[i] ) - alleleCounts[i] += pe.getRepresentativeCount(); - } - } - return alleleCounts; - } - - private static int scoreAlleleCounts(final int[] alleleCounts) { - if ( alleleCounts.length < 2 ) - return 0; - - // sort the counts (in ascending order) - final int[] alleleCountsCopy = alleleCounts.clone(); - Arrays.sort(alleleCountsCopy); - - final int maxCount = alleleCountsCopy[alleleCounts.length - 1]; - final int nextBestCount = alleleCountsCopy[alleleCounts.length - 2]; - - int remainderCount = 0; - for ( int i = 0; i < alleleCounts.length - 2; i++ ) - remainderCount += alleleCountsCopy[i]; - - // try to get the best score: - // - in the het case the counts should be equal with nothing else - // - in the hom case the non-max should be zero - return Math.min(maxCount - nextBestCount + remainderCount, Math.abs(nextBestCount + remainderCount)); - } - - /** - * Computes an allele biased version of the allele counts for a given pileup - * - * @param alleleCounts the allele counts for the original pileup - * @param numReadsToRemove number of total reads to remove per allele - * @return non-null array of new counts needed per allele - */ - protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) { - final int numAlleles = alleleCounts.length; - - int maxScore = scoreAlleleCounts(alleleCounts); - int[] alleleCountsOfMax = alleleCounts; - - final int numReadsToRemovePerAllele = numReadsToRemove / 2; - - for ( int i = 0; i < numAlleles; i++ ) { - for ( int j = i; j 
< numAlleles; j++ ) { - final int[] newCounts = alleleCounts.clone(); - - // split these cases so we don't lose on the floor (since we divided by 2) - if ( i == j ) { - newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemove); - } else { - newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemovePerAllele); - newCounts[j] = Math.max(0, newCounts[j] - numReadsToRemovePerAllele); - } - - final int score = scoreAlleleCounts(newCounts); - - if ( score < maxScore ) { - maxScore = score; - alleleCountsOfMax = newCounts; - } - } - } - - return alleleCountsOfMax; - } - - /** - * Performs allele biased down-sampling on a pileup and computes the list of elements to remove - * - * @param elements original list of pileup elements - * @param originalElementCount original count of elements (taking reduced reads into account) - * @param numElementsToRemove the number of records to remove - * @return the list of pileup elements TO REMOVE - */ - protected static List downsampleElements(final List elements, final int originalElementCount, final int numElementsToRemove) { - // are there no elements to remove? - if ( numElementsToRemove == 0 ) - return Collections.emptyList(); - - final ArrayList elementsToRemove = new ArrayList(numElementsToRemove); - - // should we remove all of the elements? 
- if ( numElementsToRemove >= originalElementCount ) { - elementsToRemove.addAll(elements); - return elementsToRemove; - } - - // create a bitset describing which elements to remove - final BitSet itemsToRemove = new BitSet(originalElementCount); - for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) { - itemsToRemove.set(selectedIndex); - } - - int currentBitSetIndex = 0; - for ( final PileupElement element : elements ) { - - final int representativeCount = element.getRepresentativeCount(); - - // if it's a reduced read, we need to be smart about how we down-sample - if ( representativeCount > 1 ) { - // count how many bits are set over the span represented by this read - int setBits = 0; - for ( int i = 0; i < representativeCount; i++ ) - setBits += itemsToRemove.get(currentBitSetIndex++) ? 1 : 0; - - // remove that count from the count of the reduced read - if ( setBits == representativeCount ) - elementsToRemove.add(element); - else - element.adjustRepresentativeCount(-1 * setBits); - } - // otherwise it's trivial: remove if the corresponding bit is set - else if ( itemsToRemove.get(currentBitSetIndex++) ) { - elementsToRemove.add(element); - } - } - - return elementsToRemove; - } - - /** - * Computes reads to remove based on an allele biased down-sampling - * - * @param alleleReadMap original list of records per allele - * @param downsamplingFraction the fraction of total reads to remove per allele - * @return list of reads TO REMOVE from allele biased down-sampling - */ - public static List selectAlleleBiasedReads(final Map> alleleReadMap, final double downsamplingFraction) { - int totalReads = 0; - for ( final List reads : alleleReadMap.values() ) - totalReads += reads.size(); - - int numReadsToRemove = (int)(totalReads * downsamplingFraction); - - // make a listing of allele counts - final List alleles = new ArrayList(alleleReadMap.keySet()); - alleles.remove(Allele.NO_CALL); // ignore 
the no-call bin - final int numAlleles = alleles.size(); - - // TODO -- if we ever decide to make this work for reduced reads, this will need to use the representative counts instead - final int[] alleleCounts = new int[numAlleles]; - for ( int i = 0; i < numAlleles; i++ ) - alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size(); - - // do smart down-sampling - final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); - - final List readsToRemove = new ArrayList(numReadsToRemove); - for ( int i = 0; i < numAlleles; i++ ) { - if ( alleleCounts[i] > targetAlleleCounts[i] ) { - readsToRemove.addAll(downsampleElements(alleleReadMap.get(alleles.get(i)), alleleCounts[i] - targetAlleleCounts[i])); - } - } - - return readsToRemove; - } - - /** - * Performs allele biased down-sampling on a pileup and computes the list of elements to remove - * - * @param reads original list of records - * @param numElementsToRemove the number of records to remove - * @return the list of pileup elements TO REMOVE - */ - protected static List downsampleElements(final List reads, final int numElementsToRemove) { - // are there no elements to remove? - if ( numElementsToRemove == 0 ) - return Collections.emptyList(); - - final ArrayList elementsToRemove = new ArrayList(numElementsToRemove); - final int originalElementCount = reads.size(); - - // should we remove all of the elements? 
- if ( numElementsToRemove >= originalElementCount ) { - elementsToRemove.addAll(reads); - return elementsToRemove; - } - - // create a bitset describing which elements to remove - final BitSet itemsToRemove = new BitSet(originalElementCount); - for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) { - itemsToRemove.set(selectedIndex); - } - - int currentBitSetIndex = 0; - for ( final GATKSAMRecord read : reads ) { - if ( read.isReducedRead() ) - throw new IllegalStateException("Allele-biased downsampling of reduced reads has not been implemented for a list of GATKSAMRecords"); - - if ( itemsToRemove.get(currentBitSetIndex++) ) - elementsToRemove.add(read); - } - - return elementsToRemove; - } - - /** - * Create sample-contamination maps from file - * - * @param ContaminationFractionFile Filename containing two columns: SampleID and Contamination - * @param AvailableSampleIDs Set of Samples of interest (no reason to include every sample in file) or null to turn off checking - * @param logger for logging output - * @return sample-contamination Map - */ - - public static DefaultHashMap loadContaminationFile(File ContaminationFractionFile, final Double defaultContaminationFraction, final Set AvailableSampleIDs, Logger logger) throws StingException { - DefaultHashMap sampleContamination = new DefaultHashMap(defaultContaminationFraction); - Set nonSamplesInContaminationFile = new HashSet(sampleContamination.keySet()); - try { - - XReadLines reader = new XReadLines(ContaminationFractionFile, true); - for (String line : reader) { - - if (line.length() == 0) { - continue; - } - - StringTokenizer st = new StringTokenizer(line,"\t"); - - String fields[] = new String[2]; - try { - fields[0] = st.nextToken(); - fields[1] = st.nextToken(); - } catch(NoSuchElementException e){ - throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. 
Offending line:\n" + line); - } - if(st.hasMoreTokens()) { - throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. Offending line:\n" + line); - } - - if (fields[0].length() == 0 || fields[1].length() == 0) { - throw new UserException.MalformedFile("Contamination file can not have empty strings in either column. Offending line:\n" + line); - } - - if (sampleContamination.containsKey(fields[0])) { - throw new UserException.MalformedFile("Contamination file contains duplicate entries for input name " + fields[0]); - } - - try { - final Double contamination = Double.valueOf(fields[1]); - if (contamination < 0 || contamination > 1){ - throw new UserException.MalformedFile("Contamination file contains unacceptable contamination value (must be 0<=x<=1): " + line); - } - if (AvailableSampleIDs==null || AvailableSampleIDs.contains(fields[0])) {// only add samples if they are in the sampleSet (or if it is null) - sampleContamination.put(fields[0], contamination); - } - else { - nonSamplesInContaminationFile.add(fields[0]); - } - } catch (NumberFormatException e) { - throw new UserException.MalformedFile("Contamination file contains unparsable double in the second field. 
Offending line: " + line); - } - } - - - //output to the user info lines telling which samples are in the Contamination File - if (sampleContamination.size() > 0) { - logger.info(String.format("The following samples were found in the Contamination file and will be processed at the contamination level therein: %s", sampleContamination.keySet().toString())); - - //output to the user info lines telling which samples are NOT in the Contamination File - if(AvailableSampleIDs!=null){ - Set samplesNotInContaminationFile = new HashSet(AvailableSampleIDs); - samplesNotInContaminationFile.removeAll(sampleContamination.keySet()); - if (samplesNotInContaminationFile.size() > 0) - logger.info(String.format("The following samples were NOT found in the Contamination file and will be processed at the default contamination level: %s", samplesNotInContaminationFile.toString())); - } - } - - //output to the user Samples that do not have lines in the Contamination File - if (nonSamplesInContaminationFile.size() > 0) { - logger.info(String.format("The following entries were found in the Contamination file but were not SAMPLEIDs. 
They will be ignored: %s", nonSamplesInContaminationFile.toString())); - } - - return sampleContamination; - - } catch (IOException e) { - throw new StingException("I/O Error while reading sample-contamination file " + ContaminationFractionFile.getName() + ": " + e.getMessage()); - } - - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java deleted file mode 100644 index 466ade1ed..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java +++ /dev/null @@ -1,172 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.downsampling; - -import org.broadinstitute.sting.utils.locusiterator.AlignmentStateMachine; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.Collection; -import java.util.List; - -/** - * The basic downsampler API, with no reads-specific operations. - * - * Downsamplers that extend this class rather than the ReadsDownsampler class can handle - * any kind of item, however they cannot be wrapped within a DownsamplingReadsIterator or a - * PerSampleDownsamplingReadsIterator. - * - * @author David Roazen - */ -public abstract class Downsampler { - - /** - * Number of items discarded by this downsampler since the last call to resetStats() - */ - protected int numDiscardedItems = 0; - - /** - * Submit one item to the downsampler for consideration. Some downsamplers will be able to determine - * immediately whether the item survives the downsampling process, while others will need to see - * more items before making that determination. - * - * @param item the individual item to submit to the downsampler for consideration - */ - public abstract void submit( final T item ); - - /** - * Submit a collection of items to the downsampler for consideration. Should be equivalent to calling - * submit() on each individual item in the collection. - * - * @param items the collection of items to submit to the downsampler for consideration - */ - public void submit( final Collection items ) { - if ( items == null ) { - throw new IllegalArgumentException("submitted items must not be null"); - } - - for ( final T item : items ) { - submit(item); - } - } - - /** - * Are there items that have survived the downsampling process waiting to be retrieved? - * - * @return true if this downsampler has > 0 finalized items, otherwise false - */ - public abstract boolean hasFinalizedItems(); - - /** - * Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved. 
- * - * @return a list of all finalized items this downsampler contains, or an empty list if there are none - */ - public abstract List consumeFinalizedItems(); - - /** - * Are there items stored in this downsampler that it doesn't yet know whether they will - * ultimately survive the downsampling process? - * - * @return true if this downsampler has > 0 pending items, otherwise false - */ - public abstract boolean hasPendingItems(); - - /** - * Peek at the first finalized item stored in this downsampler (or null if there are no finalized items) - * - * @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call), - * or null if there are none - */ - public abstract T peekFinalized(); - - /** - * Peek at the first pending item stored in this downsampler (or null if there are no pending items) - * - * @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call), - * or null if there are none - */ - public abstract T peekPending(); - - /** - * Get the current number of items in this downsampler - * - * This should be the best estimate of the total number of elements that will come out of the downsampler - * were consumeFinalizedItems() to be called immediately after this call. In other words it should - * be number of finalized items + estimate of number of pending items that will ultimately be included as well. - * - * @return a positive integer - */ - public abstract int size(); - - /** - * Returns the number of items discarded (so far) during the downsampling process - * - * @return the number of items that have been submitted to this downsampler and discarded in the process of - * downsampling - */ - public int getNumberOfDiscardedItems() { - return numDiscardedItems; - } - - /** - * Used to tell the downsampler that no more items will be submitted to it, and that it should - * finalize any pending items. 
- */ - public abstract void signalEndOfInput(); - - /** - * Empty the downsampler of all finalized/pending items - */ - public abstract void clearItems(); - - /** - * Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items - */ - public void resetStats() { - numDiscardedItems = 0; - } - - /** - * Indicates whether an item should be excluded from elimination during downsampling. By default, - * all items representing reduced reads are excluded from downsampling, but individual downsamplers - * may override if they are able to handle reduced reads correctly. Downsamplers should check - * the return value of this method before discarding an item. - * - * @param item The item to test - * @return true if the item should not be subject to elimination during downsampling, otherwise false - */ - protected boolean doNotDiscardItem( final Object item ) { - // Use getClass() rather than instanceof for performance reasons. Ugly but fast. - if ( item.getClass() == GATKSAMRecord.class ) { - return ((GATKSAMRecord)item).isReducedRead(); - } - else if ( item.getClass() == AlignmentStateMachine.class ) { - return ((AlignmentStateMachine)item).isReducedRead(); - } - - return false; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java deleted file mode 100644 index 7077db49c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ /dev/null @@ -1,463 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons 
to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.executive; - -import com.google.java.contract.Ensures; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.ReadMetrics; -import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; -import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.io.OutputTracker; -import org.broadinstitute.sting.gatk.iterators.NullSAMIterator; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.gatk.traversals.*; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.AutoFormattingTime; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; -import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; - -import 
javax.management.JMException; -import javax.management.MBeanServer; -import javax.management.ObjectName; -import java.io.File; -import java.lang.management.ManagementFactory; -import java.util.*; - - -/** - * Created by IntelliJ IDEA. - * User: mhanna - * Date: Apr 26, 2009 - * Time: 12:37:23 PM - * - * General base class for all scheduling algorithms - * Shards and schedules data in manageable chunks. - * - * Creates N TraversalEngines for each data thread for the MicroScheduler. This is necessary - * because in the HMS case you have multiple threads executing a traversal engine independently, and - * these engines may need to create separate resources for efficiency or implementation reasons. For example, - * the nanoScheduler creates threads to implement the traversal, and this creation is instance specific. - * So each HMS thread needs to have it's own distinct copy of the traversal engine if it wants to have - * N data threads x M nano threads => N * M threads total. These are borrowed from this microscheduler - * and returned when done. Also allows us to tracks all created traversal engines so this microscheduler - * can properly shut them all down when the scheduling is done. - * - */ -public abstract class MicroScheduler implements MicroSchedulerMBean { - protected static final Logger logger = Logger.getLogger(MicroScheduler.class); - - /** - * The list of all Traversal engines we've created in this micro scheduler - */ - final List allCreatedTraversalEngines = new LinkedList(); - - /** - * All available engines. Engines are borrowed and returned when a subclass is actually - * going to execute the engine on some data. This allows us to have N copies for - * N data parallel executions, but without the dangerous code of having local - * ThreadLocal variables. - */ - final LinkedList availableTraversalEngines = new LinkedList(); - - /** - * Engines that have been allocated to a key already. 
- */ - final HashMap allocatedTraversalEngines = new HashMap(); - - /** - * Counts the number of instances of the class that are currently alive. - */ - private static int instanceNumber = 0; - - /** - * The engine invoking this scheduler. - */ - protected final GenomeAnalysisEngine engine; - - protected final IndexedFastaSequenceFile reference; - - private final SAMDataSource reads; - protected final Collection rods; - - private final MBeanServer mBeanServer; - private final ObjectName mBeanName; - - /** - * Threading efficiency monitor for tracking the resource utilization of the GATK - * - * may be null - */ - ThreadEfficiencyMonitor threadEfficiencyMonitor = null; - - /** - * MicroScheduler factory function. Create a microscheduler appropriate for reducing the - * selected walker. - * - * @param walker Which walker to use. - * @param reads the informations associated with the reads - * @param reference the reference file - * @param rods the rods to include in the traversal - * @param threadAllocation Number of threads to utilize. - * - * @return The best-fit microscheduler. 
- */ - public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { - if ( threadAllocation.isRunningInParallelMode() ) { - logger.info(String.format("Running the GATK in parallel mode with %d total threads, " + - "%d CPU thread(s) for each of %d data thread(s), of %d processors available on this machine", - threadAllocation.getTotalNumThreads(), - threadAllocation.getNumCPUThreadsPerDataThread(), - threadAllocation.getNumDataThreads(), - Runtime.getRuntime().availableProcessors())); - if ( threadAllocation.getTotalNumThreads() > Runtime.getRuntime().availableProcessors() ) - logger.warn(String.format("Number of requested GATK threads %d is more than the number of " + - "available processors on this machine %d", threadAllocation.getTotalNumThreads(), - Runtime.getRuntime().availableProcessors())); - } - - if ( threadAllocation.getNumDataThreads() > 1 ) { - if (walker.isReduceByInterval()) - throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); - - if ( ! (walker instanceof TreeReducible) ) { - throw badNT("nt", engine, walker); - } - } - - if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! 
(walker instanceof NanoSchedulable) ) { - throw badNT("nct", engine, walker); - } - - if ( threadAllocation.getNumDataThreads() > 1 ) { - return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); - } else { - return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); - } - } - - private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) { - throw new UserException.BadArgumentValue(parallelArg, - String.format("The analysis %s currently does not support parallel execution with %s. " + - "Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg)); - } - - /** - * Create a microscheduler given the reads and reference. - * - * @param walker the walker to execute with - * @param reads The reads. - * @param reference The reference. - * @param rods the rods to include in the traversal - * @param threadAllocation the allocation of threads to use in the underlying traversal - */ - protected MicroScheduler(final GenomeAnalysisEngine engine, - final Walker walker, - final SAMDataSource reads, - final IndexedFastaSequenceFile reference, - final Collection rods, - final ThreadAllocation threadAllocation) { - this.engine = engine; - this.reads = reads; - this.reference = reference; - this.rods = rods; - - final File progressLogFile = engine.getArguments() == null ? null : engine.getArguments().performanceLog; - - // Creates uninitialized TraversalEngines appropriate for walker and threadAllocation, - // and adds it to the list of created engines for later shutdown. 
- for ( int i = 0; i < threadAllocation.getNumDataThreads(); i++ ) { - final TraversalEngine traversalEngine = createTraversalEngine(walker, threadAllocation); - allCreatedTraversalEngines.add(traversalEngine); - availableTraversalEngines.add(traversalEngine); - } - - // Create the progress meter, and register it with the analysis engine - engine.registerProgressMeter(new ProgressMeter(progressLogFile, - availableTraversalEngines.peek().getTraversalUnits(), - engine.getRegionsOfGenomeBeingProcessed())); - - // Now that we have a progress meter, go through and initialize the traversal engines - for ( final TraversalEngine traversalEngine : allCreatedTraversalEngines ) - traversalEngine.initialize(engine, walker, engine.getProgressMeter()); - - // JMX does not allow multiple instances with the same ObjectName to be registered with the same platform MXBean. - // To get around this limitation and since we have no job identifier at this point, register a simple counter that - // will count the number of instances of this object that have been created in this JVM. 
- int thisInstance = instanceNumber++; - mBeanServer = ManagementFactory.getPlatformMBeanServer(); - try { - mBeanName = new ObjectName("org.broadinstitute.sting.gatk.executive:type=MicroScheduler,instanceNumber="+thisInstance); - mBeanServer.registerMBean(this, mBeanName); - } - catch (JMException ex) { - throw new ReviewedStingException("Unable to register microscheduler with JMX", ex); - } - } - - /** - * Really make us a traversal engine of the appropriate type for walker and thread allocation - * - * @return a non-null uninitialized traversal engine - */ - @Ensures("result != null") - private TraversalEngine createTraversalEngine(final Walker walker, final ThreadAllocation threadAllocation) { - if (walker instanceof ReadWalker) { - return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); - } else if (walker instanceof LocusWalker) { - return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); - } else if (walker instanceof DuplicateWalker) { - return new TraverseDuplicates(); - } else if (walker instanceof ReadPairWalker) { - return new TraverseReadPairs(); - } else if (walker instanceof ActiveRegionWalker) { - return new TraverseActiveRegions(threadAllocation.getNumCPUThreadsPerDataThread()); - } else { - throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); - } - } - - - /** - * Return the ThreadEfficiencyMonitor we are using to track our resource utilization, if there is one - * - * @return the monitor, or null if none is active - */ - public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { - return threadEfficiencyMonitor; - } - - /** - * Inform this Microscheduler to use the efficiency monitor used to create threads in subclasses - * - * @param threadEfficiencyMonitor - */ - public void setThreadEfficiencyMonitor(final ThreadEfficiencyMonitor threadEfficiencyMonitor) { - this.threadEfficiencyMonitor = threadEfficiencyMonitor; - } - - /** - * Should we 
stop all execution work and exit gracefully? - * - * Returns true in the case where some external signal or time limit has been received, indicating - * that this GATK shouldn't continue executing. This isn't a kill signal, it is really a "shutdown - * gracefully at the next opportunity" signal. Concrete implementations of the MicroScheduler - * examine this value as often as reasonable and, if it returns true, stop what they are doing - * at the next available opportunity, shutdown their resources, call notify done, and return. - * - * @return true if we should abort execution, or false otherwise - */ - protected boolean abortExecution() { - final boolean abort = engine.exceedsRuntimeLimit(); - if ( abort ) { - final AutoFormattingTime aft = new AutoFormattingTime(engine.getRuntimeLimitInNanoseconds(), -1, 4); - logger.info("Aborting execution (cleanly) because the runtime has exceeded the requested maximum " + aft); - } - return abort; - } - - /** - * Walks a walker over the given list of intervals. - * - * @param walker Computation to perform over dataset. - * @param shardStrategy A strategy for sharding the data. - * - * @return the return type of the walker - */ - public abstract Object execute(Walker walker, Iterable shardStrategy); - - /** - * Tells this MicroScheduler that the execution of one of the subclass of this object as started - * - * Must be called when the implementation of execute actually starts up - * - * Currently only starts the progress meter timer running, but other start up activities could be incorporated - */ - protected void startingExecution() { - engine.getProgressMeter().start(); - } - - /** - * Retrieves the object responsible for tracking and managing output. - * @return An output tracker, for loading data in and extracting results. Will not be null. - */ - public abstract OutputTracker getOutputTracker(); - - /** - * Gets the an iterator over the given reads, which will iterate over the reads in the given shard. 
- * @param shard the shard to use when querying reads. - * @return an iterator over the reads specified in the shard. - */ - protected StingSAMIterator getReadIterator(Shard shard) { - return (!reads.isEmpty()) ? reads.seek(shard) : new NullSAMIterator(); - } - - /** - * Must be called by subclasses when execute is done - */ - protected void executionIsDone() { - engine.getProgressMeter().notifyDone(engine.getCumulativeMetrics().getNumIterations()); - printReadFilteringStats(); - shutdownTraversalEngines(); - - // Print out the threading efficiency of this HMS, if state monitoring is enabled - if ( threadEfficiencyMonitor != null ) { - // include the master thread information - threadEfficiencyMonitor.threadIsDone(Thread.currentThread()); - threadEfficiencyMonitor.printUsageInformation(logger); - } - } - - /** - * Shutdown all of the created engines, and clear the list of created engines, dropping - * pointers to the traversal engines - */ - public synchronized void shutdownTraversalEngines() { - for ( final TraversalEngine te : allCreatedTraversalEngines) - te.shutdown(); - - allCreatedTraversalEngines.clear(); - availableTraversalEngines.clear(); - } - - /** - * Prints out information about number of reads observed and filtering, if any reads were used in the traversal - * - * Looks like: - * - * INFO 10:40:47,370 MicroScheduler - 22 reads were filtered out during traversal out of 101 total (21.78%) - * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing BadMateFilter - * INFO 10:40:47,370 MicroScheduler - -> 20 reads (19.80% of total) failing DuplicateReadFilter - * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing FailsVendorQualityCheckFilter - */ - private void printReadFilteringStats() { - final ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics(); - if ( cumulativeMetrics.getNumReadsSeen() > 0 ) { - // count up the number of skipped reads by summing over all filters - long nSkippedReads = 0L; - for ( final 
long countsByFilter : cumulativeMetrics.getCountsByFilter().values()) - nSkippedReads += countsByFilter; - - logger.info(String.format("%d reads were filtered out during the traversal out of approximately %d total reads (%.2f%%)", - nSkippedReads, - cumulativeMetrics.getNumReadsSeen(), - 100.0 * MathUtils.ratio(nSkippedReads, cumulativeMetrics.getNumReadsSeen()))); - - for ( final Map.Entry filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) { - long count = filterCounts.getValue(); - logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s", - count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), filterCounts.getKey())); - } - } - } - - /** - * Gets the engine that created this microscheduler. - * @return The engine owning this microscheduler. - */ - public GenomeAnalysisEngine getEngine() { return engine; } - - /** - * Returns data source maintained by this scheduler - * @return - */ - public SAMDataSource getSAMDataSource() { return reads; } - - /** - * Returns the reference maintained by this scheduler. - * @return The reference maintained by this scheduler. - */ - public IndexedFastaSequenceFile getReference() { return reference; } - - protected void cleanup() { - try { - mBeanServer.unregisterMBean(mBeanName); - } - catch (JMException ex) { - throw new ReviewedStingException("Unable to unregister microscheduler with JMX", ex); - } - } - - /** - * Returns a traversal engine suitable for use, associated with key - * - * Key is an arbitrary object that is used to retrieve the same traversal - * engine over and over. This can be important in the case where the - * traversal engine has data associated with it in some other context, - * and we need to ensure that the context always sees the same traversal - * engine. This happens in the HierarchicalMicroScheduler, where you want - * the a thread executing traversals to retrieve the same engine each time, - * as outputs are tracked w.r.t. that engine. 
- * - * If no engine is associated with key yet, pops the next available engine - * from the available ones maintained by this - * microscheduler. Note that it's a runtime error to pop a traversal engine - * from this scheduler if there are none available. Callers that - * once pop'd an engine for use must return it with returnTraversalEngine - * - * @param key the key to associate with this engine - * @return a non-null TraversalEngine suitable for execution in this scheduler - */ - @Ensures("result != null") - protected synchronized TraversalEngine borrowTraversalEngine(final Object key) { - if ( key == null ) throw new IllegalArgumentException("key cannot be null"); - - final TraversalEngine engine = allocatedTraversalEngines.get(key); - if ( engine == null ) { - if ( availableTraversalEngines.isEmpty() ) - throw new IllegalStateException("no traversal engines were available"); - allocatedTraversalEngines.put(key, availableTraversalEngines.pop()); - return allocatedTraversalEngines.get(key); - } else { - return engine; - } - } - - /** - * Return a borrowed traversal engine to this MicroScheduler, for later use - * in another traversal execution - * - * @param key the key used to id the engine, provided to the borrowTraversalEngine function - * @param traversalEngine the borrowed traversal engine. Must have been previously borrowed. - */ - protected synchronized void returnTraversalEngine(final Object key, final TraversalEngine traversalEngine) { - if ( traversalEngine == null ) - throw new IllegalArgumentException("Attempting to push a null traversal engine"); - if ( ! allCreatedTraversalEngines.contains(traversalEngine) ) - throw new IllegalArgumentException("Attempting to push a traversal engine not created by this MicroScheduler" + engine); - if ( ! 
allocatedTraversalEngines.containsKey(key) ) - throw new IllegalArgumentException("No traversal engine was never checked out with key " + key); - - // note there's nothing to actually do here, but a function implementation - // might want to do something - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java deleted file mode 100644 index 5a1b015fe..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ /dev/null @@ -1,468 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.refdata; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.util.*; - -/** - * This class represents the Reference Metadata available at a particular site in the genome. It can be - * used to conveniently lookup the RMDs at this site, as well just getting a list of all of the RMDs - * - * The standard interaction model is: - * - * Traversal system arrives at a site, which has a bunch of RMDs covering it - * Traversal passes creates a tracker and passes it to the walker - * walker calls get(rodBinding) to obtain the RMDs values at this site for the track - * associated with rodBinding. - * - * Note that this is an immutable class. 
Once created the underlying data structures - * cannot be modified - * - * User: mdepristo - * Date: Apr 3, 2009 - * Time: 3:05:23 PM - */ -public class RefMetaDataTracker { - // TODO: this should be a list, not a bindings, actually - private final static RODRecordList EMPTY_ROD_RECORD_LIST = new RODRecordListImpl("EMPTY"); - - final Map bindings; - final protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class); - public final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker(); - - // ------------------------------------------------------------------------------------------ - // - // - // Special ENGINE interaction functions - // - // - // ------------------------------------------------------------------------------------------ - - /** - * Create an tracker with no bindings - */ - public RefMetaDataTracker() { - bindings = Collections.emptyMap(); - } - - public RefMetaDataTracker(final Collection allBindings) { - // set up the bindings - if ( allBindings.isEmpty() ) - bindings = Collections.emptyMap(); - else { - final Map tmap = new HashMap(allBindings.size()); - for ( RODRecordList rod : allBindings ) { - if ( rod != null && ! rod.isEmpty() ) - tmap.put(canonicalName(rod.getName()), rod); - } - - // ensure that no one modifies the bindings itself - bindings = Collections.unmodifiableMap(tmap); - } - } - - // ------------------------------------------------------------------------------------------ - // - // - // Generic accessors - // - // - // ------------------------------------------------------------------------------------------ - - /** - * Gets all of the Tribble features spanning this locus, returning them as a list of specific - * type T extending Feature. This function looks across all tracks to find the Features, so - * if you have two tracks A and B each containing 1 Feature, then getValues will return - * a list containing both features. 
- * - * Note that this function assumes that all of the bound features are instances of or - * subclasses of T. A ClassCastException will occur if this isn't the case. If you want - * to get all Features without any danger of such an exception use the root Tribble - * interface Feature. - * - * @param type The type of the underlying objects bound here - * @param as above - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. - */ - @Requires({"type != null"}) - @Ensures("result != null") - public List getValues(final Class type) { - return addValues(bindings.keySet(), type, new ArrayList(), null, false, false); - } - - /** - * Provides the same functionality as @link #getValues(Class) but will only include - * Features that start as the GenomeLoc provide onlyAtThisLoc. - * - * @param type The type of the underlying objects bound here - * @param onlyAtThisLoc - * @param as above - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. - */ - @Requires({"type != null", "onlyAtThisLoc != null"}) - @Ensures("result != null") - public List getValues(final Class type, final GenomeLoc onlyAtThisLoc) { - return addValues(bindings.keySet(), type, new ArrayList(), onlyAtThisLoc, true, false); - } - - /** - * Uses the same logic as @link #getValues(Class) but arbitrary select one of the resulting - * elements of the list to return. That is, if there would be two elements in the result of - * @link #getValues(Class), one of these two is selected, and which one it will be isn't - * specified. Consequently, this method is only really safe if (1) you absolutely know - * that only one binding will meet the constraints of @link #getValues(Class) or (2) - * you truly don't care which of the multiple bindings available you are going to examine. 
- * - * If there are no bindings here, getFirstValue() return null - * - * @param type The type of the underlying objects bound here - * @param as above - * @return A random single element the RODs bound here, or null if none are bound. - */ - @Requires({"type != null"}) - public T getFirstValue(final Class type) { - return safeGetFirst(getValues(type)); - } - - /** - * Uses the same logic as @link #getValue(Class,GenomeLoc) to determine the list - * of eligible Features and @link #getFirstValue(Class) to select a single - * element from the interval list. - * - * @param type The type of the underlying objects bound here - * @param as above - * @param onlyAtThisLoc only Features starting at this site are considered - * @return A random single element the RODs bound here starting at onlyAtThisLoc, or null if none are bound. - */ - @Requires({"type != null", "onlyAtThisLoc != null"}) - public T getFirstValue(final Class type, final GenomeLoc onlyAtThisLoc) { - return safeGetFirst(getValues(type, onlyAtThisLoc)); - - } - - /** - * Gets all of the Tribble features bound to RodBinding spanning this locus, returning them as - * a list of specific type T extending Feature. - * - * Note that this function assumes that all of the bound features are instances of or - * subclasses of T. A ClassCastException will occur if this isn't the case. - * - * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched - * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. 
- */ - @Requires({"rodBinding != null"}) - @Ensures("result != null") - public List getValues(final RodBinding rodBinding) { - return addValues(rodBinding.getName(), rodBinding.getType(), new ArrayList(1), getTrackDataByName(rodBinding), null, false, false); - } - - /** - * Gets all of the Tribble features bound to any RodBinding in rodBindings, - * spanning this locus, returning them as a list of specific type T extending Feature. - * - * Note that this function assumes that all of the bound features are instances of or - * subclasses of T. A ClassCastException will occur if this isn't the case. - * - * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched - * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. - */ - @Requires({"rodBindings != null"}) - @Ensures("result != null") - public List getValues(final Collection> rodBindings) { - List results = new ArrayList(1); - for ( RodBinding rodBinding : rodBindings ) - results.addAll(getValues(rodBinding)); - return results; - } - - /** - * The same logic as @link #getValues(RodBinding) but enforces that each Feature start at onlyAtThisLoc - * - * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched - * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features - * @param onlyAtThisLoc only Features starting at this site are considered - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. 
- */ - @Requires({"rodBinding != null", "onlyAtThisLoc != null"}) - @Ensures("result != null") - public List getValues(final RodBinding rodBinding, final GenomeLoc onlyAtThisLoc) { - return addValues(rodBinding.getName(), rodBinding.getType(), new ArrayList(1), getTrackDataByName(rodBinding), onlyAtThisLoc, true, false); - } - - /** - * The same logic as @link #getValues(List) but enforces that each Feature start at onlyAtThisLoc - * - * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched - * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features - * @param onlyAtThisLoc only Features starting at this site are considered - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. - */ - @Requires({"rodBindings != null", "onlyAtThisLoc != null"}) - @Ensures("result != null") - public List getValues(final Collection> rodBindings, final GenomeLoc onlyAtThisLoc) { - List results = new ArrayList(1); - for ( RodBinding rodBinding : rodBindings ) - results.addAll(getValues(rodBinding, onlyAtThisLoc)); - return results; - } - - /** - * Uses the same logic as @getValues(RodBinding) to determine the list - * of eligible Features and select a single element from the resulting set - * of eligible features. - * - * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched - * @param as above - * @return A random single element the eligible Features found, or null if none are bound. 
- */ - @Requires({"rodBinding != null"}) - public T getFirstValue(final RodBinding rodBinding) { - return safeGetFirst(addValues(rodBinding.getName(), rodBinding.getType(), null, getTrackDataByName(rodBinding), null, false, true)); - } - - /** - * Uses the same logic as @getValues(RodBinding, GenomeLoc) to determine the list - * of eligible Features and select a single element from the resulting set - * of eligible features. - * - * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched - * @param as above - * @param onlyAtThisLoc only Features starting at this site are considered - * @return A random single element the eligible Features found, or null if none are bound. - */ - @Requires({"rodBinding != null", "onlyAtThisLoc != null"}) - public T getFirstValue(final RodBinding rodBinding, final GenomeLoc onlyAtThisLoc) { - return safeGetFirst(addValues(rodBinding.getName(), rodBinding.getType(), null, getTrackDataByName(rodBinding), onlyAtThisLoc, true, true)); - } - - /** - * Uses the same logic as @getValues(List) to determine the list - * of eligible Features and select a single element from the resulting set - * of eligible features. - * - * @param rodBindings Only Features coming from the tracks associated with these rodBindings are fetched - * @param as above - * @return A random single element the eligible Features found, or null if none are bound. - */ - @Requires({"rodBindings != null"}) - public T getFirstValue(final Collection> rodBindings) { - for ( RodBinding rodBinding : rodBindings ) { - T val = getFirstValue(rodBinding); - if ( val != null ) - return val; - } - return null; - } - - /** - * Uses the same logic as @getValues(RodBinding,GenomeLoc) to determine the list - * of eligible Features and select a single element from the resulting set - * of eligible features. 
- * - * @param rodBindings Only Features coming from the tracks associated with these rodBindings are fetched - * @param as above - * @param onlyAtThisLoc only Features starting at this site are considered - * @return A random single element the eligible Features found, or null if none are bound. - */ - @Requires({"rodBindings != null", "onlyAtThisLoc != null"}) - public T getFirstValue(final Collection> rodBindings, final GenomeLoc onlyAtThisLoc) { - for ( RodBinding rodBinding : rodBindings ) { - T val = getFirstValue(rodBinding, onlyAtThisLoc); - if ( val != null ) - return val; - } - return null; - } - - /** - * Is there a binding at this site to a ROD/track with the specified name? - * - * @param rodBinding the rod binding we want to know about - * @return true if any Features are bound in this tracker to rodBinding - */ - @Requires({"rodBinding != null"}) - public boolean hasValues(final RodBinding rodBinding) { - return bindings.containsKey(canonicalName(rodBinding.getName())); - } - - /** - * Get all of the RMD tracks at the current site. Each track is returned as a single compound - * object (RODRecordList) that may contain multiple RMD records associated with the current site. 
- * - * @return List of all tracks - */ - public List getBoundRodTracks() { - return new ArrayList(bindings.values()); - } - - /** - * The number of tracks with at least one value bound here - * @return the number of tracks with at least one bound Feature - */ - public int getNTracksWithBoundFeatures() { - return bindings.size(); - } - - // ------------------------------------------------------------------------------------------ - // Protected accessors using strings for unit testing - // ------------------------------------------------------------------------------------------ - - protected boolean hasValues(final String name) { - return bindings.containsKey(canonicalName(name)); - } - - protected List getValues(final Class type, final String name) { - return addValues(name, type, new ArrayList(), getTrackDataByName(name), null, false, false); - } - - protected List getValues(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { - return addValues(name, type, new ArrayList(), getTrackDataByName(name), onlyAtThisLoc, true, false); - } - - protected T getFirstValue(final Class type, final String name) { - return safeGetFirst(getValues(type, name)); - } - - protected T getFirstValue(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { - return safeGetFirst(getValues(type, name, onlyAtThisLoc)); - } - - // ------------------------------------------------------------------------------------------ - // - // - // Private utility functions - // - // - // ------------------------------------------------------------------------------------------ - - /** - * Helper function for getFirst() operations that takes a list of and - * returns the first element, or null if no such element exists. - * - * @param l - * @param - * @return - */ - @Requires({"l != null"}) - private T safeGetFirst(final List l) { - return l.isEmpty() ? 
null : l.get(0); - } - - private List addValues(final Collection names, - final Class type, - List values, - final GenomeLoc curLocation, - final boolean requireStartHere, - final boolean takeFirstOnly ) { - for ( String name : names ) { - RODRecordList rodList = getTrackDataByName(name); // require that the name is an exact match - values = addValues(name, type, values, rodList, curLocation, requireStartHere, takeFirstOnly ); - if ( takeFirstOnly && ! values.isEmpty() ) - break; - } - - return values; - } - - - - private List addValues(final String name, - final Class type, - List values, - final RODRecordList rodList, - final GenomeLoc curLocation, - final boolean requireStartHere, - final boolean takeFirstOnly ) { - for ( GATKFeature rec : rodList ) { - if ( ! requireStartHere || rec.getLocation().getStart() == curLocation.getStart() ) { // ok, we are going to keep this thing - Object obj = rec.getUnderlyingObject(); - if (!(type.isAssignableFrom(obj.getClass()))) - throw new UserException.CommandLineException("Unable to cast track named " + name + " to type of " + type.toString() - + " it's of type " + obj.getClass()); - - T objT = (T)obj; - if ( takeFirstOnly ) { - if ( values == null ) - values = Arrays.asList(objT); - else - values.add(objT); - - break; - } else { - if ( values == null ) - values = new ArrayList(); - values.add(objT); - } - } - } - - return values == null ? Collections.emptyList() : values; - } - - /** - * Finds the reference metadata track named 'name' and returns all ROD records from that track associated - * with the current site as a RODRecordList List object. If no data track with specified name is available, - * returns defaultValue wrapped as RODRecordList object. 
NOTE: if defaultValue is null, it will be wrapped up - * with track name set to 'name' and location set to null; otherwise the wrapper object will have name and - * location set to defaultValue.getID() and defaultValue.getLocation(), respectively (use caution, - * defaultValue.getLocation() may be not equal to what RODRecordList's location would be expected to be otherwise: - * for instance, on locus traversal, location is usually expected to be a single base we are currently looking at, - * regardless of the presence of "extended" RODs overlapping with that location). - * @param name track name - * @return track data for the given rod - */ - private RODRecordList getTrackDataByName(final String name) { - final String luName = canonicalName(name); - RODRecordList l = bindings.get(luName); - return l == null ? EMPTY_ROD_RECORD_LIST : l; - } - - private RODRecordList getTrackDataByName(final RodBinding binding) { - return getTrackDataByName(binding.getName()); - } - - /** - * Returns the canonical name of the rod name (lowercases it) - * @param name the name of the rod - * @return canonical name of the rod - */ - private String canonicalName(final String name) { - // todo -- remove me after switch to RodBinding syntax - return name.toLowerCase(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java deleted file mode 100644 index e0b5dd4cb..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java +++ /dev/null @@ -1,107 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, 
and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.refdata.tracks; - -import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; -import org.apache.log4j.Logger; -import org.broad.tribble.index.Index; -import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.utils.SequenceDictionaryUtils; - -import java.util.LinkedHashSet; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - -/** - * Utilities for working with Sequence Dictionaries embedded in tribble indices - * - * @author Your Name - * @since Date created - */ -public class IndexDictionaryUtils { - private final static Logger logger = Logger.getLogger(IndexDictionaryUtils.class); - - // a constant we use for marking sequence dictionary entries in the Tribble index property list - public static final String SequenceDictionaryPropertyPredicate = "DICT:"; - - /** - * get the sequence dictionary from the track, if available. 
If not, make it from the contig list that is always in the index - * @param index the index file to use - * @return a SAMSequenceDictionary if available, null if unavailable - */ - public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) { - SAMSequenceDictionary dict = new SAMSequenceDictionary(); - for (Map.Entry entry : index.getProperties().entrySet()) { - if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) - dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()), - Integer.valueOf(entry.getValue()))); - } - return dict; - } - - /** - * create the sequence dictionary with the contig list; a backup approach - * @param index the index file to use - * @param dict the sequence dictionary to add contigs to - * @return the filled-in sequence dictionary - */ - static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) { - LinkedHashSet seqNames = index.getSequenceNames(); - if (seqNames == null) { - return dict; - } - for (String name : seqNames) { - SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); - dict.addSequence(seq); - } - return dict; - } - - public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) { - for ( SAMSequenceRecord seq : dict.getSequences() ) { - final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName(); - final String length = String.valueOf(seq.getSequenceLength()); - index.addProperty(contig,length); - } - } - - public static void validateTrackSequenceDictionary(final String trackName, - final SAMSequenceDictionary trackDict, - final SAMSequenceDictionary referenceDict, - final ValidationExclusion.TYPE validationExclusionType ) { - // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation - if (trackDict == null || trackDict.size() == 0) - 
logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); - else { - Set trackSequences = new TreeSet(); - for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) - trackSequences.add(dictionaryEntry.getSequenceName()); - SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict, false, null); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java deleted file mode 100644 index 4c50cfaae..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java +++ /dev/null @@ -1,416 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.refdata.tracks; - -import net.sf.samtools.SAMSequenceDictionary; -import org.apache.log4j.Logger; -import org.broad.tribble.AbstractFeatureReader; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.Tribble; -import org.broad.tribble.TribbleException; -import org.broad.tribble.index.Index; -import org.broad.tribble.index.IndexFactory; -import org.broad.tribble.util.LittleEndianOutputStream; -import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; -import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.file.FSLockWithShared; -import org.broadinstitute.sting.utils.instrumentation.Sizeof; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; - - -/** - * - * @author aaron - * ` - * Class RMDTrackBuilder - * - * This class keeps track of the available codecs, and knows how to put together a track of - * that gets iterators from the FeatureReader using Tribble. - * - */ -public class RMDTrackBuilder { // extends PluginManager { - /** - * our log, which we use to capture anything from this class - */ - private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); - - // private sequence dictionary we use to set our tracks with - private final SAMSequenceDictionary dict; - - /** - * Private genome loc parser to use when building out new locs. - */ - private final GenomeLocParser genomeLocParser; - - /** - * Validation exclusions, for validating the sequence dictionary. 
- */ - private ValidationExclusion.TYPE validationExclusionType; - - private final FeatureManager featureManager; - - // If true, do not attempt to create index files if they don't exist or are outdated, and don't - // make any file lock acquisition calls on the index files. - private final boolean disableAutoIndexCreation; - - /** - * Construct an RMDTrackerBuilder, allowing the user to define tracks to build after-the-fact. This is generally - * used when walkers want to directly manage the ROD system for whatever reason. Before using this constructor, - * please talk through your approach with the SE team. - * @param dict Sequence dictionary to use. - * @param genomeLocParser Location parser to use. - * @param validationExclusionType Types of validations to exclude, for sequence dictionary verification. - * @param disableAutoIndexCreation Do not auto-create index files, and do not use file locking when accessing index files. - * UNSAFE in general (because it causes us not to lock index files before reading them) -- - * suitable only for test suite use. - */ - public RMDTrackBuilder(final SAMSequenceDictionary dict, - final GenomeLocParser genomeLocParser, - final ValidationExclusion.TYPE validationExclusionType, - final boolean disableAutoIndexCreation) { - this.dict = dict; - this.validationExclusionType = validationExclusionType; - this.genomeLocParser = genomeLocParser; - this.featureManager = new FeatureManager(GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType)); - this.disableAutoIndexCreation = disableAutoIndexCreation; - } - - /** - * Return the feature manager this RMDTrackBuilder is using the create tribble tracks - * - * @return - */ - public FeatureManager getFeatureManager() { - return featureManager; - } - - /** - * create a RMDTrack of the specified type - * - * @param fileDescriptor a description of the type of track to build. 
- * - * @return an instance of the track - */ - public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) { - String name = fileDescriptor.getName(); - File inputFile = new File(fileDescriptor.getFile()); - - FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByTriplet(fileDescriptor); - if (descriptor == null) - throw new UserException.BadArgumentValue("-B",fileDescriptor.getType()); - - // return a feature reader track - Pair pair; - if (inputFile.getAbsolutePath().endsWith(".gz")) - pair = createTabixIndexedFeatureSource(descriptor, name, inputFile); - else - pair = getFeatureSource(descriptor, name, inputFile, fileDescriptor.getStorageType()); - if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file"); - return new RMDTrack(descriptor.getCodecClass(), name, inputFile, pair.first, pair.second, genomeLocParser, createCodec(descriptor, name)); - } - - /** - * Convenience method simplifying track creation. Assume unnamed track based on a file rather than a stream. - * @param codecClass Type of Tribble codec class to build. - * @param inputFile Input file type to use. - * @return An RMDTrack, suitable for accessing reference metadata. - */ - public RMDTrack createInstanceOfTrack(Class codecClass, File inputFile) { - final FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByCodec(codecClass); - - if (descriptor == null) - throw new ReviewedStingException("Unable to find type name for codec class " + codecClass.getName()); - - return createInstanceOfTrack(new RMDTriplet("anonymous",descriptor.getName(),inputFile.getAbsolutePath(),RMDStorageType.FILE,new Tags())); - } - - /** - * create a feature reader, without assuming there exists an index. This code assumes the feature - * reader of the appropriate type will figure out what the right index type is, and determine if it - * exists. 
- * - * @param descriptor the FeatureDescriptor describing the FeatureCodec we want to create - * @param name the name of the track - * @param inputFile the file to load - * @return a feature reader implementation - */ - private Pair createTabixIndexedFeatureSource(FeatureManager.FeatureDescriptor descriptor, String name, File inputFile) { - // we might not know the index type, try loading with the default reader constructor - logger.info("Attempting to blindly load " + inputFile + " as a tabix indexed file"); - try { - return new Pair(AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name)),null); - } catch (TribbleException e) { - throw new UserException(e.getMessage(), e); - } - } - - /** - * add a name to the codec, if it takes one - * @param descriptor the class to create a codec for - * @param name the name to assign this codec - * @return the feature codec itself - */ - private FeatureCodec createCodec(FeatureManager.FeatureDescriptor descriptor, String name) { - return featureManager.createCodec(descriptor, name, genomeLocParser); - } - - /** - * create a feature source object given: - * @param descriptor the FeatureDescriptor describing the FeatureCodec we want to create - * @param name the name of the codec - * @param inputFile the tribble file to parse - * @param storageType How the RMD is streamed into the input file. - * @return the input file as a FeatureReader - */ - private Pair getFeatureSource(FeatureManager.FeatureDescriptor descriptor, - String name, - File inputFile, - RMDStorageType storageType) { - // Feature source and sequence dictionary to use as the ultimate reference - AbstractFeatureReader featureSource = null; - SAMSequenceDictionary sequenceDictionary = null; - - // Detect whether or not this source should be indexed. 
- boolean canBeIndexed = (storageType == RMDStorageType.FILE); - - if(canBeIndexed) { - try { - Index index = loadIndex(inputFile, createCodec(descriptor, name)); - try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); } - catch (ReviewedStingException e) { } - - sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); - - // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match - if (sequenceDictionary.size() == 0 && dict != null) { - validateAndUpdateIndexSequenceDictionary(inputFile, index, dict); - - if ( ! disableAutoIndexCreation ) { - File indexFile = Tribble.indexFile(inputFile); - try { // re-write the index - writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); - } catch (IOException e) { - logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not affect your run of the GATK"); - } - } - - sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); - } - - featureSource = AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name), index); - } - catch (TribbleException e) { - throw new UserException(e.getMessage()); - } - catch (IOException e) { - throw new UserException("I/O error loading or writing tribble index file for " + inputFile.getAbsolutePath(), e); - } - } - else { - featureSource = AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name), false); - } - - return new Pair(featureSource,sequenceDictionary); - } - - /** - * create an index for the input file - * @param inputFile the input file - * @param codec the codec to use - * @return a linear index for the specified type - * @throws IOException if we cannot write the index file - */ - public synchronized Index loadIndex( final File inputFile, final FeatureCodec 
codec) throws IOException { - final File indexFile = Tribble.indexFile(inputFile); - final FSLockWithShared lock = new FSLockWithShared(indexFile); - Index idx = null; - - // If the index file exists and is readable, attempt to load it from disk. We'll get null back - // if a problem was discovered with the index file when it was inspected, and we'll get an - // in-memory index back in the case where the index file could not be locked. - if (indexFile.canRead()) { - idx = disableAutoIndexCreation ? loadFromDisk(inputFile, indexFile) // load without locking if we're in disableAutoIndexCreation mode - : attemptToLockAndLoadIndexFromDisk(inputFile, codec, indexFile, lock); - } - - // If we have an index, it means we either loaded it from disk without issue or we created an in-memory - // index due to not being able to acquire a lock. - if (idx != null) return idx; - - // We couldn't read the file, or we discovered a problem with the index file, so continue on to making a new index - idx = createIndexInMemory(inputFile, codec); - if ( ! disableAutoIndexCreation ) { - writeIndexToDisk(idx, indexFile, lock); - } - return idx; - } - - /** - * Attempt to acquire a shared lock and then load the index from disk. Returns an in-memory index if - * a lock could not be obtained. Returns null if a problem was discovered with the index file when it - * was examined (eg., it was out-of-date). - * - * @param inputFile the input file - * @param codec the codec to read from - * @param indexFile the index file itself - * @param lock the lock file - * @return an index, or null if we couldn't load one - * @throws IOException if we fail for FS issues - */ - protected Index attemptToLockAndLoadIndexFromDisk( final File inputFile, final FeatureCodec codec, final File indexFile, final FSLockWithShared lock ) throws IOException { - boolean locked = false; - Index idx = null; - - try { - locked = lock.sharedLock(); - - if ( ! 
locked ) { // can't lock file - logger.info(String.format("Could not acquire a shared lock on index file %s, falling back to using an in-memory index for this GATK run.", - indexFile.getAbsolutePath())); - idx = createIndexInMemory(inputFile, codec); - } - else { - idx = loadFromDisk(inputFile, indexFile); - } - } finally { - if (locked) lock.unlock(); - } - return idx; - } - - /** - * load the index from disk, checking for out of date indexes and old versions (both of which are deleted) - * @param inputFile the input file - * @param indexFile the input file, plus the index extension - * @return an Index, or null if we're unable to load - */ - protected Index loadFromDisk( final File inputFile, final File indexFile ) { - logger.info("Loading Tribble index from disk for file " + inputFile); - Index index = IndexFactory.loadIndex(indexFile.getAbsolutePath()); - - // check if the file is up-to date (filestamp and version check) - if (index.isCurrentVersion() && indexFile.lastModified() >= inputFile.lastModified()) - return index; - else if (indexFile.lastModified() < inputFile.lastModified()) - logger.warn("Index file " + indexFile + " is out of date (index older than input file), " + - (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file")); - else // we've loaded an old version of the index, we want to remove it <-- currently not used, but may re-enable - logger.warn("Index file " + indexFile + " is out of date (old version), " + - (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file")); - - if ( ! 
disableAutoIndexCreation ) { - boolean deleted = indexFile.delete(); - if (!deleted) logger.warn("Index file " + indexFile + " is out of date, but could not be removed; it will not be trusted (we'll try to rebuild an in-memory copy)"); - } - - return null; - } - - - /** - * attempt to write the index to disk - * @param index the index to write to disk - * @param indexFile the index file location - * @param lock the locking object - * @throws IOException when unable to create the new index - */ - private void writeIndexToDisk( final Index index, final File indexFile, final FSLockWithShared lock ) throws IOException { - if ( disableAutoIndexCreation ) { - return; - } - - boolean locked = false; - - try { - locked = lock.exclusiveLock(); - - if (locked) { - logger.info("Writing Tribble index to disk for file " + indexFile); - LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); - index.write(stream); - stream.close(); - } - else // we can't write it to disk, just store it in memory, tell them this - logger.warn("Unable to write to " + indexFile + " for the index file, creating index in memory only"); - - try { logger.info(String.format(" Index for %s has size in bytes %d", indexFile, Sizeof.getObjectGraphSize(index))); } - catch ( ReviewedStingException e) { } - } - finally { - if (locked) lock.unlock(); - } - - } - - /** - * create the index in memory, given the input file and feature codec - * @param inputFile the input file - * @param codec the codec - * @return a LinearIndex, given the file location - * @throws IOException when unable to create the index in memory - */ - protected Index createIndexInMemory(File inputFile, FeatureCodec codec) { - // this can take a while, let them know what we're doing - logger.info("Creating Tribble index in memory for file " + inputFile); - Index idx = IndexFactory.createDynamicIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - 
validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict); - return idx; - } - - /** - * set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible. - * (that each contig in the index is in the sequence dictionary). - * @param inputFile for proper error message formatting. - * @param dict the sequence dictionary - * @param index the index file - */ - public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) { - if (dict == null) throw new ReviewedStingException("BUG: dict cannot be null"); - - // check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set - final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); - validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict); - - // actually update the dictionary in the index - IndexDictionaryUtils.setIndexSequenceDictionary(index, dict); - } - - public void validateTrackSequenceDictionary(final String trackName, - final SAMSequenceDictionary trackDict, - final SAMSequenceDictionary referenceDict ) { - IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java deleted file mode 100644 index e30965925..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java +++ /dev/null @@ -1,397 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including 
without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.walkers.coverage; - -import org.broadinstitute.sting.commandline.Advanced; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.By; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.pileup.PileupElement; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; - - -/** - * Emits a data file containing information about callable, uncallable, poorly mapped, and 
other parts of the genome - *

- *

- * A very common question about a NGS set of reads is what areas of the genome are considered callable. The system - * considers the coverage at each locus and emits either a per base state or a summary interval BED file that - * partitions the genomic intervals into the following callable states: - *

- *
REF_N
- *
the reference base was an N, which is not considered callable the GATK
- *
PASS
- *
the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
- *
NO_COVERAGE
- *
absolutely no reads were seen at this locus, regardless of the filtering parameters
- *
LOW_COVERAGE
- *
there were less than min. depth bases at the locus, after applying filters
- *
EXCESSIVE_COVERAGE
- *
more than -maxDepth read at the locus, indicating some sort of mapping problem
- *
POOR_MAPPING_QUALITY
- *
more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
- *
- *

- *

- *

Input

- *

- * A BAM file containing exactly one sample. - *

- *

- *

Output

- *

- *

    - *
  • -o: a OutputFormatted (recommended BED) file with the callable status covering each base
  • - *
  • -summary: a table of callable status x count of all examined bases
  • - *
- *

- *

- *

Examples

- *
- *  java -jar GenomeAnalysisTK.jar \
- *     -T CallableLoci \
- *     -I my.bam \
- *     -summary my.summary \
- *     -o my.bed
- * 
- *

- * would produce a BED file (my.bed) that looks like: - *

- *

- *     20 10000000 10000864 PASS
- *     20 10000865 10000985 POOR_MAPPING_QUALITY
- *     20 10000986 10001138 PASS
- *     20 10001139 10001254 POOR_MAPPING_QUALITY
- *     20 10001255 10012255 PASS
- *     20 10012256 10012259 POOR_MAPPING_QUALITY
- *     20 10012260 10012263 PASS
- *     20 10012264 10012328 POOR_MAPPING_QUALITY
- *     20 10012329 10012550 PASS
- *     20 10012551 10012551 LOW_COVERAGE
- *     20 10012552 10012554 PASS
- *     20 10012555 10012557 LOW_COVERAGE
- *     20 10012558 10012558 PASS
- *     et cetera...
- * 
- * as well as a summary table that looks like: - *

- *

- *                        state nBases
- *                        REF_N 0
- *                     PASS 996046
- *                  NO_COVERAGE 121
- *                 LOW_COVERAGE 928
- *           EXCESSIVE_COVERAGE 0
- *         POOR_MAPPING_QUALITY 2906
- * 
- * - * @author Mark DePristo - * @since May 7, 2010 - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -@By(DataSource.REFERENCE) -public class CallableLoci extends LocusWalker { - @Output - PrintStream out; - - /** - * Callable loci summary counts (see outputs) will be written to this file. - */ - @Output(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true) - File summaryFile; - - /** - * The gap between this value and mmq are reads that are not sufficiently well mapped for calling but - * aren't indicative of mapping problems. For example, if maxLowMAPQ = 1 and mmq = 20, then reads with - * MAPQ == 0 are poorly mapped, MAPQ >= 20 are considered as contributing to calling, where - * reads with MAPQ >= 1 and < 20 are not bad in and of themselves but aren't sufficiently good to contribute to - * calling. In effect this reads are invisible, driving the base to the NO_ or LOW_COVERAGE states - */ - @Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read.", required = false) - byte maxLowMAPQ = 1; - - /** - * Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the PASS - * state. - */ - @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false) - byte minMappingQuality = 10; - - /** - * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the PASS state - */ - @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false) - byte minBaseQuality = 20; - - /** - * If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this - * value and is less than maxDepth the site is considered PASS. 
- */ - @Advanced - @Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false) - int minDepth = 4; - - /** - * If the QC+ depth exceeds this value the site is considered to have EXCESSIVE_DEPTH - */ - @Argument(fullName = "maxDepth", shortName = "maxDepth", doc = "Maximum read depth before a locus is considered poorly mapped", required = false) - int maxDepth = -1; - - /** - * We don't want to consider a site as POOR_MAPPING_QUALITY just because it has two reads, and one is MAPQ. We - * won't assign a site to the POOR_MAPPING_QUALITY state unless there are at least minDepthForLowMAPQ reads - * covering the site. - */ - @Advanced - @Argument(fullName = "minDepthForLowMAPQ", shortName = "mdflmq", doc = "Minimum read depth before a locus is considered a potential candidate for poorly mapped", required = false) - int minDepthLowMAPQ = 10; - - /** - * If the number of reads at this site is greater than minDepthForLowMAPQ and the fraction of reads with low mapping quality - * exceeds this fraction then the site has POOR_MAPPING_QUALITY. - */ - @Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "If the fraction of reads at a base with low mapping quality exceeds this value, the site may be poorly mapped", required = false) - double maxLowMAPQFraction = 0.1; - - /** - * The output of this walker will be written in this format. The recommended option is BED. - */ - @Advanced - @Argument(fullName = "format", shortName = "format", doc = "Output format", required = false) - OutputFormat outputFormat = OutputFormat.BED; - - public enum OutputFormat { - /** - * The output will be written as a BED file. There's a BED element for each - * continuous run of callable states (i.e., PASS, REF_N, etc). This is the recommended - * format - */ - BED, - - /** - * Emit chr start stop state quads for each base. Produces a potentially disasterously - * large amount of output. 
- */ - STATE_PER_BASE - } - - public enum CalledState { - /** - * the reference base was an N, which is not considered callable the GATK - */ - REF_N, - /** - * the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE - */ - CALLABLE, - /** - * absolutely no reads were seen at this locus, regardless of the filtering parameters - */ - NO_COVERAGE, - /** - * there were less than min. depth bases at the locus, after applying filters - */ - LOW_COVERAGE, - /** - * more than -maxDepth read at the locus, indicating some sort of mapping problem - */ - EXCESSIVE_COVERAGE, - /** - * more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads - */ - POOR_MAPPING_QUALITY - } - - //////////////////////////////////////////////////////////////////////////////////// - // STANDARD WALKER METHODS - //////////////////////////////////////////////////////////////////////////////////// - - @Override - public boolean includeReadsWithDeletionAtLoci() { - return true; - } - - @Override - public void initialize() { - if (getSampleDB().getSamples().size() != 1) { - throw new UserException.BadArgumentValue("-I", "CallableLoci only works for a single sample, but multiple samples were found in the provided BAM files: " + getSampleDB().getSamples()); - } - - try { - PrintStream summaryOut = new PrintStream(summaryFile); - summaryOut.close(); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(summaryFile, e); - } - } - - protected static class Integrator { - final long counts[] = new long[CalledState.values().length]; - CallableBaseState state = null; - } - - protected static class CallableBaseState implements HasGenomeLocation { - final public GenomeLocParser genomeLocParser; - public GenomeLoc loc; - final public CalledState state; - - public CallableBaseState(GenomeLocParser genomeLocParser, GenomeLoc loc, CalledState state) { - this.genomeLocParser = 
genomeLocParser; - this.loc = loc; - this.state = state; - } - - public GenomeLoc getLocation() { - return loc; - } - - public CalledState getState() { - return state; - } - - // update routines - public boolean changingState(CalledState newState) { - return state != newState; - } - - /** - * Updating the location of this CalledBaseState by the new stop location - * - * @param newStop - */ - public void update(GenomeLoc newStop) { - loc = genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart(), newStop.getStop()); - } - - public String toString() { - return String.format("%s\t%d\t%d\t%s", loc.getContig(), loc.getStart()-1, loc.getStop(), state); - } - } - - @Override - public CallableBaseState map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - CalledState state; - - if ( BaseUtils.isNBase(ref.getBase())) { - state = CalledState.REF_N; - } else { - // count up the depths of all and QC+ bases - int rawDepth = 0, QCDepth = 0, lowMAPQDepth = 0; - for (PileupElement e : context.getBasePileup()) { - final int depth = e.getRepresentativeCount(); - rawDepth += depth; - - if (e.getMappingQual() <= maxLowMAPQ) - lowMAPQDepth += depth; - - if (e.getMappingQual() >= minMappingQuality && (e.getQual() >= minBaseQuality || e.isDeletion())) { - QCDepth += depth; - } - } - - //System.out.printf("%s rawdepth = %d QCDepth = %d lowMAPQ = %d%n", context.getLocation(), rawDepth, QCDepth, lowMAPQDepth); - if (rawDepth == 0) { - state = CalledState.NO_COVERAGE; - } else if (rawDepth >= minDepthLowMAPQ && MathUtils.ratio(lowMAPQDepth, rawDepth) >= maxLowMAPQFraction) { - state = CalledState.POOR_MAPPING_QUALITY; - } else if (QCDepth < minDepth) { - state = CalledState.LOW_COVERAGE; - } else if (rawDepth >= maxDepth && maxDepth != -1) { - state = CalledState.EXCESSIVE_COVERAGE; - } else { - state = CalledState.CALLABLE; - } - } - - return new CallableBaseState(getToolkit().getGenomeLocParser(), context.getLocation(), state); - } - - @Override - 
public Integrator reduceInit() { - return new Integrator(); - } - - @Override - public Integrator reduce(CallableBaseState state, Integrator integrator) { - // update counts - integrator.counts[state.getState().ordinal()]++; - - if (outputFormat == OutputFormat.STATE_PER_BASE) { - out.println(state.toString()); - } - - // format is integrating - if (integrator.state == null) - integrator.state = state; - else if (state.getLocation().getStart() != integrator.state.getLocation().getStop() + 1 || - integrator.state.changingState(state.getState())) { - out.println(integrator.state.toString()); - integrator.state = state; - } else { - integrator.state.update(state.getLocation()); - } - - return integrator; - } - - - //////////////////////////////////////////////////////////////////////////////////// - // INTERVAL ON TRAVERSAL DONE - //////////////////////////////////////////////////////////////////////////////////// - - @Override - public void onTraversalDone(Integrator result) { - // print out the last state - if (result != null) { - if (outputFormat == OutputFormat.BED) // get the last interval - out.println(result.state.toString()); - - try { - PrintStream summaryOut = new PrintStream(summaryFile); - summaryOut.printf("%30s %s%n", "state", "nBases"); - for (CalledState state : CalledState.values()) { - summaryOut.printf("%30s %d%n", state, result.counts[state.ordinal()]); - } - summaryOut.close(); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(summaryFile, e); - } - } - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java deleted file mode 100644 index f0d6f7301..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java +++ /dev/null @@ -1,241 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby 
granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.walkers.coverage; - -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.fragments.FragmentCollection; -import org.broadinstitute.sting.utils.pileup.PileupElement; - -import java.util.*; - -/** - * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl - * - * @Author chartl - * @Date Mar 3, 2010 - */ -public class CoverageUtils { - - public enum CountPileupType { - /** - * Count all reads independently (even if from the same fragment). - */ - COUNT_READS, - /** - * Count all fragments (even if the reads that compose the fragment are not consistent at that base). 
- */ - COUNT_FRAGMENTS, - /** - * Count all fragments (but only if the reads that compose the fragment are consistent at that base). - */ - COUNT_FRAGMENTS_REQUIRE_SAME_BASE - } - - /** - * Returns the counts of bases from reads with MAPQ > minMapQ and base quality > minBaseQ in the context - * as an array of ints, indexed by the index fields of BaseUtils - * - * @param context - * @param minMapQ - * @param minBaseQ - * @return - */ - public static int[] getBaseCounts(AlignmentContext context, int minMapQ, int minBaseQ) { - int[] counts = new int[6]; - - for (PileupElement e : context.getBasePileup()) { - if ( e.getMappingQual() >= minMapQ && ( e.getQual() >= minBaseQ || e.isDeletion() ) ) { - updateCounts(counts,e); - } - } - - return counts; - } - - public static String getTypeID( SAMReadGroupRecord r, DoCOutputType.Partition type ) { - if ( type == DoCOutputType.Partition.sample ) { - return r.getSample(); - } else if ( type == DoCOutputType.Partition.readgroup ) { - return String.format("%s_rg_%s",r.getSample(),r.getReadGroupId()); - } else if ( type == DoCOutputType.Partition.library ) { - return r.getLibrary(); - } else if ( type == DoCOutputType.Partition.center ) { - return r.getSequencingCenter(); - } else if ( type == DoCOutputType.Partition.platform ) { - return r.getPlatform(); - } else if ( type == DoCOutputType.Partition.sample_by_center ) { - return String.format("%s_cn_%s",r.getSample(),r.getSequencingCenter()); - } else if ( type == DoCOutputType.Partition.sample_by_platform) { - return String.format("%s_pl_%s",r.getSample(),r.getPlatform()); - } else if ( type == DoCOutputType.Partition.sample_by_platform_by_center ) { - return String.format("%s_pl_%s_cn_%s",r.getSample(),r.getPlatform(),r.getSequencingCenter()); - } else { - throw new ReviewedStingException("Invalid type ID sent to getTypeID. 
This is a BUG!"); - } - } - - public static Map> - getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType, Collection types) { - - Map> countsByIDByType = new HashMap>(); - Map countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ,countType); - for (DoCOutputType.Partition t : types ) { - // iterate through the read group counts and build the type associations - for ( Map.Entry readGroupCountEntry : countsByRG.entrySet() ) { - String typeID = getTypeID(readGroupCountEntry.getKey(),t); - - if ( ! countsByIDByType.keySet().contains(t) ) { - countsByIDByType.put(t,new HashMap()); - } - - if ( ! countsByIDByType.get(t).keySet().contains(typeID) ) { - countsByIDByType.get(t).put(typeID,readGroupCountEntry.getValue().clone()); - } else { - addCounts(countsByIDByType.get(t).get(typeID),readGroupCountEntry.getValue()); - } - } - } - - - return countsByIDByType; - } - - public static void addCounts(int[] updateMe, int[] leaveMeAlone ) { - for ( int index = 0; index < leaveMeAlone.length; index++ ) { - updateMe[index] += leaveMeAlone[index]; - } - } - - public static Map getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType) { - Map countsByRG = new HashMap(); - - List countPileup = new LinkedList(); - FragmentCollection fpile; - - switch (countType) { - - case COUNT_READS: - for (PileupElement e : context.getBasePileup()) - if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) - countPileup.add(e); - break; - - case COUNT_FRAGMENTS: // ignore base identities and put in FIRST base that passes filters: - fpile = context.getBasePileup().getStartSortedPileup().toFragments(); - - for (PileupElement e : fpile.getSingletonReads()) - if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) - countPileup.add(e); - - for (List overlappingPair : fpile.getOverlappingPairs()) { - // iterate over all 
elements in fragment: - for (PileupElement e : overlappingPair) { - if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) { - countPileup.add(e); // add the first passing element per fragment - break; - } - } - } - break; - - case COUNT_FRAGMENTS_REQUIRE_SAME_BASE: - fpile = context.getBasePileup().getStartSortedPileup().toFragments(); - - for (PileupElement e : fpile.getSingletonReads()) - if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) - countPileup.add(e); - - for (List overlappingPair : fpile.getOverlappingPairs()) { - PileupElement firstElem = null; - PileupElement addElem = null; - - // iterate over all elements in fragment: - for (PileupElement e : overlappingPair) { - if (firstElem == null) - firstElem = e; - else if (e.getBase() != firstElem.getBase()) { - addElem = null; - break; - } - - // will add the first passing element per base-consistent fragment: - if (addElem == null && countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) - addElem = e; - } - - if (addElem != null) - countPileup.add(addElem); - } - break; - - default: - throw new UserException("Must use valid CountPileupType"); - } - - for (PileupElement e : countPileup) { - SAMReadGroupRecord readGroup = getReadGroup(e.getRead()); - if (!countsByRG.keySet().contains(readGroup)) - countsByRG.put(readGroup, new int[6]); - - updateCounts(countsByRG.get(readGroup), e); - } - - return countsByRG; - } - - private static boolean countElement(PileupElement e, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) { - return (e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() )); - } - - private static void updateCounts(int[] counts, PileupElement e) { - if ( e.isDeletion() ) { - counts[BaseUtils.Base.D.ordinal()] += e.getRepresentativeCount(); - } else if ( BaseUtils.basesAreEqual(BaseUtils.Base.N.base, e.getBase()) ) { - counts[BaseUtils.Base.N.ordinal()] += e.getRepresentativeCount(); - } else { - 
try { - counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())] += e.getRepresentativeCount(); - } catch (ArrayIndexOutOfBoundsException exc) { - throw new ReviewedStingException("Expected a simple base, but actually received"+(char)e.getBase()); - } - } - } - - private static SAMReadGroupRecord getReadGroup(SAMRecord r) { - SAMReadGroupRecord rg = r.getReadGroup(); - if ( rg == null ) { - String msg = "Read "+r.getReadName()+" lacks read group information; Please associate all reads with read groups"; - throw new UserException.MalformedBAM(r, msg); - } - - return rg; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java deleted file mode 100644 index ca3255097..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ /dev/null @@ -1,1094 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.walkers.coverage; - -import net.sf.samtools.SAMReadGroupRecord; -import org.broadinstitute.sting.commandline.Advanced; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec; -import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; - -import java.io.File; -import java.io.PrintStream; -import 
java.util.*; - -/** - * Toolbox for assessing sequence coverage by a wide array of metrics, partitioned by sample, read group, or library - * - *

- * Coverage processes a set of bam files to determine coverage at different levels of partitioning and - * aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by - * sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles, - * and/or percentage of bases covered to or beyond a threshold. - * Additionally, reads and bases can be filtered by mapping or base quality score. - * - *

Input

- *

- * One or more bam files (with proper headers) to be analyzed for coverage statistics - *

- *

- *(Optional) A REFSEQ Rod to aggregate coverage to the gene level - *

- * (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation) - *

- *

Output

- *

- * Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: - *

- * - no suffix: per locus coverage - *

- * - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases - *

- * - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases - *

- * - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval - *

- * - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples - *

- * - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene - *

- * - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples - *

- * - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases - *

- * - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T DepthOfCoverage \
- *   -o file_name_base \
- *   -I input_bams.list
- *   [-geneList refSeq.sorted.txt] \
- *   [-pt readgroup] \
- *   [-ct 4 -ct 6 -ct 10] \
- *   [-L my_capture_genes.interval_list]
- * 
- * - */ -// todo -- cache the map from sample names to means in the print functions, rather than regenerating each time -// todo -- support for granular histograms for total depth; maybe n*[start,stop], bins*sqrt(n) -// todo -- alter logarithmic scaling to spread out bins more -// todo -- allow for user to set linear binning (default is logarithmic) -// todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -@By(DataSource.REFERENCE) -@PartitionBy(PartitionType.NONE) -@Downsample(by= DownsampleType.NONE, toCoverage=Integer.MAX_VALUE) -public class DepthOfCoverage extends LocusWalker>, CoveragePartitioner> implements TreeReducible { - @Output - @Multiplex(value=DoCOutputMultiplexer.class,arguments={"partitionTypes","refSeqGeneList","omitDepthOutput","omitIntervals","omitSampleSummary","omitLocusTable"}) - Map out; - - @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to -1.", required = false) - int minMappingQuality = -1; - @Argument(fullName = "maxMappingQuality", doc = "Maximum mapping quality of reads to count towards depth. Defaults to 2^31-1 (Integer.MAX_VALUE).", required = false) - int maxMappingQuality = Integer.MAX_VALUE; - - @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to -1.", required = false) - byte minBaseQuality = -1; - @Argument(fullName = "maxBaseQuality", doc = "Maximum quality of bases to count towards depth. 
Defaults to 127 (Byte.MAX_VALUE).", required = false) - byte maxBaseQuality = Byte.MAX_VALUE; - - @Argument(fullName = "countType", doc = "How should overlapping reads from the same fragment be handled?", required = false) - CoverageUtils.CountPileupType countType = CoverageUtils.CountPileupType.COUNT_READS; - - /** - * Instead of reporting depth, report the base pileup at each locus - */ - @Argument(fullName = "printBaseCounts", shortName = "baseCounts", doc = "Will add base counts to per-locus output.", required = false) - boolean printBaseCounts = false; - - /** - * Do not tabulate locus statistics (# loci covered by sample by coverage) - */ - @Argument(fullName = "omitLocusTable", shortName = "omitLocusTable", doc = "Will not calculate the per-sample per-depth counts of loci, which should result in speedup", required = false) - boolean omitLocusTable = false; - - /** - * Do not tabulate interval statistics (mean, median, quartiles AND # intervals by sample by coverage) - */ - @Argument(fullName = "omitIntervalStatistics", shortName = "omitIntervals", doc = "Will omit the per-interval statistics section, which should result in speedup", required = false) - boolean omitIntervals = false; - /** - * Do not print the total coverage at every base - */ - @Argument(fullName = "omitDepthOutputAtEachBase", shortName = "omitBaseOutput", doc = "Will omit the output of the depth of coverage at each base, which should result in speedup", required = false) - boolean omitDepthOutput = false; - - /** - * Path to the RefSeq file for use in aggregating coverage statistics over genes - */ - @Argument(fullName = "calculateCoverageOverGenes", shortName = "geneList", doc = "Calculate the coverage statistics over this list of genes. Currently accepts RefSeq.", required = false) - File refSeqGeneList = null; - - /** - * The format of the output file - */ - @Argument(fullName = "outputFormat", doc = "the format of the output file (e.g. 
csv, table, rtable); defaults to r-readable table", required = false) - String outputFormat = "rtable"; - - - // --------------------------------------------------------------------------- - // - // Advanced arguments - // - // --------------------------------------------------------------------------- - @Advanced - @Argument(fullName = "includeRefNSites", doc = "If provided, sites with reference N bases but with coverage from neighboring reads will be included in DoC calculations.", required = false) - boolean includeRefNBases = false; - - @Advanced - @Argument(fullName = "printBinEndpointsAndExit", doc = "Prints the bin values and exits immediately. Use to calibrate what bins you want before running on data.", required = false) - boolean printBinEndpointsAndExit = false; - - /** - * Sets the low-coverage cutoff for granular binning. All loci with depth < START are counted in the first bin. - */ - @Advanced - @Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false) - int start = 1; - /** - * Sets the high-coverage cutoff for granular binning. All loci with depth > END are counted in the last bin. - */ - @Advanced - @Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false) - int stop = 500; - /** - * Sets the number of bins for granular binning - */ - @Advanced - @Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false) - int nBins = 499; - - /** - * Do not tabulate the sample summary statistics (total, mean, median, quartile coverage per sample) - */ - @Argument(fullName = "omitPerSampleStats", shortName = "omitSampleSummary", doc = "Omits the summary files per-sample. These statistics are still calculated, so this argument will not improve runtime.", required = false) - boolean omitSampleSummary = false; - /** - * A way of partitioning reads into groups. Can be sample, readgroup, or library. 
- */ - @Argument(fullName = "partitionType", shortName = "pt", doc = "Partition type for depth of coverage. Defaults to sample. Can be any combination of sample, readgroup, library.", required = false) - Set partitionTypes = EnumSet.of(DoCOutputType.Partition.sample); - - /** - * Consider a spanning deletion as contributing to coverage. Also enables deletion counts in per-base output. - */ - @Advanced - @Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false) - boolean includeDeletions = false; - - @Advanced - @Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false) - boolean ignoreDeletionSites = false; - - /** - * A coverage threshold for summarizing (e.g. % bases >= CT for each sample) - */ - @Advanced - @Argument(fullName = "summaryCoverageThreshold", shortName = "ct", doc = "for summary file outputs, report the % of bases coverd to >= this number. Defaults to 15; can take multiple arguments.", required = false) - int[] coverageThresholds = {15}; - - String[] OUTPUT_FORMATS = {"table","rtable","csv"}; - String separator = "\t"; - Map> orderCheck = new HashMap>(); - - //////////////////////////////////////////////////////////////////////////////////// - // STANDARD WALKER METHODS - //////////////////////////////////////////////////////////////////////////////////// - - public boolean includeReadsWithDeletionAtLoci() { return includeDeletions && ! ignoreDeletionSites; } - - public void initialize() { - - if ( printBinEndpointsAndExit ) { - int[] endpoints = DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins); - System.out.print("[ "); - for ( int e : endpoints ) { - System.out.print(e+" "); - } - System.out.println("]"); - System.exit(0); - } - - // Check the output format - boolean goodOutputFormat = false; - for ( String f : OUTPUT_FORMATS ) { - goodOutputFormat = goodOutputFormat || f.equals(outputFormat); - } - - if ( ! 
goodOutputFormat ) { - throw new IllegalArgumentException("Improper output format. Can be one of table,rtable,csv. Was "+outputFormat); - } - - if ( outputFormat.equals("csv") ) { - separator = ","; - } - - if ( ! omitDepthOutput ) { // print header - PrintStream out = getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary); - out.printf("%s\t%s","Locus","Total_Depth"); - for (DoCOutputType.Partition type : partitionTypes ) { - out.printf("\t%s_%s","Average_Depth",type.toString()); - } - - // get all the samples - HashSet allSamples = getSamplesFromToolKit(partitionTypes); - ArrayList allSampleList = new ArrayList(allSamples.size()); - for ( String s : allSamples ) { - allSampleList.add(s); - } - Collections.sort(allSampleList); - - for ( String s : allSampleList) { - out.printf("\t%s_%s","Depth_for",s); - if ( printBaseCounts ) { - out.printf("\t%s_%s",s,"base_counts"); - } - } - - out.printf("%n"); - - } else { - logger.info("Per-Locus Depth of Coverage output was omitted"); - } - - for (DoCOutputType.Partition type : partitionTypes ) { - orderCheck.put(type,new ArrayList()); - for ( String id : getSamplesFromToolKit(type) ) { - orderCheck.get(type).add(id); - } - Collections.sort(orderCheck.get(type)); - } - } - - private HashSet getSamplesFromToolKit( Collection types ) { - HashSet partitions = new HashSet(); // since the DOCS object uses a HashMap, this will be in the same order - for (DoCOutputType.Partition t : types ) { - partitions.addAll(getSamplesFromToolKit(t)); - } - - return partitions; - } - - private HashSet getSamplesFromToolKit(DoCOutputType.Partition type) { - HashSet partition = new HashSet(); - if ( type == DoCOutputType.Partition.sample ) { - partition.addAll(SampleUtils.getSAMFileSamples(getToolkit())); - } else if ( type == DoCOutputType.Partition.readgroup ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { - partition.add(rg.getSample()+"_rg_"+rg.getReadGroupId()); - } - } 
else if ( type == DoCOutputType.Partition.library ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { - partition.add(rg.getLibrary()); - } - } else if ( type == DoCOutputType.Partition.center ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { - partition.add(rg.getSequencingCenter()); - } - } else if ( type == DoCOutputType.Partition.platform ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { - partition.add(rg.getPlatform()); - } - } else if ( type == DoCOutputType.Partition.sample_by_center ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { - partition.add(String.format("%s_cn_%s",rg.getSample(),rg.getSequencingCenter())); - } - } else if ( type == DoCOutputType.Partition.sample_by_platform ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { - partition.add(String.format("%s_pl_%s",rg.getSample(),rg.getPlatform())); - } - } else if ( type == DoCOutputType.Partition.sample_by_platform_by_center ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { - partition.add(String.format("%s_pl_%s_cn_%s",rg.getSample(),rg.getPlatform(),rg.getSequencingCenter())); - } - } else { - throw new ReviewedStingException("Invalid aggregation type sent to getSamplesFromToolKit"); - } - - return partition; - } - - public boolean isReduceByInterval() { - return ( ! 
omitIntervals ); - } - - public CoveragePartitioner reduceInit() { - CoveragePartitioner aggro = new CoveragePartitioner(partitionTypes,start,stop,nBins); - for (DoCOutputType.Partition t : partitionTypes ) { - aggro.addIdentifiers(t,getSamplesFromToolKit(t)); - } - aggro.initialize(includeDeletions,omitLocusTable); - checkOrder(aggro); - return aggro; - } - - public Map> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (includeRefNBases || BaseUtils.isRegularBase(ref.getBase())) { - if ( ! omitDepthOutput ) { - getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary).printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives) - //System.out.printf("\t[log]\t%s",ref.getLocus()); - } - - return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,maxMappingQuality,minBaseQuality,maxBaseQuality,countType,partitionTypes); - } else { - return null; - } - } - - public CoveragePartitioner reduce(Map> thisMap, CoveragePartitioner prevReduce) { - if ( thisMap != null ) { // skip sites we didn't want to include in the calculation (ref Ns) - if ( ! 
omitDepthOutput ) { - //checkOrder(prevReduce); // tests prevReduce.getIdentifiersByType().get(t) against the initialized header order - printDepths(getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary),thisMap,prevReduce.getIdentifiersByType()); - // this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without - // turning on omit - } - - prevReduce.update(thisMap); // note that in "useBoth" cases, this method alters the thisMap object - } - - return prevReduce; - } - - public CoveragePartitioner treeReduce(CoveragePartitioner left, CoveragePartitioner right) { - left.merge(right); - return left; - } - - //////////////////////////////////////////////////////////////////////////////////// - // INTERVAL ON TRAVERSAL DONE - //////////////////////////////////////////////////////////////////////////////////// - - public void onTraversalDone( List> statsByInterval ) { - if ( refSeqGeneList != null && partitionTypes.contains(DoCOutputType.Partition.sample) ) { - printGeneStats(statsByInterval); - } - - if ( statsByInterval.size() > 0 ) { - for(DoCOutputType.Partition partition: partitionTypes) { - if ( checkType(statsByInterval.get(0).getSecond().getCoverageByAggregationType(partition) ,partition) ) { - printIntervalStats(statsByInterval, - getCorrectStream(partition, DoCOutputType.Aggregation.interval, DoCOutputType.FileType.summary), - getCorrectStream(partition, DoCOutputType.Aggregation.interval, DoCOutputType.FileType.statistics), - partition); - } else { - throw new ReviewedStingException("Partition type "+partition.toString()+" had no entries. Please check that your .bam header has all appropriate partition types."); - } - } - } else { - throw new UserException.CommandLineException("Cannot reduce by interval without interval list provided. 
Please provide a -L argument."); - } - - onTraversalDone(mergeAll(statsByInterval)); - - } - - public CoveragePartitioner mergeAll(List> stats) { - CoveragePartitioner first = stats.remove(0).second; - for ( Pair iStat : stats ) { - treeReduce(first,iStat.second); - } - - return first; - } - - private DepthOfCoverageStats printIntervalStats(List> statsByInterval, PrintStream summaryOut, PrintStream statsOut, DoCOutputType.Partition type) { - Pair firstPair = statsByInterval.get(0); - CoveragePartitioner firstAggregator = firstPair.second; - DepthOfCoverageStats firstStats = firstAggregator.getCoverageByAggregationType(type); - - StringBuilder summaryHeader = new StringBuilder(); - summaryHeader.append("Target"); - summaryHeader.append(separator); - summaryHeader.append("total_coverage"); - summaryHeader.append(separator); - summaryHeader.append("average_coverage"); - - for ( String s : firstStats.getAllSamples() ) { - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_total_cvg"); - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_mean_cvg"); - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_granular_Q1"); - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_granular_median"); - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_granular_Q3"); - for ( int thresh : coverageThresholds ) { - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_%_above_"); - summaryHeader.append(thresh); - } - } - - summaryOut.printf("%s%n",summaryHeader); - - int[][] nTargetsByAvgCvgBySample = new int[firstStats.getHistograms().size()][firstStats.getEndpoints().length+1]; - - for ( Pair targetAggregator : statsByInterval ) { - - Pair targetStats = new Pair( - targetAggregator.first, targetAggregator.second.getCoverageByAggregationType(type)); - 
printTargetSummary(summaryOut,targetStats); - updateTargetTable(nTargetsByAvgCvgBySample,targetStats.second); - } - - printIntervalTable(statsOut,nTargetsByAvgCvgBySample,firstStats.getEndpoints()); - - return firstStats; - } - - private void printGeneStats(List> statsByTarget) { - logger.debug("statsByTarget size is "+Integer.toString(statsByTarget.size())); - logger.debug("Initializing refseq..."); - LocationAwareSeekableRODIterator refseqIterator = initializeRefSeq(); - logger.debug("Refseq init done."); - List> statsByGene = new ArrayList>();// maintains order - Map geneNamesToStats = new HashMap(); // allows indirect updating of objects in list - - for ( Pair targetStats : statsByTarget ) { - String gene = getGeneName(targetStats.first,refseqIterator); - if ( geneNamesToStats.keySet().contains(gene) ) { - logger.debug("Merging "+geneNamesToStats.get(gene).toString()+" and "+targetStats.second.getCoverageByAggregationType(DoCOutputType.Partition.sample).toString()); - geneNamesToStats.get(gene).merge(targetStats.second.getCoverageByAggregationType(DoCOutputType.Partition.sample)); - } else { - DepthOfCoverageStats merger = new DepthOfCoverageStats(targetStats.second.getCoverageByAggregationType(DoCOutputType.Partition.sample)); - geneNamesToStats.put(gene,merger); - statsByGene.add(new Pair(gene,merger)); - } - } - - PrintStream geneSummaryOut = getCorrectStream(DoCOutputType.Partition.sample, DoCOutputType.Aggregation.gene, DoCOutputType.FileType.summary); - StringBuilder summaryHeader = new StringBuilder(); - summaryHeader.append("Gene"); - summaryHeader.append(separator); - summaryHeader.append("total_coverage"); - summaryHeader.append(separator); - summaryHeader.append("average_coverage"); - - for ( String s : statsByTarget.get(0).second.getCoverageByAggregationType(DoCOutputType.Partition.sample).getAllSamples() ) { - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_total_cvg"); - summaryHeader.append(separator); - 
summaryHeader.append(s); - summaryHeader.append("_mean_cvg"); - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_granular_Q1"); - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_granular_median"); - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_granular_Q3"); - for ( int thresh : coverageThresholds ) { - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_%_above_"); - summaryHeader.append(thresh); - } - } - - geneSummaryOut.printf("%s%n",summaryHeader); - - for ( Pair geneStats : statsByGene ) { - printTargetSummary(geneSummaryOut,geneStats); - } - } - - //blatantly stolen from Andrew Kernytsky - private String getGeneName(GenomeLoc target, LocationAwareSeekableRODIterator refseqIterator) { - logger.debug("Examining "+target.toString()); - if (refseqIterator == null) { return "UNKNOWN"; } - - RODRecordList annotationList = refseqIterator.seekForward(target); - logger.debug("Annotation list is " + (annotationList == null ? 
"null" : annotationList.getName())); - if (annotationList == null) { return "UNKNOWN"; } - - for(GATKFeature rec : annotationList) { - if ( ((RefSeqFeature)rec.getUnderlyingObject()).overlapsExonP(target) ) { - logger.debug("We do overlap "+ rec.getUnderlyingObject().toString()); - return ((RefSeqFeature)rec.getUnderlyingObject()).getGeneName(); - } - logger.debug("No overlap"); - } - - return "UNKNOWN"; - - } - - private LocationAwareSeekableRODIterator initializeRefSeq() { - RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(), - getToolkit().getArguments().unsafe, - getToolkit().getArguments().disableAutoIndexCreationAndLockingWhenReadingRods); - RMDTrack refseq = builder.createInstanceOfTrack(RefSeqCodec.class,refSeqGeneList); - return new SeekableRODIterator(refseq.getHeader(),refseq.getSequenceDictionary(),getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(),refseq.getIterator()); - } - - private void printTargetSummary(PrintStream output, Pair intervalStats) { - DepthOfCoverageStats stats = intervalStats.second; - int[] bins = stats.getEndpoints(); - - StringBuilder targetSummary = new StringBuilder(); - targetSummary.append(intervalStats.first.toString()); - targetSummary.append(separator); - targetSummary.append(stats.getTotalCoverage()); - targetSummary.append(separator); - targetSummary.append(String.format("%.2f",stats.getTotalMeanCoverage())); - - for ( String s : stats.getAllSamples() ) { - targetSummary.append(separator); - targetSummary.append(stats.getTotals().get(s)); - targetSummary.append(separator); - targetSummary.append(String.format("%.2f", stats.getMeans().get(s))); - targetSummary.append(separator); - int median = getQuantile(stats.getHistograms().get(s),0.5); - int q1 = getQuantile(stats.getHistograms().get(s),0.25); - int q3 = 
getQuantile(stats.getHistograms().get(s),0.75); - targetSummary.append(formatBin(bins,q1)); - targetSummary.append(separator); - targetSummary.append(formatBin(bins,median)); - targetSummary.append(separator); - targetSummary.append(formatBin(bins,q3)); - for ( int thresh : coverageThresholds ) { - targetSummary.append(String.format("%s%.1f",separator,getPctBasesAbove(stats.getHistograms().get(s),stats.value2bin(thresh)))); - } - - } - - output.printf("%s%n", targetSummary); - } - - private String formatBin(int[] bins, int quartile) { - if ( quartile >= bins.length ) { - return String.format(">%d",bins[bins.length-1]); - } else if ( quartile < 0 ) { - return String.format("<%d",bins[0]); - } else { - return String.format("%d",bins[quartile]); - } - } - - private void printIntervalTable(PrintStream output, int[][] intervalTable, int[] cutoffs) { - String colHeader = outputFormat.equals("rtable") ? "" : "Number_of_sources"; - output.printf(colHeader + separator+"depth>=%d",0); - for ( int col = 0; col < intervalTable[0].length-1; col ++ ) { - output.printf(separator+"depth>=%d",cutoffs[col]); - } - - output.printf(String.format("%n")); - for ( int row = 0; row < intervalTable.length; row ++ ) { - output.printf("At_least_%d_samples",row+1); - for ( int col = 0; col < intervalTable[0].length; col++ ) { - output.printf(separator+"%d",intervalTable[row][col]); - } - output.printf(String.format("%n")); - } - } - - /* - * @updateTargetTable - * The idea is to have counts for how many *targets* have at least K samples with - * median coverage of at least X. - * To that end: - * Iterate over the samples the DOCS object, determine how many there are with - * median coverage > leftEnds[0]; how many with median coverage > leftEnds[1] - * and so on. Then this target has at least N, N-1, N-2, ... 1, 0 samples covered - * to leftEnds[0] and at least M,M-1,M-2,...1,0 samples covered to leftEnds[1] - * and so on. 
- */ - private void updateTargetTable(int[][] table, DepthOfCoverageStats stats) { - int[] cutoffs = stats.getEndpoints(); - int[] countsOfMediansAboveCutoffs = new int[cutoffs.length+1]; // 0 bin to catch everything - for ( int i = 0; i < countsOfMediansAboveCutoffs.length; i ++) { - countsOfMediansAboveCutoffs[i]=0; - } - - for ( String s : stats.getAllSamples() ) { - int medianBin = getQuantile(stats.getHistograms().get(s),0.5); - for ( int i = 0; i <= medianBin; i ++) { - countsOfMediansAboveCutoffs[i]++; - } - } - - for ( int medianBin = 0; medianBin < countsOfMediansAboveCutoffs.length; medianBin++) { - for ( ; countsOfMediansAboveCutoffs[medianBin] > 0; countsOfMediansAboveCutoffs[medianBin]-- ) { - table[countsOfMediansAboveCutoffs[medianBin]-1][medianBin]++; - // the -1 is due to counts being 1-based and offsets being 0-based - } - } - } - - //////////////////////////////////////////////////////////////////////////////////// - // FINAL ON TRAVERSAL DONE - //////////////////////////////////////////////////////////////////////////////////// - - public void onTraversalDone(CoveragePartitioner coverageProfiles) { - /////////////////// - // OPTIONAL OUTPUTS - ////////////////// - - if ( ! omitSampleSummary ) { - logger.info("Printing summary info"); - for (DoCOutputType.Partition type : partitionTypes ) { - outputSummaryFiles(coverageProfiles,type); - } - } - - if ( ! 
omitLocusTable ) { - logger.info("Printing locus summary"); - for (DoCOutputType.Partition type : partitionTypes ) { - outputLocusFiles(coverageProfiles,type); - } - } - } - - private void outputLocusFiles(CoveragePartitioner coverageProfiles, DoCOutputType.Partition type ) { - printPerLocus(getCorrectStream(type, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.coverage_counts), - getCorrectStream(type, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.coverage_proportions), - coverageProfiles.getCoverageByAggregationType(type),type); - } - - private void outputSummaryFiles(CoveragePartitioner coverageProfiles, DoCOutputType.Partition type ) { - printPerSample(getCorrectStream(type, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.statistics),coverageProfiles.getCoverageByAggregationType(type)); - printSummary(getCorrectStream(type, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.summary),coverageProfiles.getCoverageByAggregationType(type)); - } - - //////////////////////////////////////////////////////////////////////////////////// - // HELPER OUTPUT METHODS - //////////////////////////////////////////////////////////////////////////////////// - - private void printPerSample(PrintStream output,DepthOfCoverageStats stats) { - int[] leftEnds = stats.getEndpoints(); - - StringBuilder hBuilder = new StringBuilder(); - if ( ! 
outputFormat.equals("rTable")) { - hBuilder.append("Source_of_reads"); - } - hBuilder.append(separator); - hBuilder.append(String.format("from_0_to_%d)%s",leftEnds[0],separator)); - for ( int i = 1; i < leftEnds.length; i++ ) - hBuilder.append(String.format("from_%d_to_%d)%s",leftEnds[i-1],leftEnds[i],separator)); - hBuilder.append(String.format("from_%d_to_inf%n",leftEnds[leftEnds.length-1])); - output.print(hBuilder.toString()); - Map histograms = stats.getHistograms(); - - for ( Map.Entry p : histograms.entrySet() ) { - StringBuilder sBuilder = new StringBuilder(); - sBuilder.append(String.format("sample_%s",p.getKey())); - for ( long count : p.getValue() ) { - sBuilder.append(String.format("%s%d",separator,count)); - } - sBuilder.append(String.format("%n")); - output.print(sBuilder.toString()); - } - } - - private void printPerLocus(PrintStream output, PrintStream coverageOut, DepthOfCoverageStats stats, DoCOutputType.Partition partitionType) { - int[] endpoints = stats.getEndpoints(); - int samples = stats.getHistograms().size(); - - long[][] baseCoverageCumDist = stats.getLocusCounts(); - - // rows - # of samples - // columns - depth of coverage - - boolean printSampleColumnHeader = outputFormat.equals("csv") || outputFormat.equals("table"); - - StringBuilder header = new StringBuilder(); - if ( printSampleColumnHeader ) { - // mhanna 22 Aug 2010 - Deliberately force this header replacement to make sure integration tests pass. - // TODO: Update integration tests and get rid of this. - header.append(partitionType == DoCOutputType.Partition.readgroup ? 
"read_group" : partitionType.toString()); - } - header.append(String.format("%sgte_0",separator)); - for ( int d : endpoints ) { - header.append(String.format("%sgte_%d",separator,d)); - } - header.append(String.format("%n")); - - output.print(header); - coverageOut.print(header); - - for ( int row = 0; row < samples; row ++ ) { - output.printf("%s_%d","NSamples",row+1); - for ( int depthBin = 0; depthBin < baseCoverageCumDist[0].length; depthBin ++ ) { - output.printf("%s%d",separator,baseCoverageCumDist[row][depthBin]); - } - output.printf("%n"); - } - - for ( String sample : stats.getAllSamples() ) { - coverageOut.printf("%s",sample); - double[] coverageDistribution = stats.getCoverageProportions(sample); - for ( int bin = 0; bin < coverageDistribution.length; bin ++ ) { - coverageOut.printf("%s%.2f",separator,coverageDistribution[bin]); - } - coverageOut.printf("%n"); - } - } - - private PrintStream getCorrectStream(DoCOutputType.Partition partition, DoCOutputType.Aggregation aggregation, DoCOutputType.FileType fileType) { - DoCOutputType outputType = new DoCOutputType(partition,aggregation,fileType); - if(!out.containsKey(outputType)) - throw new UserException.CommandLineException(String.format("Unable to find appropriate stream for partition = %s, aggregation = %s, file type = %s",partition,aggregation,fileType)); - return out.get(outputType); - } - - private void printSummary(PrintStream output, DepthOfCoverageStats stats) { - if ( ! 
outputFormat.equals("csv") ) { - output.printf("%s\t%s\t%s\t%s\t%s\t%s","sample_id","total","mean","granular_third_quartile","granular_median","granular_first_quartile"); - } else { - output.printf("%s,%s,%s,%s,%s,%s","sample_id","total","mean","granular_third_quartile","granular_median","granular_first_quartile"); - } - - for ( int thresh : coverageThresholds ) { - output.printf("%s%s%d",separator,"%_bases_above_",thresh); - } - - output.printf("%n"); - - Map histograms = stats.getHistograms(); - Map means = stats.getMeans(); - Map totals = stats.getTotals(); - int[] leftEnds = stats.getEndpoints(); - - for ( Map.Entry p : histograms.entrySet() ) { - String s = p.getKey(); - long[] histogram = p.getValue(); - int median = getQuantile(histogram,0.5); - int q1 = getQuantile(histogram,0.25); - int q3 = getQuantile(histogram,0.75); - // if any of these are larger than the higest bin, put the median as in the largest bin - median = median == histogram.length-1 ? histogram.length-2 : median; - q1 = q1 == histogram.length-1 ? histogram.length-2 : q1; - q3 = q3 == histogram.length-1 ? histogram.length-2 : q3; - if ( ! outputFormat.equals("csv") ) { - output.printf("%s\t%d\t%.2f\t%d\t%d\t%d",s,totals.get(s),means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]); - } else { - output.printf("%s,%d,%.2f,%d,%d,%d",s,totals.get(s),means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]); - } - - for ( int thresh : coverageThresholds ) { - output.printf("%s%.1f",separator,getPctBasesAbove(histogram,stats.value2bin(thresh))); - } - - output.printf("%n"); - } - - if ( ! 
outputFormat.equals("csv") ) { - output.printf("%s\t%d\t%.2f\t%s\t%s\t%s%n","Total",stats.getTotalCoverage(),stats.getTotalMeanCoverage(),"N/A","N/A","N/A"); - } else { - output.printf("%s,%d,%.2f,%s,%s,%s%n","Total",stats.getTotalCoverage(),stats.getTotalMeanCoverage(),"N/A","N/A","N/A"); - } - } - - private int getQuantile(long[] histogram, double prop) { - int total = 0; - - for ( int i = 0; i < histogram.length; i ++ ) { - total += histogram[i]; - } - - int counts = 0; - int bin = -1; - while ( counts < prop*total ) { - counts += histogram[bin+1]; - bin++; - } - - return bin == -1 ? 0 : bin; - } - - private double getPctBasesAbove(long[] histogram, int bin) { - long below = 0l; - long above = 0l; - for ( int index = 0; index < histogram.length; index++) { - if ( index < bin ) { - below+=histogram[index]; - } else { - above+=histogram[index]; - } - } - - return 100*( (double) above )/( above + below ); - } - - private void printDepths(PrintStream stream, Map> countsBySampleByType, Map> identifiersByType) { - // get the depths per sample and build up the output string while tabulating total and average coverage - StringBuilder perSampleOutput = new StringBuilder(); - int tDepth = 0; - boolean depthCounted = false; - for (DoCOutputType.Partition type : partitionTypes ) { - Map countsByID = countsBySampleByType.get(type); - for ( String s : identifiersByType.get(type) ) { - perSampleOutput.append(separator); - long dp = (countsByID != null && countsByID.keySet().contains(s)) ? sumArray(countsByID.get(s)) : 0 ; - perSampleOutput.append(dp); - if ( printBaseCounts ) { - perSampleOutput.append(separator); - perSampleOutput.append(baseCounts(countsByID != null ? countsByID.get(s) : null )); - } - if ( ! 
depthCounted ) { - tDepth += dp; - } - } - depthCounted = true; // only sum the total depth once - } - - // remember -- genome locus was printed in map() - stream.printf("%s%d",separator,tDepth); - for (DoCOutputType.Partition type : partitionTypes ) { - stream.printf("%s%.2f",separator, ( (double) tDepth / identifiersByType.get(type).size() ) ); - } - stream.printf("%s%n",perSampleOutput); - } - - private long sumArray(int[] array) { - long i = 0; - for ( int j : array ) { - i += j; - } - return i; - } - - private String baseCounts(int[] counts) { - if ( counts == null ) { - counts = new int[6]; - } - StringBuilder s = new StringBuilder(); - int nbases = 0; - for ( byte b : BaseUtils.EXTENDED_BASES ) { - nbases++; - if ( includeDeletions || b != BaseUtils.Base.D.base ) { - s.append((char)b); - s.append(":"); - s.append(counts[BaseUtils.extendedBaseToBaseIndex(b)]); - if ( nbases < 6 ) { - s.append(" "); - } - } - } - - return s.toString(); - } - - private void checkOrder(CoveragePartitioner ag) { - // make sure the ordering stored at initialize() is propagated along reduce - for (DoCOutputType.Partition t : partitionTypes ) { - List order = orderCheck.get(t); - List namesInAg = ag.getIdentifiersByType().get(t); - - // todo -- chris check me - Set namesInDOCS = ag.getCoverageByAggregationType(t).getAllSamples(); - int index = 0; - for ( String s : namesInAg ) { - if ( ! s.equalsIgnoreCase(order.get(index)) ) { - throw new ReviewedStingException("IDs are out of order for type "+t+"! Aggregator has different ordering"); - } - index++; - } - } - } - - public boolean checkType(DepthOfCoverageStats stats, DoCOutputType.Partition type ) { - if ( stats.getHistograms().size() < 1 ) { - logger.warn("The histogram per partition type "+type.toString()+" was empty\n"+ - "Do your read groups have this type? 
(Check your .bam header)."); - return false; - } else { - return true; - } - } - -} - -class DoCOutputMultiplexer implements Multiplexer { - private final Set partitions; - private final File refSeqGeneList; - private final boolean omitDepthOutput; - private final boolean omitIntervals; - private final boolean omitSampleSummary; - private final boolean omitLocusTable; - - /** - * Create a new multiplexer type using the values of all variable fields. - * @param partitions - * @param refSeqGeneList - * @param omitDepthOutput - * @param omitIntervals - * @param omitSampleSummary - * @param omitLocusTable - */ - public DoCOutputMultiplexer(final Set partitions, - final File refSeqGeneList, - final boolean omitDepthOutput, - final boolean omitIntervals, - final boolean omitSampleSummary, - final boolean omitLocusTable) { - this.partitions = partitions; - this.refSeqGeneList = refSeqGeneList; - this.omitDepthOutput = omitDepthOutput; - this.omitIntervals = omitIntervals; - this.omitSampleSummary = omitSampleSummary; - this.omitLocusTable = omitLocusTable; - } - - public Collection multiplex() { - List outputs = new ArrayList(); - if(!omitDepthOutput) outputs.add(new DoCOutputType(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary)); - - if(!omitIntervals) { - for(DoCOutputType.Partition partition: partitions) { - outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.interval, DoCOutputType.FileType.summary)); - outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.interval, DoCOutputType.FileType.statistics)); - } - } - - if(refSeqGeneList != null && partitions.contains(DoCOutputType.Partition.sample)) { - DoCOutputType geneSummaryOut = new DoCOutputType(DoCOutputType.Partition.sample, DoCOutputType.Aggregation.gene, DoCOutputType.FileType.summary); - outputs.add(geneSummaryOut); - } - - if(!omitSampleSummary) { - for(DoCOutputType.Partition partition: partitions) { - outputs.add(new DoCOutputType(partition, 
DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.summary)); - outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.statistics)); - } - } - - if(!omitLocusTable) { - for(DoCOutputType.Partition partition: partitions) { - outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.coverage_counts)); - outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.coverage_proportions)); - } - } - - return outputs; - } - - public String transformArgument(final DoCOutputType outputType, final String argument) { - return outputType.getFileName(argument); - } - -} - -class CoveragePartitioner { - private Collection types; - private Map coverageProfiles; - private Map> identifiersByType; - private Set allIdentifiers; - public CoveragePartitioner(Collection typesToUse, int start, int stop, int nBins) { - coverageProfiles = new HashMap(); - identifiersByType = new HashMap>(); - types = typesToUse; - for ( DoCOutputType.Partition type : types ) { - coverageProfiles.put(type,new DepthOfCoverageStats(DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins))); - identifiersByType.put(type,new ArrayList()); - } - allIdentifiers = new HashSet(); - } - - public void merge(CoveragePartitioner otherAggregator) { - for ( DoCOutputType.Partition type : types ) { - this.coverageProfiles.get(type).merge(otherAggregator.coverageProfiles.get(type)); - } - } - - public DepthOfCoverageStats getCoverageByAggregationType(DoCOutputType.Partition t) { - return coverageProfiles.get(t); - } - - public void addIdentifiers(DoCOutputType.Partition t, Set ids) { - for ( String s : ids ) { - coverageProfiles.get(t).addSample(s); - identifiersByType.get(t).add(s); - allIdentifiers.add(s); - } - Collections.sort(identifiersByType.get(t)); - } - - public void initialize(boolean useDels, boolean omitLocusTable) { - for ( DoCOutputType.Partition t : types ) { - 
if ( useDels ) { - coverageProfiles.get(t).initializeDeletions(); - } - if ( ! omitLocusTable ) { - coverageProfiles.get(t).initializeLocusCounts(); - } - } - } - - public void update(Map> countsByIdentifierByType) { - for ( DoCOutputType.Partition t : types ) { - coverageProfiles.get(t).update(countsByIdentifierByType.get(t)); - } - } - - public Set getAllIdentifiers() { - return allIdentifiers; - } - - public Map> getIdentifiersByType() { - return identifiersByType; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java deleted file mode 100644 index 533c7be73..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java +++ /dev/null @@ -1,169 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.io.PrintStream; -import java.util.Arrays; - -/** - * At every locus in the input set, compares the pileup data (reference base, aligned base from - * each overlapping read, and quality score) to the reference pileup data generated by samtools. Samtools' pileup data - * should be specified using the command-line argument '-pileup:SAMPileup '. 
- */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -@Requires(value={DataSource.READS,DataSource.REFERENCE}) -public class CheckPileup extends LocusWalker implements TreeReducible { - @Input(fullName = "pileup", doc="The SAMPileup containing the expected output", required = true) - RodBinding pileup; - - @Output - private PrintStream out; - - @Argument(fullName="continue_after_error",doc="Continue after an error",required=false) - public boolean CONTINUE_AFTER_AN_ERROR = false; - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - ReadBackedPileup pileup = context.getBasePileup(); - SAMPileupFeature truePileup = getTruePileup( tracker ); - - if ( truePileup == null ) { - out.printf("No truth pileup data available at %s%n", pileup.getPileupString(ref.getBaseAsChar())); - if ( ! CONTINUE_AFTER_AN_ERROR ) { - throw new UserException.CommandLineException(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases", - context.getLocation(), new String(pileup.getBases()))); - } - } else { - String pileupDiff = pileupDiff(pileup, truePileup, true); - if ( pileupDiff != null ) { - out.printf("%s vs. %s%n", pileup.getPileupString(ref.getBaseAsChar()), truePileup.getPileupString()); - if ( ! 
CONTINUE_AFTER_AN_ERROR ) { - throw new RuntimeException(String.format("Pileups aren't equal: %s", pileupDiff)); - } - } - } - - return pileup.getNumberOfElements(); - } - - private static String maybeSorted( final String x, boolean sortMe ) - { - if ( sortMe ) { - byte[] bytes = x.getBytes(); - Arrays.sort(bytes); - return new String(bytes); - } - else - return x; - } - - public String pileupDiff(final ReadBackedPileup a, final SAMPileupFeature b, boolean orderDependent) - { - if ( a.getNumberOfElements() != b.size() ) - return "Sizes not equal"; - GenomeLoc featureLocation = getToolkit().getGenomeLocParser().createGenomeLoc(b.getChr(),b.getStart(),b.getEnd()); - if ( a.getLocation().compareTo(featureLocation) != 0 ) - return "Locations not equal"; - - String aBases = maybeSorted(new String(a.getBases()), ! orderDependent ); - String bBases = maybeSorted(b.getBasesAsString(), ! orderDependent ); - if ( ! aBases.toUpperCase().equals(bBases.toUpperCase()) ) - return "Bases not equal"; - - String aQuals = maybeSorted(new String(a.getQuals()), ! orderDependent ); - String bQuals = maybeSorted(new String(b.getQuals()), ! orderDependent ); - if ( ! aQuals.equals(bQuals) ) - return "Quals not equal"; - - return null; - } - - // Given result of map function - public CheckPileupStats reduceInit() { return new CheckPileupStats(); } - public CheckPileupStats reduce(Integer value, CheckPileupStats sum) { - sum.nLoci++; - sum.nBases += value; - return sum; - } - - public CheckPileupStats treeReduce( CheckPileupStats lhs, CheckPileupStats rhs ) { - CheckPileupStats combined = new CheckPileupStats(); - combined.nLoci = lhs.nLoci + rhs.nLoci; - combined.nBases = lhs.nBases + rhs.nBases; - return combined; - } - - /** - * Extracts the true pileup data from the given rodSAMPileup. Note that this implementation - * assumes that the genotype will only be point or indel. - * @param tracker ROD tracker from which to extract pileup data. - * @return True pileup data. 
- */ - private SAMPileupFeature getTruePileup( RefMetaDataTracker tracker ) { - SAMPileupFeature pileupArg = tracker.getFirstValue(pileup); - - if( pileupArg == null) - return null; - - if( pileupArg.hasPointGenotype() ) - return pileupArg.getPointGenotype(); - else if( pileupArg.hasIndelGenotype() ) - return pileupArg.getIndelGenotype(); - else - throw new ReviewedStingException("Unsupported pileup type: " + pileupArg); - } -} - -class CheckPileupStats { - public long nLoci = 0; - public long nBases = 0; - - public CheckPileupStats() { - } - - public String toString() { - return String.format("Validated %d sites covered by %d bases%n", nLoci, nBases); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java deleted file mode 100644 index 7ec93e582..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java +++ /dev/null @@ -1,112 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; -import org.broadinstitute.sting.gatk.walkers.RefWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; - -/** - * a walker that simply throws errors. 
Allows us to test that the engine is behaving as expected with error handling - */ -@Hidden -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_TEST, extraDocs = {CommandLineGATK.class} ) -public class ErrorThrowing extends RefWalker implements TreeReducible, NanoSchedulable { - @Input(fullName="exception", shortName = "E", doc="Java class of exception to throw", required=true) - public String exceptionToThrow; - - @Argument(fullName = "failMethod", shortName = "fail", doc = "Determines which method to fail in", required = false) - public FailMethod failMethod = FailMethod.MAP; - - public enum FailMethod { - MAP, - REDUCE, - TREE_REDUCE - } - - // - // Template code to allow us to build the walker, doesn't actually do anything - // - @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( ref == null ) // only throw exception when we are in proper map, not special map(null) call - return null; - - if ( failMethod == FailMethod.MAP ) - fail(); - - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - if ( value != null && failMethod == FailMethod.REDUCE ) - fail(); - return sum; - } - - public Integer treeReduce(final Integer lhs, final Integer rhs) { - if ( failMethod == FailMethod.TREE_REDUCE ) - fail(); - return rhs; - } - - private void fail() { - if ( exceptionToThrow.equals("UserException") ) { - throw new UserException("UserException"); - } else if ( exceptionToThrow.equals("NullPointerException") ) { - throw new NullPointerException(); - } else if ( exceptionToThrow.equals("ReviewedStingException") ) { - throw new ReviewedStingException("ReviewedStingException"); - } else if ( exceptionToThrow.equals("SamError1") ) { - throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_1); - } else if ( exceptionToThrow.equals("SamError2") ) { - throw new 
RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_2); - } else if ( exceptionToThrow.equals("NoSpace1") ) { - throw new net.sf.samtools.util.RuntimeIOException(new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); - } else if ( exceptionToThrow.equals("NoSpace2") ) { - throw new net.sf.samtools.SAMException("Exception writing BAM index file", new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); - } else { - throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java deleted file mode 100644 index 23bbf1460..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java +++ /dev/null @@ -1,189 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broad.tribble.Feature; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -/** - * Emulates the samtools pileup command to print aligned reads - * - *

Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position, - * consisting of chromosome name, coordinate, reference base, read bases, and read qualities. - * - * Emulated command: - * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] - * - *

Input

- *

- * A BAM file and the interval to print. - *

- * - *

Output

- *

- * Formatted pileup-style alignment of reads. - *

- * - *

Example

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -T Pileup \
- *   -R ref.fasta \
- *   -I aligned_reads.bam \
- *   -o output.txt
- * 
- * - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -public class Pileup extends LocusWalker implements TreeReducible, NanoSchedulable { - - private static final String verboseDelimiter = "@"; // it's ugly to use "@" but it's literally the only usable character not allowed in read names - - @Output - PrintStream out; - - /** - * In addition to the standard pileup output, adds 'verbose' output too. The verbose output contains the number of spanning deletions, - * and for each read in the pileup it has the read name, offset in the base string, read length, and read mapping quality. These per - * read items are delimited with an '@' character. - */ - @Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output", required=false) - public boolean SHOW_VERBOSE = false; - - @Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false) - public List> rods = Collections.emptyList(); - - @Hidden - @Argument(fullName="outputInsertLength",shortName = "outputInsertLength",doc="Add a column which contains the length of the insert each base comes from.",required=false) - public boolean outputInsertLength=false; - - @Override - public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - final String rods = getReferenceOrderedData( tracker ); - - ReadBackedPileup basePileup = context.getBasePileup(); - - final StringBuilder s = new StringBuilder(); - s.append(String.format("%s %s", basePileup.getPileupString((char)ref.getBase()), rods)); - if ( outputInsertLength ) - s.append(" ").append(insertLengthOutput(basePileup)); - if ( SHOW_VERBOSE ) - s.append(" ").append(createVerboseOutput(basePileup)); - s.append("\n"); - - return s.toString(); - } - - // Given result of map function - @Override - public Integer reduceInit() { return 0; } - - @Override - public Integer reduce(String value, 
Integer sum) { - out.print(value); - return sum + 1; - } - - @Override - public Integer treeReduce(Integer lhs, Integer rhs) { - return lhs + rhs; - } - - /** - * Get a string representation the reference-ordered data. - * @param tracker Container for the reference-ordered data. - * @return String representation of the reference-ordered data. - */ - private String getReferenceOrderedData( RefMetaDataTracker tracker ) { - ArrayList rodStrings = new ArrayList(); - for ( Feature datum : tracker.getValues(rods) ) { - rodStrings.add(datum.toString()); - } - String rodString = Utils.join(", ", rodStrings); - - if ( !rodString.equals("") ) - rodString = "[ROD: " + rodString + "]"; - - return rodString; - } - private static String insertLengthOutput(final ReadBackedPileup pileup) { - - Integer[] insertSizes=new Integer[pileup.depthOfCoverage()]; - - int i=0; - for ( PileupElement p : pileup ) { - insertSizes[i]=p.getRead().getInferredInsertSize(); - ++i; - } - return Utils.join(",",insertSizes); - } - - - private static String createVerboseOutput(final ReadBackedPileup pileup) { - final StringBuilder sb = new StringBuilder(); - boolean isFirst = true; - - sb.append(pileup.getNumberOfDeletions()); - sb.append(" "); - - for ( PileupElement p : pileup ) { - if ( isFirst ) - isFirst = false; - else - sb.append(","); - sb.append(p.getRead().getReadName()); - sb.append(verboseDelimiter); - sb.append(p.getOffset()); - sb.append(verboseDelimiter); - sb.append(p.getRead().getReadLength()); - sb.append(verboseDelimiter); - sb.append(p.getRead().getMappingQuality()); - } - return sb.toString(); - } - - @Override - public void onTraversalDone(Integer result) { - out.println("[REDUCE RESULT] Traversal result is: " + result); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java deleted file mode 100644 index cc8b3401e..000000000 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java +++ /dev/null @@ -1,149 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMReadGroupRecord; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.io.PrintStream; -import java.util.Arrays; - -/** - * User: depristo - * Date: May 5, 2010 - * Time: 12:16:41 PM - */ - -/** - * Walks over the input reads, printing out statistics about the read length, number of clipping events, and length - * of the clipping to the output stream. 
- */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -@Requires({DataSource.READS}) -public class ReadClippingStats extends ReadWalker { - @Output - protected PrintStream out; - - @Argument(fullName="mappedOnly", shortName="mo", doc="when this flag is set (default), statistics will be collected "+ - "on mapped reads only, while unmapped reads will be discarded", required=false) - protected boolean MAPPED_ONLY = true; - - @Argument(fullName="skip", shortName="skip", doc="When provided, only every skip reads are analyzed", required=false) - protected int SKIP = 1; - -// public void initialize() { -// -// } - - public class ReadClippingInfo { - SAMReadGroupRecord rg; - int readLength, nClippingEvents, nClippedBases; - } - - public ReadClippingInfo map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { - if ( AlignmentUtils.isReadUnmapped(read) && MAPPED_ONLY) - return null; - - ReadClippingInfo info = new ReadClippingInfo(); - info.rg = read.getReadGroup(); - - if ( info.rg == null ) throw new UserException.ReadMissingReadGroup(read); - - for ( CigarElement elt : read.getCigar().getCigarElements() ) { - if ( elt.getOperator() != CigarOperator.N ) - - switch ( elt.getOperator()) { - case H : // ignore hard clips - case S : // soft clip - info.nClippingEvents++; - info.nClippedBases += elt.getLength(); - // note the fall through here - case M : - case D : // deletion w.r.t. the reference - case P : // ignore pads - case I : // insertion w.r.t. the reference - info.readLength += elt.getLength(); // Unless we have a reference skip, the read gets longer - break; - case N : // reference skip (looks and gets processed just like a "deletion", just different logical meaning) - break; - default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + elt.getOperator()); - } - } - - return info; //To change body of implemented methods use File | Settings | File Templates. 
- } - - /** - * Provide an initial value for reduce computations. - * - * @return Initial value of reduce. - */ - public Integer reduceInit() { - out.println(Utils.join(" \t", Arrays.asList("ReadGroup", "ReadLength", "NClippingEvents", "NClippedBases", "PercentClipped"))); - return 0; - } - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * - * @param info result of the map. - * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. - */ - public Integer reduce(ReadClippingInfo info, Integer sum) { - if ( info != null ) { - if ( sum % SKIP == 0 ) { - String id = info.rg.getReadGroupId(); - out.printf("%s\t %d\t %d\t %d\t %.2f%n", - id, info.readLength, info.nClippingEvents, info.nClippedBases, - 100.0 * MathUtils.ratio(info.nClippedBases, info.readLength)); - } - return sum + 1; //To change body of implemented methods use File | Settings | File Templates. - } else { - return sum; - } - } - - public void onTraversalDone(Integer result) { - - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java deleted file mode 100644 index 1362b109e..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ /dev/null @@ -1,359 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission 
notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.gatk.walkers.annotator.ChromosomeCountConstants; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.vcf.*; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; -import 
org.broadinstitute.variant.variantcontext.writer.Options; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; - -import java.util.*; - -/** - * Combines VCF records from different sources. - * - *

- * CombineVariants combines VCF records from different sources. Any (unique) name can be used to bind your rod data - * and any number of sources can be input. This tool currently supports two different combination types for each of - * variants (the first 8 fields of the VCF) and genotypes (the rest). - * Merge: combines multiple records into a single one; if sample names overlap then they are uniquified. - * Union: assumes each rod represents the same set of samples (although this is not enforced); using the - * priority list (if provided), it emits a single record instance at every position represented in the rods. - * - * CombineVariants will include a record at every site in all of your input VCF files, and annotate which input ROD - * bindings the record is present, pass, or filtered in in the set attribute in the INFO field. In effect, - * CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the N merged VCFs - * can be exacted using JEXL expressions on the set attribute using SelectVariants. If you want to extract just - * the records in common between two VCFs, you would first run CombineVariants on the two files to generate a single - * VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out - * in the detailed example in the documentation guide. - * - * Note that CombineVariants supports multi-threaded parallelism (8/15/12). This is particularly useful - * when converting from VCF to BCF2, which can be expensive. In this case each thread spends CPU time - * doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together - * efficiency. However, since this merge runs in only one thread, you can quickly reach diminishing - * returns with the number of parallel threads. -nt 4 works well but -nt 8 may be too much. - * - * Some fine details about the merging algorithm: - *

    - *
  • As of GATK 2.1, when merging multiple VCF records at a site, the combined VCF record has the QUAL of - * the first VCF record with a non-MISSING QUAL value. The previous behavior was to take the - * max QUAL, which resulted in sometime strange downstream confusion
  • - *
- * - *

Input

- *

- * One or more variant sets to combine. - *

- * - *

Output

- *

- * A combined VCF. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T CombineVariants \
- *   --variant input1.vcf \
- *   --variant input2.vcf \
- *   -o output.vcf \
- *   -genotypeMergeOptions UNIQUIFY
- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T CombineVariants \
- *   --variant:foo input1.vcf \
- *   --variant:bar input2.vcf \
- *   -o output.vcf \
- *   -genotypeMergeOptions PRIORITIZE
- *   -priority foo,bar
- * 
- * - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) -@Reference(window=@Window(start=-50,stop=50)) -public class CombineVariants extends RodWalker implements TreeReducible { - /** - * The VCF files to merge together - * - * variants can take any number of arguments on the command line. Each -V argument - * will be included in the final merged output VCF. If no explicit name is provided, - * the -V arguments will be named using the default algorithm: variants, variants2, variants3, etc. - * The user can override this by providing an explicit name -V:name,vcf for each -V argument, - * and each named argument will be labeled as such in the output (i.e., set=name rather than - * set=variants2). The order of arguments does not matter unless except for the naming, so - * if you provide an rod priority list and no explicit names than variants, variants2, etc - * are technically order dependent. It is strongly recommended to provide explicit names when - * a rod priority list is provided. 
- */ - @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) - public List> variants; - - @Output(doc="File to which variants should be written") - protected VariantContextWriter vcfWriter = null; - - @Argument(shortName="genotypeMergeOptions", doc="Determines how we should merge genotype records for samples shared across the ROD files", required=false) - public GATKVariantContextUtils.GenotypeMergeType genotypeMergeOption = null; - - @Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false) - public GATKVariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED; - - @Hidden - @Argument(shortName="multipleAllelesMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel)", required=false) - public GATKVariantContextUtils.MultipleAllelesMergeType multipleAllelesMergeType = GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE; - - /** - * Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided. 
- */ - @Argument(fullName="rod_priority_list", shortName="priority", doc="A comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted", required=false) - public String PRIORITY_STRING = null; - - @Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false) - public boolean printComplexMerges = false; - - @Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF", required=false) - public boolean filteredAreUncalled = false; - - /** - * Used to generate a sites-only file. - */ - @Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype FORMAT fields", required=false) - public boolean minimalVCF = false; - - @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the combining procedure", required=false) - public boolean EXCLUDE_NON_VARIANTS = false; - - /** - * Set to 'null' if you don't want the set field emitted. - */ - @Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false) - public String SET_KEY = "set"; - - /** - * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime. 
- */ - @Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls", required=false) - public boolean ASSUME_IDENTICAL_SAMPLES = false; - - @Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if the variant is present in at least N input files.", required=false) - public int minimumN = 1; - - /** - * This option allows the suppression of the command line in the VCF header. This is most often usefully when combining variants for dozens or hundreds of smaller VCFs. - */ - @Argument(fullName="suppressCommandLineHeader", shortName="suppressCommandLineHeader", doc="If true, do not output the header containing the command line used", required=false) - public boolean SUPPRESS_COMMAND_LINE_HEADER = false; - - @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false) - public boolean MERGE_INFO_WITH_MAX_AC = false; - - @Argument(fullName="combineAnnotations", shortName="combineAnnotations", doc="If true, combine the annotation values in some straightforward manner assuming the input callsets are i.i.d.", required=false) - public boolean COMBINE_ANNOTATIONS = false; - - private List priority = null; - - /** Optimization to strip out genotypes before merging if we are doing a sites_only output */ - private boolean sitesOnlyVCF = false; - private Set samples; - - public void initialize() { - Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); - - if ( vcfWriter instanceof VariantContextWriterStub) { - sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES); - if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance"); - } else - logger.warn("VCF 
output file not an instance of VCFWriterStub; cannot enable sites only output option"); - - validateAnnotateUnionArguments(); - if ( PRIORITY_STRING == null && genotypeMergeOption == null) { - genotypeMergeOption = GATKVariantContextUtils.GenotypeMergeType.UNSORTED; - //PRIORITY_STRING = Utils.join(",", vcfRods.keySet()); Deleted by Ami (7/10/12) - logger.info("Priority string is not provided, using arbitrary genotyping order: "+priority); - } - - if (genotypeMergeOption == GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE && - !SampleUtils.verifyUniqueSamplesNames(vcfRods)) - throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered."); - - samples = sitesOnlyVCF ? Collections.emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption); - - if ( SET_KEY.toLowerCase().equals("null") ) - SET_KEY = null; - - Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); - if ( SET_KEY != null ) - headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants")); - if ( !ASSUME_IDENTICAL_SAMPLES ) - headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions)); - VCFHeader vcfHeader = new VCFHeader(headerLines, samples); - vcfHeader.setWriteCommandLine(!SUPPRESS_COMMAND_LINE_HEADER); - vcfWriter.writeHeader(vcfHeader); - } - - private void validateAnnotateUnionArguments() { - Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); - - if ( genotypeMergeOption == GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null ) - throw new UserException.MissingArgument("rod_priority_list", "Priority string must be provided if you want to prioritize genotypes"); - - if ( PRIORITY_STRING != null){ - priority = new ArrayList<>(Arrays.asList(PRIORITY_STRING.split(","))); - if ( rodNames.size() != priority.size() ) - throw new UserException.BadArgumentValue("rod_priority_list", "The priority list 
must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority); - - if ( ! rodNames.containsAll(priority) ) - throw new UserException.BadArgumentValue("rod_priority_list", "Not all priority elements provided as input RODs: " + PRIORITY_STRING); - } - - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) // RodWalkers can make funky map calls - return 0; - - final Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); - // get all of the vcf rods at this locus - // Need to provide reference bases to simpleMerge starting at current locus - Collection vcs = tracker.getValues(variants, context.getLocation()); - Collection potentialRefVCs = tracker.getValues(variants); - potentialRefVCs.removeAll(vcs); - - if ( sitesOnlyVCF ) { - vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs); - potentialRefVCs = VariantContextUtils.sitesOnlyVariantContexts(potentialRefVCs); - } - - if ( ASSUME_IDENTICAL_SAMPLES ) { - for ( final VariantContext vc : vcs ) { - vcfWriter.add(vc); - } - - return vcs.isEmpty() ? 
0 : 1; - } - - int numFilteredRecords = 0; - for (final VariantContext vc : vcs) { - if (vc.filtersWereApplied() && vc.isFiltered()) - numFilteredRecords++; - } - - if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN)) - return 0; - - final List mergedVCs = new ArrayList<>(); - - if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE) { - final Map> VCsByType = GATKVariantContextUtils.separateVariantContextsByType(vcs); - - // TODO -- clean this up in a refactoring - // merge NO_VARIATION into another type of variant (based on the ordering in VariantContext.Type) - if ( VCsByType.containsKey(VariantContext.Type.NO_VARIATION) && VCsByType.size() > 1 ) { - final List refs = VCsByType.remove(VariantContext.Type.NO_VARIATION); - for ( final VariantContext.Type type : VariantContext.Type.values() ) { - if ( VCsByType.containsKey(type) ) { - VCsByType.get(type).addAll(refs); - break; - } - } - } - - // iterate over the types so that it's deterministic - for (final VariantContext.Type type : VariantContext.Type.values()) { - // make sure that it is a variant or in case it is not, that we want to include the sites with no variants - if (!EXCLUDE_NON_VARIANTS || !type.equals(VariantContext.Type.NO_VARIATION)) { - if (VCsByType.containsKey(type)) { - mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), potentialRefVCs, - priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS)); - } - } - } - } - else if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) { - mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, potentialRefVCs, - priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS)); - } - else { - logger.warn("Ignoring all records at 
site " + ref.getLocus()); - } - - for ( final VariantContext mergedVC : mergedVCs ) { - // only operate at the start of events - if ( mergedVC == null ) - continue; - - final VariantContextBuilder builder = new VariantContextBuilder(mergedVC); - // re-compute chromosome counts - VariantContextUtils.calculateChromosomeCounts(builder, false); - - if ( minimalVCF ) - GATKVariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY)); - final VariantContext vc = builder.make(); - if( !EXCLUDE_NON_VARIANTS || vc.isPolymorphicInSamples() ) - vcfWriter.add(builder.make()); - } - - return vcs.isEmpty() ? 0 : 1; - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - - @Override - public Integer treeReduce(Integer lhs, Integer rhs) { - return reduce(lhs, rhs); - } - - public void onTraversalDone(Integer sum) {} -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java deleted file mode 100755 index 848261d73..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java +++ /dev/null @@ -1,350 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFHeader; - -import java.util.*; - -/** - * A class for tabulating and evaluating a callset-by-callset genotype concordance table - * */ -public class ConcordanceMetrics { - - private Map perSampleGenotypeConcordance; - private GenotypeConcordanceTable overallGenotypeConcordance; - private SiteConcordanceTable overallSiteConcordance; - - public ConcordanceMetrics(VCFHeader evaluate, VCFHeader truth) { - HashSet overlappingSamples = new HashSet(evaluate.getGenotypeSamples()); - overlappingSamples.retainAll(truth.getGenotypeSamples()); - perSampleGenotypeConcordance = new HashMap(overlappingSamples.size()); - for ( String sample : overlappingSamples ) { - perSampleGenotypeConcordance.put(sample,new GenotypeConcordanceTable()); - } - overallGenotypeConcordance = new GenotypeConcordanceTable(); - overallSiteConcordance = new SiteConcordanceTable(); - } - - public GenotypeConcordanceTable getOverallGenotypeConcordance() { - return overallGenotypeConcordance; - } - - public SiteConcordanceTable getOverallSiteConcordance() { - return overallSiteConcordance; - } - - public 
GenotypeConcordanceTable getGenotypeConcordance(String sample) { - GenotypeConcordanceTable table = perSampleGenotypeConcordance.get(sample); - if ( table == null ) - throw new ReviewedStingException("Attempted to request the concordance table for sample "+sample+" on which it was not calculated"); - return table; - } - - public Map getPerSampleGenotypeConcordance() { - return Collections.unmodifiableMap(perSampleGenotypeConcordance); - } - - public Map getPerSampleNRD() { - Map nrd = new HashMap(perSampleGenotypeConcordance.size()); - for ( Map.Entry sampleTable : perSampleGenotypeConcordance.entrySet() ) { - nrd.put(sampleTable.getKey(),calculateNRD(sampleTable.getValue())); - } - - return Collections.unmodifiableMap(nrd); - } - - public Map getPerSampleOGC() { - Map ogc = new HashMap(perSampleGenotypeConcordance.size()); - for ( Map.Entry sampleTable : perSampleGenotypeConcordance.entrySet() ) { - ogc.put(sampleTable.getKey(),calculateOGC(sampleTable.getValue())); - } - - return Collections.unmodifiableMap(ogc); - } - - public Double getOverallNRD() { - return calculateNRD(overallGenotypeConcordance); - } - - public Double getOverallOGC() { - return calculateOGC(overallGenotypeConcordance); - } - - public Map getPerSampleNRS() { - Map nrs = new HashMap(perSampleGenotypeConcordance.size()); - for ( Map.Entry sampleTable : perSampleGenotypeConcordance.entrySet() ) { - nrs.put(sampleTable.getKey(),calculateNRS(sampleTable.getValue())); - } - - return Collections.unmodifiableMap(nrs); - } - - public Double getOverallNRS() { - return calculateNRS(overallGenotypeConcordance); - } - - @Requires({"eval != null","truth != null"}) - public void update(VariantContext eval, VariantContext truth) { - overallSiteConcordance.update(eval,truth); - Set alleleTruth = new HashSet(8); - String truthRef = truth.getReference().getBaseString(); - alleleTruth.add(truthRef); - for ( Allele a : truth.getAlternateAlleles() ) { - alleleTruth.add(a.getBaseString()); - } - for ( String 
sample : perSampleGenotypeConcordance.keySet() ) { - Genotype evalGenotype = eval.getGenotype(sample); - Genotype truthGenotype = truth.getGenotype(sample); - // ensure genotypes are either no-call ("."), missing (empty alleles), or diploid - if ( ( ! evalGenotype.isNoCall() && evalGenotype.getPloidy() != 2 && evalGenotype.getPloidy() > 0) || - ( ! truthGenotype.isNoCall() && truthGenotype.getPloidy() != 2 && truthGenotype.getPloidy() > 0) ) { - throw new UserException(String.format("Concordance Metrics is currently only implemented for DIPLOID genotypes, found eval ploidy: %d, comp ploidy: %d",evalGenotype.getPloidy(),truthGenotype.getPloidy())); - } - perSampleGenotypeConcordance.get(sample).update(evalGenotype,truthGenotype,alleleTruth,truthRef); - overallGenotypeConcordance.update(evalGenotype,truthGenotype,alleleTruth,truthRef); - } - } - - private static double calculateNRD(GenotypeConcordanceTable table) { - return calculateNRD(table.getTable()); - } - - private static double calculateNRD(int[][] concordanceCounts) { - int correct = 0; - int total = 0; - correct += concordanceCounts[GenotypeType.HET.ordinal()][GenotypeType.HET.ordinal()]; - correct += concordanceCounts[GenotypeType.HOM_VAR.ordinal()][GenotypeType.HOM_VAR.ordinal()]; - total += correct; - total += concordanceCounts[GenotypeType.HOM_REF.ordinal()][GenotypeType.HET.ordinal()]; - total += concordanceCounts[GenotypeType.HOM_REF.ordinal()][GenotypeType.HOM_VAR.ordinal()]; - total += concordanceCounts[GenotypeType.HET.ordinal()][GenotypeType.HOM_REF.ordinal()]; - total += concordanceCounts[GenotypeType.HET.ordinal()][GenotypeType.HOM_VAR.ordinal()]; - total += concordanceCounts[GenotypeType.HOM_VAR.ordinal()][GenotypeType.HOM_REF.ordinal()]; - total += concordanceCounts[GenotypeType.HOM_VAR.ordinal()][GenotypeType.HET.ordinal()]; - // NRD is by definition incorrec/total = 1.0-correct/total - // note: if there are no observations (so the ratio is NaN), set this to 100% - return total == 0 ? 
1.0 : 1.0 - ( (double) correct)/( (double) total); - } - - private static double calculateOGC(int[][] concordanceCounts) { - int correct = 0; - int total = 0; - correct += concordanceCounts[GenotypeType.HOM_REF.ordinal()][GenotypeType.HOM_REF.ordinal()]; - correct += concordanceCounts[GenotypeType.HET.ordinal()][GenotypeType.HET.ordinal()]; - correct += concordanceCounts[GenotypeType.HOM_VAR.ordinal()][GenotypeType.HOM_VAR.ordinal()]; - total += correct; - total += concordanceCounts[GenotypeType.HOM_REF.ordinal()][GenotypeType.HET.ordinal()]; - total += concordanceCounts[GenotypeType.HOM_REF.ordinal()][GenotypeType.HOM_VAR.ordinal()]; - total += concordanceCounts[GenotypeType.HET.ordinal()][GenotypeType.HOM_REF.ordinal()]; - total += concordanceCounts[GenotypeType.HET.ordinal()][GenotypeType.HOM_VAR.ordinal()]; - total += concordanceCounts[GenotypeType.HOM_VAR.ordinal()][GenotypeType.HOM_REF.ordinal()]; - total += concordanceCounts[GenotypeType.HOM_VAR.ordinal()][GenotypeType.HET.ordinal()]; - // OGC is by definition correct/total - // note: if there are no observations (so the ratio is NaN), set this to 100% - return total == 0 ? 
1.0 : ( (double) correct)/( (double) total); - } - - private static double calculateNRS(GenotypeConcordanceTable table) { - return calculateNRS(table.getTable()); - } - - private static double calculateOGC(GenotypeConcordanceTable table) { - return calculateOGC(table.getTable()); - } - - private static double calculateNRS(int[][] concordanceCounts) { - long confirmedVariant = 0; - long unconfirmedVariant = 0; - for ( GenotypeType truthState : Arrays.asList(GenotypeType.HET,GenotypeType.HOM_VAR) ) { - for ( GenotypeType evalState : GenotypeType.values() ) { - if ( evalState == GenotypeType.MIXED ) - continue; - if ( evalState.equals(GenotypeType.HET) || evalState.equals(GenotypeType.HOM_VAR) ) - confirmedVariant += concordanceCounts[evalState.ordinal()][truthState.ordinal()]; - else - unconfirmedVariant += concordanceCounts[evalState.ordinal()][truthState.ordinal()]; - } - } - - long total = confirmedVariant + unconfirmedVariant; - // note: if there are no observations (so the ratio is NaN) set this to 0% - return total == 0l ? 0.0 : ( (double) confirmedVariant ) / ( (double) ( total ) ); - } - - - class GenotypeConcordanceTable { - - private int[][] genotypeCounts; - private int nMismatchingAlt; - - public GenotypeConcordanceTable() { - genotypeCounts = new int[GenotypeType.values().length][GenotypeType.values().length]; - nMismatchingAlt = 0; - } - - @Requires({"eval!=null","truth != null","truthAlleles != null"}) - public void update(Genotype eval, Genotype truth, Set truthAlleles, String truthRef) { - // this is slow but correct. - - // NOTE: a reference call in "truth" is a special case, the eval can match *any* of the truth alleles - // that is, if the reference base is C, and a sample is C/C in truth, A/C, A/A, T/C, T/T will - // all match, so long as A and T are alleles in the truth callset. 
- boolean matchingAlt = true; - if ( eval.isCalled() && truth.isCalled() && truth.isHomRef() ) { - // by default, no-calls "match" between alleles, so if - // one or both sites are no-call or unavailable, the alt alleles match - // otherwise, check explicitly: if the eval has an allele that's not ref, no-call, or present in truth - // the alt allele is mismatching - regardless of whether the genotype is correct. - for ( Allele evalAllele : eval.getAlleles() ) { - matchingAlt &= truthAlleles.contains(evalAllele.getBaseString()); - } - } else if ( eval.isCalled() && truth.isCalled() ) { - // otherwise, the eval genotype has to match either the alleles in the truth genotype, or the truth reference allele - // todo -- this can be sped up by caching the truth allele sets - Set genoAlleles = new HashSet(3); - genoAlleles.add(truthRef); - for ( Allele truthGenoAl : truth.getAlleles() ) { - genoAlleles.add(truthGenoAl.getBaseString()); - } - for ( Allele evalAllele : eval.getAlleles() ) { - matchingAlt &= genoAlleles.contains(evalAllele.getBaseString()); - } - } - - if ( matchingAlt ) { - genotypeCounts[eval.getType().ordinal()][truth.getType().ordinal()]++; - } else { - nMismatchingAlt++; - } - } - - public int[][] getTable() { - return genotypeCounts; - } - - public int getnMismatchingAlt() { - return nMismatchingAlt; - } - - public int getnEvalGenotypes(GenotypeType type) { - int nGeno = 0; - for ( GenotypeType comptype : GenotypeType.values() ) - nGeno += genotypeCounts[type.ordinal()][comptype.ordinal()]; - return nGeno; - } - - public int getnCalledEvalGenotypes() { - int nGeno = 0; - for ( GenotypeType evalType : Arrays.asList(GenotypeType.HOM_REF,GenotypeType.HOM_VAR,GenotypeType.HET) ) { - nGeno += getnEvalGenotypes(evalType); - } - - return nGeno + nMismatchingAlt; - } - - public int getnCompGenotypes(GenotypeType type) { - int nGeno = 0; - for ( GenotypeType evaltype : GenotypeType.values() ) - nGeno += genotypeCounts[evaltype.ordinal()][type.ordinal()]; - 
return nGeno; - } - - public int getnCalledCompGenotypes() { - int nGeno = 0; - for ( GenotypeType compType : Arrays.asList(GenotypeType.HOM_REF,GenotypeType.HOM_VAR,GenotypeType.HET) ) { - nGeno += getnCompGenotypes(compType); - } - return nGeno; - } - - public int get(GenotypeType evalType, GenotypeType compType) { - return genotypeCounts[evalType.ordinal()][compType.ordinal()]; - } - } - - class SiteConcordanceTable { - - private int[] siteConcordance; - - public SiteConcordanceTable() { - siteConcordance = new int[SiteConcordanceType.values().length]; - } - - public void update(VariantContext evalVC, VariantContext truthVC) { - SiteConcordanceType matchType = getMatchType(evalVC,truthVC); - siteConcordance[matchType.ordinal()]++; - } - - @Requires({"evalVC != null","truthVC != null"}) - private SiteConcordanceType getMatchType(VariantContext evalVC, VariantContext truthVC) { - return SiteConcordanceType.getConcordanceType(evalVC,truthVC); - } - - public int[] getSiteConcordance() { - return siteConcordance; - } - - public int get(SiteConcordanceType type) { - return getSiteConcordance()[type.ordinal()]; - } - } - - enum SiteConcordanceType { - ALLELES_MATCH, - EVAL_SUPERSET_TRUTH, - EVAL_SUBSET_TRUTH, - ALLELES_DO_NOT_MATCH, - EVAL_ONLY, - TRUTH_ONLY; - - public static SiteConcordanceType getConcordanceType(VariantContext eval, VariantContext truth) { - if ( eval.isMonomorphicInSamples() ) - return TRUTH_ONLY; - if ( truth.isMonomorphicInSamples() ) - return EVAL_ONLY; - - boolean evalSubsetTruth = GATKVariantContextUtils.allelesAreSubset(eval, truth); - boolean truthSubsetEval = GATKVariantContextUtils.allelesAreSubset(truth, eval); - - if ( evalSubsetTruth && truthSubsetEval ) - return ALLELES_MATCH; - - if ( evalSubsetTruth ) - return EVAL_SUBSET_TRUTH; - - if ( truthSubsetEval ) - return EVAL_SUPERSET_TRUTH; - - return ALLELES_DO_NOT_MATCH; - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java deleted file mode 100644 index d26ab08f7..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java +++ /dev/null @@ -1,117 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.variant.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.*; - -/** - * Filters a lifted-over VCF file for ref bases that have been changed. - * - * "Lifting over" variants means adjusting variant calls from one reference to another. Specifically, the process adjusts the position of the call to match the corresponding position on the target reference. - * For example, if you have variants called from reads aligned to the hg19 reference, and you want to compare them to calls made based on the b37 reference, you need to liftover one of the callsets to the other reference. - * - * FilteredLiftedVariants is intended to be the second of two processing steps for the liftover process. The first step is to run LiftoverVariants on your VCF file. - * The second step is to run FilterLiftedVariants on the output of LiftoverVariants. This will produce valid well-behaved VCF files, where you'll see that the contig names in the header have all been correctly replaced. 
- * - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) -@Reference(window=@Window(start=0,stop=100)) -public class FilterLiftedVariants extends RodWalker { - - @ArgumentCollection - protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - - private static final int MAX_VARIANT_SIZE = 100; - - @Output(doc="File to which variants should be written") - protected VariantContextWriter writer = null; - - private long failedLocs = 0, totalLocs = 0; - - public void initialize() { - String trackName = variantCollection.variants.getName(); - Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); - Map vcfHeaders = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); - - final VCFHeader vcfHeader = new VCFHeader(vcfHeaders.containsKey(trackName) ? vcfHeaders.get(trackName).getMetaDataInSortedOrder() : Collections.emptySet(), samples); - writer.writeHeader(vcfHeader); - } - - private void filterAndWrite(byte[] ref, VariantContext vc) { - - totalLocs++; - - boolean failed = false; - byte[] recordRef = vc.getReference().getBases(); - for (int i = 0; i < recordRef.length && i < MAX_VARIANT_SIZE; i++) { - if ( recordRef[i] != ref[i] ) { - failed = true; - break; - } - } - - if ( failed ) - failedLocs++; - else - writer.add(vc); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); - for ( VariantContext vc : VCs ) - filterAndWrite(ref.getBases(), vc); - - return 0; - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(Integer value, Integer sum) { return 0; } - - public void onTraversalDone(Integer result) { - System.out.println("Filtered " + failedLocs + " records out of " + totalLocs + " 
total records."); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java deleted file mode 100755 index 724578a09..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java +++ /dev/null @@ -1,600 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.report.GATKReport; -import org.broadinstitute.sting.gatk.report.GATKReportTable; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFHeader; - -import java.io.PrintStream; -import java.util.*; - -/** - * Genotype concordance (per-sample and aggregate counts and frequencies, NRD/NRS and site allele overlaps) between two callsets - * - *

- * GenotypeConcordance takes in two callsets (vcfs) and tabulates the number of sites which overlap and share alleles, - * and for each sample, the genotype-by-genotype counts (for instance, the number of sites at which a sample was - * called homozygous reference in the EVAL callset, but homozygous variant in the COMP callset). It outputs these - * counts as well as convenient proportions (such as the proportion of het calls in the EVAL which were called REF in - * the COMP) and metrics (such as NRD and NRS). - * - *

Input

- *

- * Genotype concordance requires two callsets (as it does a comparison): an EVAL and a COMP callset, specified via - * the -eval and -comp arguments. - * - * (Optional) Jexl expressions for genotype-level filtering of EVAL or COMP genotypes, specified via the -gfe and - * -cfe arguments, respectively. - *

- * - *

Output

- * Genotype Concordance writes a GATK report to the specified file (via -o) , consisting of multiple tables of counts - * and proportions. These tables may be optionally moltenized via the -moltenize argument. That is, the standard table - * - *
- *  Sample   NO_CALL_HOM_REF  NO_CALL_HET  NO_CALL_HOM_VAR   (...)
- *  NA12878       0.003        0.001            0.000        (...)
- *  NA12891       0.005        0.000            0.000        (...)
- *  
- * - * would instead be displayed - * - *
- *  NA12878  NO_CALL_HOM_REF   0.003
- *  NA12878  NO_CALL_HET       0.001
- *  NA12878  NO_CALL_HOM_VAR   0.000
- *  NA12891  NO_CALL_HOM_REF   0.005
- *  NA12891  NO_CALL_HET       0.000
- *  NA12891  NO_CALL_HOM_VAR   0.000
- *  (...)
- *  
- * - * - * These tables are constructed on a per-sample basis, and include counts of eval vs comp genotype states, and the - * number of times the alternate alleles between the eval and comp sample did not match up. - * - * In addition, Genotype Concordance produces site-level allelic concordance. For strictly bi-allelic VCFs, - * only the ALLELES_MATCH, EVAL_ONLY, TRUTH_ONLY fields will be populated, but where multi-allelic sites are involved - * counts for EVAL_SUBSET_TRUTH and EVAL_SUPERSET_TRUTH will be generated. - * - * For example, in the following situation - *
- *    eval:  ref - A   alt - C
- *    comp:  ref - A   alt - C,T
- *  
- * then the site is tabulated as EVAL_SUBSET_TRUTH. Were the situation reversed, it would be EVAL_SUPERSET_TRUTH. - * However, in the case where eval has both C and T alternate alleles, both must be observed in the genotypes - * (that is, there must be at least one of (0/1,1/1) and at least one of (0/2,1/2,2/2) in the genotype field). If - * one of the alleles has no observations in the genotype fields of the eval, the site-level concordance is - * tabulated as though that allele were not present in the record. - * - *

Monomorphic Records

- * A site which has an alternate allele, but which is monomorphic in samples, is treated as not having been - * discovered, and will be recorded in the TRUTH_ONLY column (if a record exists in the comp VCF), or not at all - * (if no record exists in the comp VCF). - * - * That is, in the situation - *
- *   eval:  ref - A   alt - C   genotypes - 0/0  0/0  0/0 ... 0/0
- *   comp:  ref - A   alt - C   ...         0/0  0/0  ...
- *  
- * is equivalent to - *
- *   eval:  ref - A   alt - .   genotypes - 0/0  0/0  0/0 ... 0/0
- *   comp:  ref - A   alt - C   ...         0/0  0/0  ...
- *  
- * - * When a record is present in the comp VCF the *genotypes* for the monomorphic site will still be used to evaluate - * per-sample genotype concordance counts. - * - *

Filtered Records

- * Filtered records are treated as though they were not present in the VCF, unless -ignoreSiteFilters is provided, - * in which case all records are used. There is currently no way to assess concordance metrics on filtered sites - * exclusively. SelectVariants can be used to extract filtered sites, and VariantFiltration used to un-filter them. - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) -public class GenotypeConcordance extends RodWalker>,ConcordanceMetrics> { - - /** - * The callset you want to evaluate, typically this is where you'd put 'unassessed' callsets. - */ - @Input(fullName="eval",shortName="eval",doc="The variants and genotypes to evaluate",required=true) - RodBinding evalBinding; - - /** - * The callset you want to treat as 'truth'. Can also be of unknown quality for the sake of callset comparisons. - */ - @Input(fullName="comp",shortName="comp",doc="The variants and genotypes to compare against",required=true) - RodBinding compBinding; - - /** - * The FILTER field of the eval and comp VCFs will be ignored. If this flag is not included, all FILTER sites will - * be treated as not being present in the VCF. (That is, the genotypes will be assigned UNAVAILABLE, as distinct - * from NO_CALL). - */ - @Argument(fullName="ignoreFilters",doc="Filters will be ignored",required=false) - boolean ignoreFilters = false; - - /** - * A genotype level JEXL expression to apply to eval genotypes. Genotypes filtered in this way will be replaced by NO_CALL. - * For instance: -gfe 'GQ<20' will set to no-call any genotype with genotype quality less than 20. - */ - @Argument(shortName="gfe", fullName="genotypeFilterExpressionEval", doc="One or more criteria to use to set EVAL genotypes to no-call. 
"+ - "These genotype-level filters are only applied to the EVAL rod.", required=false) - public ArrayList genotypeFilterExpressionsEval = new ArrayList(); - - /** - * Identical to -gfe except the filter is applied to genotypes in the comp rod. - */ - @Argument(shortName="gfc", fullName="genotypeFilterExpressionComp", doc="One or more criteria to use to set COMP genotypes to no-call. "+ - "These genotype-level filters are only applied to the COMP rod.", required=false) - public ArrayList genotypeFilterExpressionsComp = new ArrayList(); - - /** - * Moltenize the count and proportion tables. Rather than moltenizing per-sample data into a 2x2 table, it is fully - * moltenized into elements. That is, WITHOUT this argument, each row of the table begins with the sample name and - * proceeds directly with counts/proportions of eval/comp counts (for instance HOM_REF/HOM_REF, HOM_REF/NO_CALL). - * - * If the Moltenize argument is given, the output will begin with a sample name, followed by the contrastive genotype - * type (such as HOM_REF/HOM_REF), followed by the count or proportion. This will significantly increase the number of - * rows. - */ - @Argument(shortName="moltenize",fullName="moltenize",doc="Molten rather than tabular output") - public boolean moltenize = false; - - @Output - PrintStream out; - - private List evalSamples; - private List compSamples; - private List evalJexls = null; - private List compJexls = null; - - // todo -- table with "proportion of overlapping sites" (not just eval/comp margins) [e.g. 
drop no-calls] - // (this will break all the integration tests of course, due to new formatting) - - public void initialize() { - evalJexls = initializeJexl(genotypeFilterExpressionsEval); - compJexls = initializeJexl(genotypeFilterExpressionsComp); - } - - private List initializeJexl(ArrayList genotypeFilterExpressions) { - ArrayList dummyNames = new ArrayList(genotypeFilterExpressions.size()); - int expCount = 1; - for ( String exp : genotypeFilterExpressions ) { - dummyNames.add(String.format("gfe%d",expCount++)); - } - return VariantContextUtils.initializeMatchExps(dummyNames, genotypeFilterExpressions); - } - - public ConcordanceMetrics reduceInit() { - Map headerMap = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(evalBinding,compBinding)); - VCFHeader evalHeader = headerMap.get(evalBinding.getName()); - evalSamples = evalHeader.getGenotypeSamples(); - VCFHeader compHeader = headerMap.get(compBinding.getName()); - compSamples = compHeader.getGenotypeSamples(); - return new ConcordanceMetrics(evalHeader,compHeader); - } - - - public List> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - List> evalCompPair = new ArrayList>(3); - if ( tracker != null && ( - tracker.getValues(evalBinding,ref.getLocus()).size() > 0 || - tracker.getValues(compBinding,ref.getLocus()).size() > 0 ) ) { - - List eval = tracker.getValues(evalBinding,ref.getLocus()); - List comp = tracker.getValues(compBinding,ref.getLocus()); - if ( eval.size() > 1 || comp.size() > 1 ) { - if ( noDuplicateTypes(eval) && noDuplicateTypes(comp) ) { - logger.info("Eval or Comp Rod at position " + ref.getLocus().toString() + " has multiple records. Resolving."); - evalCompPair = resolveMultipleRecords(eval,comp); - } else { - logger.warn("Eval or Comp Rod at position "+ref.getLocus().toString()+" has multiple records of the same type. 
This locus will be skipped."); - } - } else { - // if a rod is missing, explicitly create a variant context with 'missing' genotypes. Slow, but correct. - // note that if there is no eval rod there must be a comp rod, and also the reverse - VariantContext evalContext = eval.size() == 1 ? eval.get(0) : createEmptyContext(comp.get(0),evalSamples); - VariantContext compContext = comp.size() == 1 ? comp.get(0) : createEmptyContext(eval.get(0),compSamples); - evalContext = filterGenotypes(evalContext,ignoreFilters,evalJexls); - compContext = filterGenotypes(compContext,ignoreFilters,compJexls); - evalCompPair.add(new Pair(evalContext,compContext)); - } - } - - return evalCompPair; - } - - private boolean noDuplicateTypes(List vcList) { - HashSet types = new HashSet(vcList.size()); - for ( VariantContext vc : vcList ) { - VariantContext.Type type = vc.getType(); - if ( types.contains(type) ) - return false; - types.add(type); - } - - return true; - } - - /** - * The point of this method is to match up pairs of evals and comps by their type (or alternate alleles for mixed). - * Basically multiple records could exist for a site such as: - * Eval: 20 4000 A C - * Eval: 20 4000 A AC - * Comp: 20 4000 A C - * So for each eval, loop through the comps. If the types match, or for mixed types if eval alleles (non-emptily) - * intersect the comp alleles, pair them up and remove that comp records. - * Continue until we're out of evals or comps. This is n^2, but should rarely actually happen. - * - * The remaining unpaired records get paird with an empty contexts. 
So in the example above we'd get a list of: - * 1 - (20,4000,A/C | 20,4000,A/C) - * 2 - (20,4000,A/AC | Empty ) - * @param evalList - list of eval variant contexts - * @param compList - list of comp variant contexts - * @return resolved pairs of the input lists - */ - private List> resolveMultipleRecords(List evalList, List compList) { - List> resolvedPairs = new ArrayList>(evalList.size()+compList.size()); // oversized but w/e - List pairedEval = new ArrayList(evalList.size()); - for ( VariantContext eval : evalList ) { - VariantContext.Type evalType = eval.getType(); - Set evalAlleles = new HashSet(eval.getAlternateAlleles()); - VariantContext pairedComp = null; - for ( VariantContext comp : compList ) { - if ( evalType.equals(comp.getType()) ) { - pairedComp = comp; - break; - } else if ( eval.isMixed() || comp.isMixed() ) { - for ( Allele compAllele : comp.getAlternateAlleles() ) { - if ( evalAlleles.contains(compAllele) ) { - pairedComp = comp; - break; - } - } - } - } - if ( pairedComp != null ) { - compList.remove(pairedComp); - resolvedPairs.add(new Pair(filterGenotypes(eval,ignoreFilters,evalJexls),filterGenotypes(pairedComp,ignoreFilters,compJexls))); - pairedEval.add(eval); - if ( compList.size() < 1 ) - break; - } - } - evalList.removeAll(pairedEval); - for ( VariantContext unpairedEval : evalList ) { - resolvedPairs.add(new Pair(filterGenotypes(unpairedEval,ignoreFilters,evalJexls),createEmptyContext(unpairedEval,compSamples))); - } - - for ( VariantContext unpairedComp : compList ) { - resolvedPairs.add(new Pair(createEmptyContext(unpairedComp,evalSamples),filterGenotypes(unpairedComp,ignoreFilters,compJexls))); - } - - return resolvedPairs; - } - - public ConcordanceMetrics reduce(List> evalCompList, ConcordanceMetrics metrics) { - for ( Pair evalComp : evalCompList) - metrics.update(evalComp.getFirst(),evalComp.getSecond()); - return metrics; - } - - private static double repairNaN(double d) { - if ( Double.isNaN(d) ) { - return 0.0; - } - return d; 
- } - - public void onTraversalDone(ConcordanceMetrics metrics) { - // todo -- this is over 200 lines of code just to format the output and could use some serious cleanup - GATKReport report = new GATKReport(); - GATKReportTable concordanceCounts = new GATKReportTable("GenotypeConcordance_Counts","Per-sample concordance tables: comparison counts",2+GenotypeType.values().length*GenotypeType.values().length); - GATKReportTable concordanceEvalProportions = new GATKReportTable("GenotypeConcordance_EvalProportions", "Per-sample concordance tables: proportions of genotypes called in eval",2+GenotypeType.values().length*GenotypeType.values().length); - GATKReportTable concordanceCompProportions = new GATKReportTable("GenotypeConcordance_CompProportions", "Per-sample concordance tables: proportions of genotypes called in comp",2+GenotypeType.values().length*GenotypeType.values().length); - GATKReportTable concordanceSummary = new GATKReportTable("GenotypeConcordance_Summary","Per-sample summary statistics: NRS, NRD, and OGC",2); - GATKReportTable siteConcordance = new GATKReportTable("SiteConcordance_Summary","Site-level summary statistics",ConcordanceMetrics.SiteConcordanceType.values().length); - if ( moltenize ) { - concordanceCompProportions.addColumn("Sample","%s"); - concordanceCounts.addColumn("Sample","%s"); - concordanceEvalProportions.addColumn("Sample","%s"); - concordanceSummary.addColumn("Sample","%s"); - - concordanceCompProportions.addColumn("Eval_Genotype","%s"); - concordanceCounts.addColumn("Eval_Genotype","%s"); - concordanceEvalProportions.addColumn("Eval_Genotype","%s"); - concordanceSummary.addColumn("Non-Reference_Discrepancy","%.3f"); - - concordanceCompProportions.addColumn("Comp_Genotype","%s"); - concordanceCounts.addColumn("Comp_Genotype","%s"); - concordanceEvalProportions.addColumn("Comp_Genotype","%s"); - concordanceSummary.addColumn("Non-Reference_Sensitivity","%.3f"); - - concordanceCompProportions.addColumn("Proportion","%.3f"); - 
concordanceCounts.addColumn("Count","%d"); - concordanceEvalProportions.addColumn("Proportion","%.3f"); - concordanceSummary.addColumn("Overall_Genotype_Concordance","%.3f"); - - for ( Map.Entry entry : metrics.getPerSampleGenotypeConcordance().entrySet() ) { - ConcordanceMetrics.GenotypeConcordanceTable table = entry.getValue(); - for ( GenotypeType evalType : GenotypeType.values() ) { - for ( GenotypeType compType : GenotypeType.values() ) { - String rowKey = String.format("%s_%s_%s",entry.getKey(),evalType.toString(),compType.toString()); - concordanceCounts.set(rowKey,"Sample",entry.getKey()); - concordanceCounts.set(rowKey,"Eval_Genotype",evalType.toString()); - concordanceCounts.set(rowKey,"Comp_Genotype",compType.toString()); - int count = table.get(evalType, compType); - concordanceCounts.set(rowKey,"Count",count); - if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) { - concordanceEvalProportions.set(rowKey,"Sample",entry.getKey()); - concordanceEvalProportions.set(rowKey,"Eval_Genotype",evalType.toString()); - concordanceEvalProportions.set(rowKey,"Comp_Genotype",compType.toString()); - concordanceEvalProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnEvalGenotypes(evalType))); - } - if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) { - concordanceCompProportions.set(rowKey,"Sample",entry.getKey()); - concordanceCompProportions.set(rowKey,"Eval_Genotype",evalType.toString()); - concordanceCompProportions.set(rowKey,"Comp_Genotype",compType.toString()); - concordanceCompProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnCompGenotypes(compType))); - } - } - } - String mismatchKey = String.format("%s_%s",entry.getKey(),"Mismatching"); - concordanceCounts.set(mismatchKey,"Sample",entry.getKey()); - concordanceCounts.set(mismatchKey,"Eval_Genotype","Mismatching_Alleles"); - 
concordanceCounts.set(mismatchKey,"Comp_Genotype","Mismatching_Alleles"); - concordanceEvalProportions.set(mismatchKey,"Sample",entry.getKey()); - concordanceEvalProportions.set(mismatchKey,"Eval_Genotype","Mismatching_Alleles"); - concordanceEvalProportions.set(mismatchKey,"Comp_Genotype","Mismatching_Alleles"); - concordanceCompProportions.set(mismatchKey,"Sample",entry.getKey()); - concordanceCompProportions.set(mismatchKey,"Eval_Genotype","Mismatching_Alleles"); - concordanceCompProportions.set(mismatchKey,"Comp_Genotype","Mismatching_Alleles"); - concordanceEvalProportions.set(mismatchKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes())); - concordanceCompProportions.set(mismatchKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes())); - concordanceCounts.set(mismatchKey,"Count",table.getnMismatchingAlt()); - } - - String sampleKey = "ALL"; - ConcordanceMetrics.GenotypeConcordanceTable table = metrics.getOverallGenotypeConcordance(); - for ( GenotypeType evalType : GenotypeType.values() ) { - for ( GenotypeType compType : GenotypeType.values() ) { - String rowKey = String.format("%s_%s_%s",sampleKey,evalType.toString(),compType.toString()); - concordanceCounts.set(rowKey,"Sample",sampleKey); - concordanceCounts.set(rowKey,"Eval_Genotype",evalType.toString()); - concordanceCounts.set(rowKey,"Comp_Genotype",compType.toString()); - int count = table.get(evalType, compType); - concordanceCounts.set(rowKey,"Count",count); - if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) { - concordanceEvalProportions.set(rowKey,"Sample",sampleKey); - concordanceEvalProportions.set(rowKey,"Eval_Genotype",evalType.toString()); - concordanceEvalProportions.set(rowKey,"Comp_Genotype",compType.toString()); - concordanceEvalProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnEvalGenotypes(evalType))); - } - if ( 
compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) { - concordanceCompProportions.set(rowKey,"Sample",sampleKey); - concordanceCompProportions.set(rowKey,"Eval_Genotype",evalType.toString()); - concordanceCompProportions.set(rowKey,"Comp_Genotype",compType.toString()); - concordanceCompProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnCompGenotypes(compType))); - } - } - } - String rowKey = String.format("%s_%s",sampleKey,"Mismatching"); - concordanceCounts.set(rowKey,"Sample",sampleKey); - concordanceCounts.set(rowKey,"Eval_Genotype","Mismatching_Alleles"); - concordanceCounts.set(rowKey,"Comp_Genotype","Mismatching_Alleles"); - concordanceEvalProportions.set(rowKey,"Sample",sampleKey); - concordanceEvalProportions.set(rowKey,"Eval_Genotype","Mismatching_Alleles"); - concordanceEvalProportions.set(rowKey,"Comp_Genotype","Mismatching_Alleles"); - concordanceCompProportions.set(rowKey,"Sample",sampleKey); - concordanceCompProportions.set(rowKey,"Eval_Genotype","Mismatching_Alleles"); - concordanceCompProportions.set(rowKey,"Comp_Genotype","Mismatching_Alleles"); - concordanceEvalProportions.set(rowKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes())); - concordanceCompProportions.set(rowKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes())); - concordanceCounts.set(rowKey,"Count",table.getnMismatchingAlt()); - - for ( Map.Entry nrsEntry : metrics.getPerSampleNRS().entrySet() ) { - concordanceSummary.set(nrsEntry.getKey(),"Sample",nrsEntry.getKey()); - concordanceSummary.set(nrsEntry.getKey(),"Non-Reference_Sensitivity",nrsEntry.getValue()); - } - for ( Map.Entry nrdEntry : metrics.getPerSampleNRD().entrySet() ) { - concordanceSummary.set(nrdEntry.getKey(),"Non-Reference_Discrepancy",nrdEntry.getValue()); - } - for ( Map.Entry ogcEntry : metrics.getPerSampleOGC().entrySet() ) { - 
concordanceSummary.set(ogcEntry.getKey(),"Overall_Genotype_Concordance",ogcEntry.getValue()); - } - concordanceSummary.set("ALL_NRS_NRD","Sample","ALL"); - concordanceSummary.set("ALL_NRS_NRD","Non-Reference_Sensitivity",metrics.getOverallNRS()); - concordanceSummary.set("ALL_NRS_NRD","Non-Reference_Discrepancy",metrics.getOverallNRD()); - concordanceSummary.set("ALL_NRS_NRD","Overall_Genotype_Concordance",metrics.getOverallOGC()); - - - for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { - siteConcordance.addColumn(type.toString(),"%d"); - } - - for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { - siteConcordance.set("Comparison",type.toString(),metrics.getOverallSiteConcordance().get(type)); - } - - } else { - concordanceCompProportions.addColumn("Sample","%s"); - concordanceCounts.addColumn("Sample","%s"); - concordanceEvalProportions.addColumn("Sample","%s"); - concordanceSummary.addColumn("Sample","%s"); - for ( GenotypeType evalType : GenotypeType.values() ) { - for ( GenotypeType compType : GenotypeType.values() ) { - String colKey = String.format("%s_%s", evalType.toString(), compType.toString()); - concordanceCounts.addColumn(colKey,"%d"); - if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) - concordanceEvalProportions.addColumn(colKey,"%.3f"); - if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) - concordanceCompProportions.addColumn(colKey,"%.3f"); - } - } - concordanceEvalProportions.addColumn("Mismatching_Alleles","%.3f"); - concordanceCompProportions.addColumn("Mismatching_Alleles","%.3f"); - concordanceCounts.addColumn("Mismatching_Alleles","%d"); - concordanceSummary.addColumn("Non-Reference Sensitivity","%.3f"); - concordanceSummary.addColumn("Non-Reference Discrepancy","%.3f"); - 
concordanceSummary.addColumn("Overall_Genotype_Concordance","%.3f"); - for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { - siteConcordance.addColumn(type.toString(),"%d"); - } - - for ( Map.Entry entry : metrics.getPerSampleGenotypeConcordance().entrySet() ) { - ConcordanceMetrics.GenotypeConcordanceTable table = entry.getValue(); - concordanceEvalProportions.set(entry.getKey(),"Sample",entry.getKey()); - concordanceCompProportions.set(entry.getKey(),"Sample",entry.getKey()); - concordanceCounts.set(entry.getKey(),"Sample",entry.getKey()); - for ( GenotypeType evalType : GenotypeType.values() ) { - for ( GenotypeType compType : GenotypeType.values() ) { - String colKey = String.format("%s_%s",evalType.toString(),compType.toString()); - int count = table.get(evalType, compType); - concordanceCounts.set(entry.getKey(),colKey,count); - if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) - concordanceEvalProportions.set(entry.getKey(),colKey,repairNaN(( (double) count)/table.getnEvalGenotypes(evalType))); - if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) - concordanceCompProportions.set(entry.getKey(),colKey,repairNaN(( (double) count)/table.getnCompGenotypes(compType))); - } - } - concordanceEvalProportions.set(entry.getKey(),"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes())); - concordanceCompProportions.set(entry.getKey(),"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes())); - concordanceCounts.set(entry.getKey(),"Mismatching_Alleles",table.getnMismatchingAlt()); - } - - String rowKey = "ALL"; - concordanceCompProportions.set(rowKey,"Sample",rowKey); - concordanceEvalProportions.set(rowKey,"Sample",rowKey); - concordanceCounts.set(rowKey,"Sample",rowKey); - 
ConcordanceMetrics.GenotypeConcordanceTable table = metrics.getOverallGenotypeConcordance(); - for ( GenotypeType evalType : GenotypeType.values() ) { - for ( GenotypeType compType : GenotypeType.values() ) { - String colKey = String.format("%s_%s",evalType.toString(),compType.toString()); - int count = table.get(evalType,compType); - concordanceCounts.set(rowKey,colKey,count); - if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) - concordanceEvalProportions.set(rowKey,colKey,repairNaN(( (double) count)/table.getnEvalGenotypes(evalType))); - if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) - concordanceCompProportions.set(rowKey,colKey,repairNaN(( (double) count)/table.getnCompGenotypes(compType))); - } - } - concordanceEvalProportions.set(rowKey,"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes())); - concordanceCompProportions.set(rowKey,"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes())); - concordanceCounts.set(rowKey,"Mismatching_Alleles",table.getnMismatchingAlt()); - - for ( Map.Entry nrsEntry : metrics.getPerSampleNRS().entrySet() ) { - concordanceSummary.set(nrsEntry.getKey(),"Sample",nrsEntry.getKey()); - concordanceSummary.set(nrsEntry.getKey(),"Non-Reference Sensitivity",nrsEntry.getValue()); - } - for ( Map.Entry nrdEntry : metrics.getPerSampleNRD().entrySet() ) { - concordanceSummary.set(nrdEntry.getKey(),"Non-Reference Discrepancy",nrdEntry.getValue()); - } - for ( Map.Entry ogcEntry : metrics.getPerSampleOGC().entrySet() ) { - concordanceSummary.set(ogcEntry.getKey(),"Overall_Genotype_Concordance",ogcEntry.getValue()); - } - concordanceSummary.set("ALL","Sample","ALL"); - concordanceSummary.set("ALL","Non-Reference Sensitivity",metrics.getOverallNRS()); - concordanceSummary.set("ALL","Non-Reference 
Discrepancy",metrics.getOverallNRD()); - concordanceSummary.set("ALL","Overall_Genotype_Concordance",metrics.getOverallOGC()); - - for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { - siteConcordance.set("Comparison",type.toString(),metrics.getOverallSiteConcordance().get(type)); - } - } - - report.addTable(concordanceCompProportions); - report.addTable(concordanceEvalProportions); - report.addTable(concordanceCounts); - report.addTable(concordanceSummary); - report.addTable(siteConcordance); - - report.print(out); - } - - public VariantContext createEmptyContext(VariantContext other, List samples) { - VariantContextBuilder builder = new VariantContextBuilder(); - // set the alleles to be the same - builder.alleles(other.getAlleles()); - builder.loc(other.getChr(),other.getStart(),other.getEnd()); - // set all genotypes to empty - List genotypes = new ArrayList(samples.size()); - for ( String sample : samples ) - genotypes.add(GenotypeBuilder.create(sample, new ArrayList(0))); - builder.genotypes(genotypes); - return builder.make(); - } - - public VariantContext filterGenotypes(VariantContext context, boolean ignoreSiteFilter, List exps) { - if ( ! 
context.isFiltered() || ignoreSiteFilter ) { - List filteredGenotypes = new ArrayList(context.getNSamples()); - for ( Genotype g : context.getGenotypes() ) { - Map matchMap = VariantContextUtils.match(context, g, exps); - boolean filtered = false; - for ( Boolean b : matchMap.values() ) { - if ( b ) { - filtered = true; - break; - } - } - if ( filtered ) { - filteredGenotypes.add(GenotypeBuilder.create(g.getSampleName(),Arrays.asList(Allele.NO_CALL,Allele.NO_CALL),g.getExtendedAttributes())); - } else { - filteredGenotypes.add(g); - } - } - VariantContextBuilder builder = new VariantContextBuilder(context); - builder.genotypes(filteredGenotypes); - return builder.make(); - } - - VariantContextBuilder builder = new VariantContextBuilder(); - builder.alleles(Arrays.asList(context.getReference())); - builder.loc(context.getChr(),context.getStart(),context.getEnd()); - List newGeno = new ArrayList(context.getNSamples()); - for ( Genotype g : context.getGenotypes().iterateInSampleNameOrder() ) { - newGeno.add(GenotypeBuilder.create(g.getSampleName(),new ArrayList())); - } - builder.genotypes(newGeno); - return builder.make(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java deleted file mode 100644 index 9168d17f0..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java +++ /dev/null @@ -1,299 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished 
to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.vcf.VCFHeader; -import 
org.broadinstitute.variant.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; - -import java.util.*; - -/** - * Left-aligns indels from a variants file. - * - *

- * LeftAlignAndTrimVariants is a tool that takes a VCF file and left-aligns the indels inside it. The same indel can often be - * placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to - * place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. - * Note that this tool cannot handle anything other than bi-allelic, simple indels. Complex events are written out unchanged. - * Optionally, the tool will also trim common bases from indels, leaving them with a minimum representation. - * - *

Input

- *

- * A variant set to left-align and trim. - *

- * - *

Output

- *

- * A left-aligned VCF. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T LeftAlignAndTrimVariants \
- *   --variant input.vcf \
- *   -o output.vcf
- * 
- * - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) -@Reference(window=@Window(start=-200,stop=200)) // WARNING: if this changes,MAX_INDEL_LENGTH needs to change as well! -public class LeftAlignAndTrimVariants extends RodWalker { - - @ArgumentCollection - protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - - /** - * If this argument is set, bases common to all alleles will be removed, leaving only their minimal representation. - */ - @Argument(fullName="trimAlleles", shortName="trim", doc="Trim alleles to remove bases common to all of them", required=false) - protected boolean trimAlleles = false; - - /** - * If this argument is set, split multiallelic records and left-align individual alleles. - * If this argument is not set, multiallelic records are not attempted to left-align and will be copied as is. - */ - @Argument(fullName="splitMultiallelics", shortName="split", doc="Split multiallelic records and left-align individual alleles", required=false) - protected boolean splitMultiallelics = false; - - - @Output(doc="File to which variants should be written") - protected VariantContextWriter baseWriter = null; - - private VariantContextWriter writer; - - private static final int MAX_INDEL_LENGTH = 200; // needs to match reference window size! 
- public void initialize() { - String trackName = variantCollection.variants.getName(); - Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); - Map vcfHeaders = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); - - Set headerLines = vcfHeaders.get(trackName).getMetaDataInSortedOrder(); - baseWriter.writeHeader(new VCFHeader(headerLines, samples)); - - writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, 200); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); - - int changedSites = 0; - for ( final VariantContext vc : VCs ) { - // split first into biallelics, and optionally trim alleles to minimal representation - Pair result = new Pair(vc,0); // default value - if (splitMultiallelics) { - final List vcList = GATKVariantContextUtils.splitVariantContextToBiallelics( vc); - for (final VariantContext biallelicVC: vcList) { - final VariantContext v = (trimAlleles ? GATKVariantContextUtils.trimAlleles(biallelicVC,true,true):biallelicVC); - result = alignAndWrite(v, ref); - writer.add(result.first); - changedSites += result.second; - } - } - else { - if (trimAlleles) - result = alignAndWrite(GATKVariantContextUtils.trimAlleles(vc,true,true), ref); - else - result = alignAndWrite(vc,ref); - writer.add(result.first); - changedSites += result.second; - - } - - } - - return changedSites; - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - public void onTraversalDone(Integer result) { - writer.close(); - System.out.println(result + " variants were aligned"); - } - - /** - * Main routine workhorse. By definitio, it will only take biallelic vc's. Splitting into multiple alleles has to be - * handled by calling routine. 
- * @param vc Input VC with variants to left align - * @param ref Reference context - * @return # of records left-aligned (0 or 1) and new VC. - */ - @Requires({"vc != null","ref != null", "vc.isBiallelic() == true","ref.getBases().length>=2*MAX_INDEL_LENGTH+1"}) - @Ensures({"result != null","result.first != null", "result.second >=0"}) - protected static Pair alignAndWrite(final VariantContext vc, final ReferenceContext ref) { - - final Pair retValue = new Pair(vc,0); - if (!vc.isIndel() || vc.isComplexIndel() ) { - return retValue; - } - - // get the indel length - final int indelLength; - if ( vc.isSimpleDeletion() ) - indelLength = vc.getReference().length() - 1; - else - indelLength = vc.getAlternateAllele(0).length() - 1; - - if ( indelLength > MAX_INDEL_LENGTH ) - return retValue; - - if (vc.getReference().getBases()[0] != vc.getAlternateAllele(0).getBases()[0]) - return retValue; - - final byte[] refSeq = ref.getBases(); - - // create an indel haplotype. - // - final int originalIndex = vc.getStart() - ref.getWindow().getStart() + 1; - if (originalIndex < 0 || originalIndex >= ref.getBases().length) - return retValue; - - final byte[] originalIndel = makeHaplotype(vc, refSeq, originalIndex, indelLength); - - // create a CIGAR string to represent the event - ArrayList elements = new ArrayList(); - elements.add(new CigarElement(originalIndex, CigarOperator.M)); - elements.add(new CigarElement(indelLength, vc.isSimpleDeletion() ? 
CigarOperator.D : CigarOperator.I)); - elements.add(new CigarElement(refSeq.length - originalIndex, CigarOperator.M)); - Cigar originalCigar = new Cigar(elements); - - // left align the CIGAR - Cigar newCigar = AlignmentUtils.leftAlignIndel(originalCigar, refSeq, originalIndel, 0, 0, true); - - // update if necessary and write - if ( !newCigar.equals(originalCigar) && newCigar.numCigarElements() > 1 ) { - int difference = originalIndex - newCigar.getCigarElement(0).getLength(); - VariantContext newVC = new VariantContextBuilder(vc).start(vc.getStart()-difference).stop(vc.getEnd()-difference).make(); - //System.out.println("Moving record from " + vc.getChr()+":"+vc.getStart() + " to " + vc.getChr()+":"+(vc.getStart()-difference)); - - final int indelIndex = originalIndex-difference; - final byte[] newBases = new byte[indelLength + 1]; - newBases[0] = refSeq[indelIndex-1]; - System.arraycopy((vc.isSimpleDeletion() ? refSeq : originalIndel), indelIndex, newBases, 1, indelLength); - final Allele newAllele = Allele.create(newBases, vc.isSimpleDeletion()); - newVC = updateAllele(newVC, newAllele); - // overwrite default return value with new left-aligned VC - retValue.first = newVC; - retValue.second = 1; - - } - return retValue; - } - - /** - * Make a haplotype from a given alt allele, using bases in input reference, index of an input reference - * @param vc Input VC - will use only alt allele from it - * @param ref Ref bases - * @param indexOfRef Index in ref where to create indel - * @param indelLength Indel length - * @return - */ - @Requires({"vc != null","ref != null", "indexOfRef +indelLength < ref.length", "vc.getNAlleles() == 2"}) - @Ensures("result != null") - private static byte[] makeHaplotype(VariantContext vc, byte[] ref, int indexOfRef, int indelLength) { - byte[] hap = new byte[ref.length + (indelLength * (vc.isSimpleDeletion() ? 
-1 : 1))]; - - // add the bases before the indel - System.arraycopy(ref, 0, hap, 0, indexOfRef); - int currentPos = indexOfRef; - - // take care of the indel - if ( vc.isSimpleDeletion() ) { - indexOfRef += indelLength; - } else { - System.arraycopy(vc.getAlternateAllele(0).getBases(), 1, hap, currentPos, indelLength); - currentPos += indelLength; - } - - // add the bases after the indel - System.arraycopy(ref, indexOfRef, hap, currentPos, ref.length - indexOfRef); - - return hap; - } - - public static VariantContext updateAllele(final VariantContext vc, final Allele newAllele) { - // create a mapping from original allele to new allele - HashMap alleleMap = new HashMap(vc.getAlleles().size()); - if ( newAllele.isReference() ) { - alleleMap.put(vc.getReference(), newAllele); - alleleMap.put(vc.getAlternateAllele(0), Allele.create(newAllele.getBases()[0], false)); - } else { - alleleMap.put(vc.getReference(), Allele.create(newAllele.getBases()[0], true)); - alleleMap.put(vc.getAlternateAllele(0), newAllele); - } - - // create new Genotype objects - GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); - for ( final Genotype genotype : vc.getGenotypes() ) { - List newAlleles = new ArrayList(); - for ( Allele allele : genotype.getAlleles() ) { - Allele newA = alleleMap.get(allele); - if ( newA == null ) - newA = Allele.NO_CALL; - newAlleles.add(newA); - } - newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make()); - } - - return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/tools/CatVariants.java b/public/java/src/org/broadinstitute/sting/tools/CatVariants.java deleted file mode 100644 index 8e5078f1f..000000000 --- a/public/java/src/org/broadinstitute/sting/tools/CatVariants.java +++ /dev/null @@ -1,290 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* 
obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.tools; - -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.reference.ReferenceSequenceFileFactory; -import org.apache.log4j.BasicConfigurator; -import org.apache.log4j.Level; -import org.broad.tribble.AbstractFeatureReader; -import org.broad.tribble.FeatureReader; -import org.broad.tribble.index.IndexCreator; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.CommandLineProgram; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.variant.bcf2.BCF2Codec; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.variant.vcf.VCFCodec; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.writer.Options; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; - -import java.io.*; -import java.util.*; - - -/** - * - * Concatenates VCF files of non-overlapped genome intervals, all with the same set of samples - * - *

- * The main purpose of this tool is to speed up the gather function when using scatter-gather parallelization. - * This tool concatenates the scattered output VCF files. It assumes that: - * - All the input VCFs (or BCFs) contain the same samples in the same order. - * - The variants in each input file are from non-overlapping (scattered) intervals. - * - * When the input files are already sorted based on the intervals start positions, use -assumeSorted. - * - * Note: Currently the tool is more efficient when working with VCFs; we will work to make it as efficient for BCFs. - * - *

- * - *

Input

- *

- * One or more variant sets to combine. They should be of non-overlapping genome intervals and with the same samples (in the same order). - * The input files should be 'name.vcf' or 'name.VCF' or 'name.bcf' or 'name.BCF'. - * If the files are ordered according to the appearance of intervals in the ref genome, then one can use the -assumeSorted flag. - *

- * - *

Output

- *

- * A combined VCF. The output file should be 'name.vcf' or 'name.VCF'. - * <\p> - * - *

Important note

- *

This is a command-line utility that bypasses the GATK engine. As a result, the command-line you must use to - * invoke it is a little different from other GATK tools (see example below), and it does not accept any of the - * classic "CommandLineGATK" arguments.

- * - *

Example

- *
- * java -cp GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants \
- *    -R ref.fasta \
- *    -V input1.vcf \
- *    -V input2.vcf \
- *    -out output.vcf \
- *    -assumeSorted
- * 
- * - * @author Ami Levy Moonshine - * @since Jan 2012 - */ - -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP ) -public class CatVariants extends CommandLineProgram { - // setup the logging system, used by some codecs - private static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getRootLogger(); - - @Input(fullName = "reference", shortName = "R", doc = "genome reference file .fasta", required = true) - private File refFile = null; - - /** - * The VCF or BCF files to merge together - * - * CatVariants can take any number of -V arguments on the command line. Each -V argument - * will be included in the final merged output VCF. The order of arguments does not matter, but it runs more - * efficiently if they are sorted based on the intervals and the assumeSorted argument is used. - * - */ - @Input(fullName="variant", shortName="V", doc="Input VCF file/s named .vcf or .bcf", required = true) - private List variant = null; - - @Output(fullName = "outputFile", shortName = "out", doc = "output file name .vcf or .bcf", required = true) - private File outputFile = null; - - @Argument(fullName = "assumeSorted", shortName = "assumeSorted", doc = "assumeSorted should be true if he input files are already sorted (based on the position of the variants", required = false) - private Boolean assumeSorted = false; - - @Argument(fullName = "variant_index_type", doc = "which type of IndexCreator to use for VCF/BCF indices", required = false) - private GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE; - - @Argument(fullName = "variant_index_parameter", doc = "the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator", required = false) - private Integer variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; - - /* - * print usage information - */ - private static void printUsage() { - System.err.println("Usage: java -cp dist/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants [sorted 
(optional)]"); - System.err.println(" The input files can be of type: VCF (ends in .vcf or .VCF)"); - System.err.println(" BCF2 (ends in .bcf or .BCF)"); - System.err.println(" Output file must be vcf or bcf file (.vcf or .bcf)"); - System.err.println(" if the input files are already sorted, the last argument can indicate that"); - } - - @Override - protected int execute() throws Exception { - //if(help){ - // printUsage(); - // return 1; - //} - - BasicConfigurator.configure(); - logger.setLevel(Level.INFO); - - final ReferenceSequenceFile ref; - try { - ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); - } catch ( Exception e ) { - throw new UserException("Couldn't load provided reference sequence file " + refFile, e); - } - - Comparator> positionComparator = new PositionComparator(); - - - //PriorityQueue>> queue = - // new PriorityQueue>>(2000, comparator); - Queue> priorityQueue; - if(assumeSorted) - priorityQueue = new LinkedList>(); - else - priorityQueue = new PriorityQueue>(10000, positionComparator); - - Iterator files = variant.iterator(); - File file; - while (files.hasNext()) { - file = files.next(); - if (!(file.getName().endsWith(".vcf") || file.getName().endsWith(".VCF") || file.getName().endsWith(".bcf") || file.getName().endsWith(".BCF"))){ - System.err.println("File " + file.getAbsolutePath() + " should be .vcf or .bcf"); - printUsage(); - return 1; - } - if (assumeSorted){ - priorityQueue.add(new Pair(0,file)); - } - else{ - if (!file.exists()) { - throw new UserException(String.format("File %s doesn't exist",file.getAbsolutePath())); - } - FeatureReader reader; - boolean useVCF = (file.getName().endsWith(".vcf") || file.getName().endsWith(".VCF")); - if(useVCF) - reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false); - else - reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new BCF2Codec(), false); - Iterator it = reader.iterator(); - if(!it.hasNext()){ - 
System.err.println(String.format("File %s is empty. This file will be ignored",file.getAbsolutePath())); - continue; - } - VariantContext vc = it.next(); - int firstPosition = vc.getStart(); - reader.close(); - //queue.add(new Pair>(firstPosition,reader)); - priorityQueue.add(new Pair(firstPosition,file)); - } - - } - - if (!(outputFile.getName().endsWith(".vcf") || outputFile.getName().endsWith(".VCF"))){ - throw new UserException(String.format("Output file %s should be .vcf", outputFile)); - } - - FileOutputStream outputStream = new FileOutputStream(outputFile); - EnumSet options = EnumSet.of(Options.INDEX_ON_THE_FLY); - final IndexCreator idxCreator = GATKVCFUtils.getIndexCreator(variant_index_type, variant_index_parameter, outputFile); - final VariantContextWriter outputWriter = VariantContextWriterFactory.create(outputFile, outputStream, ref.getSequenceDictionary(), idxCreator, options); - - boolean firstFile = true; - int count =0; - //while(!queue.isEmpty()){ - while(!priorityQueue.isEmpty() ){ - count++; - //FeatureReader reader = queue.remove().getSecond(); - file = priorityQueue.remove().getSecond(); - if (!file.exists()) { - throw new UserException(String.format("File %s doesn't exist",file.getAbsolutePath())); - } - FeatureReader reader; - boolean useVCF = (file.getName().endsWith(".vcf") || file.getName().endsWith(".VCF")); - if(useVCF) - reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false); - else - reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new BCF2Codec(), false); - - if(count%10 ==0) - System.out.print(count); - else - System.out.print("."); - if (firstFile){ - VCFHeader header = (VCFHeader)reader.getHeader(); - outputWriter.writeHeader(header); - firstFile = false; - } - - Iterator it = reader.iterator(); - - while (it.hasNext()){ - VariantContext vc = it.next(); - outputWriter.add(vc); - } - - reader.close(); - - } - System.out.println(); - - outputStream.close(); - 
outputWriter.close(); - - return 0; - } - - - public static void main(String[] args){ - try { - CatVariants instance = new CatVariants(); - start(instance, args); - System.exit(CommandLineProgram.result); - } catch ( UserException e ) { - printUsage(); - exitSystemWithUserError(e); - } catch ( Exception e ) { - exitSystemWithError(e); - } - } - - private static class PositionComparator implements Comparator> { - - @Override - public int compare(Pair p1, Pair p2) { - int startPositionP1 = p1.getFirst(); - int startPositionP2 = p2.getFirst(); - if (startPositionP1 == startPositionP2) - return 0; - return startPositionP1 < startPositionP2 ? -1 : 1 ; - } - } - -} diff --git a/public/java/src/org/broadinstitute/sting/utils/DeprecatedToolChecks.java b/public/java/src/org/broadinstitute/sting/utils/DeprecatedToolChecks.java deleted file mode 100644 index 9823e524a..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/DeprecatedToolChecks.java +++ /dev/null @@ -1,95 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils; - -import it.unimi.dsi.fastutil.objects.Object2ObjectMap; -import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap; - -import java.util.*; - -/** - * Utility class for handling deprecated tools gracefully - * - * @author vdauwera - * @since 3/11/13 - */ -public class DeprecatedToolChecks { - - // Mapping from walker name to major version number where the walker first disappeared and optional replacement options - private static Object2ObjectMap deprecatedGATKWalkers = new Object2ObjectOpenHashMap(); - static { - // Indicate recommended replacement in parentheses if applicable - deprecatedGATKWalkers.put("CountCovariates", "2.0 (use BaseRecalibrator instead; see documentation for usage)"); - deprecatedGATKWalkers.put("TableRecalibration", "2.0 (use PrintReads with -BQSR instead; see documentation for usage)"); - deprecatedGATKWalkers.put("AlignmentWalker", "2.2 (no replacement)"); - deprecatedGATKWalkers.put("CountBestAlignments", "2.2 (no replacement)"); - deprecatedGATKWalkers.put("SomaticIndelDetector", "2.0 (replaced by the standalone tool Indelocator; see Cancer Tools documentation)"); - } - - // Mapping from walker name to major version number where the walker first disappeared and optional replacement options - private static Object2ObjectMap deprecatedGATKAnnotations = new Object2ObjectOpenHashMap(); - static { - // Same comments as for walkers - deprecatedGATKAnnotations.put("DepthOfCoverage", "2.4 (renamed to Coverage)"); - } - - /** - * Utility method to check whether a given walker has been deprecated in a previous GATK release - * - * @param walkerName the walker class name (not the full package) to check - */ - public static boolean 
isDeprecatedWalker(final String walkerName) { - return deprecatedGATKWalkers.containsKey(walkerName); - } - - /** - * Utility method to check whether a given annotation has been deprecated in a previous GATK release - * - * @param annotationName the annotation class name (not the full package) to check - */ - public static boolean isDeprecatedAnnotation(final String annotationName) { - return deprecatedGATKAnnotations.containsKey(annotationName); - } - - /** - * Utility method to pull up the version number at which a walker was deprecated and the suggested replacement, if any - * - * @param walkerName the walker class name (not the full package) to check - */ - public static String getWalkerDeprecationInfo(final String walkerName) { - return deprecatedGATKWalkers.get(walkerName).toString(); - } - - /** - * Utility method to pull up the version number at which an annotation was deprecated and the suggested replacement, if any - * - * @param annotationName the annotation class name (not the full package) to check - */ - public static String getAnnotationDeprecationInfo(final String annotationName) { - return deprecatedGATKAnnotations.get(annotationName).toString(); - } - -} diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java deleted file mode 100644 index 82c9fe751..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ /dev/null @@ -1,1518 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The 
above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.commons.math.distribution.ExponentialDistribution; -import org.apache.commons.math.distribution.ExponentialDistributionImpl; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.math.BigDecimal; -import java.util.*; - -/** - * MathUtils is a static class (no instantiation allowed!) with some useful math methods. - * - * @author Kiran Garimella - */ -public class MathUtils { - - /** - * Private constructor. No instantiating this class! 
- */ - private MathUtils() { - } - - public static final double[] log10Cache; - public static final double[] log10FactorialCache; - private static final double[] jacobianLogTable; - private static final double JACOBIAN_LOG_TABLE_STEP = 0.0001; - private static final double JACOBIAN_LOG_TABLE_INV_STEP = 1.0 / JACOBIAN_LOG_TABLE_STEP; - private static final double MAX_JACOBIAN_TOLERANCE = 8.0; - private static final int JACOBIAN_LOG_TABLE_SIZE = (int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1; - private static final int MAXN = 70_000; - private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients - - /** - * The smallest log10 value we'll emit from normalizeFromLog10 and other functions - * where the real-space value is 0.0. - */ - public static final double LOG10_P_OF_ZERO = -1000000.0; - public static final double FAIR_BINOMIAL_PROB_LOG10_0_5 = Math.log10(0.5); - public static final double LOG_ONE_HALF = -Math.log10(2.0); - public static final double LOG_ONE_THIRD = -Math.log10(3.0); - private static final double NATURAL_LOG_OF_TEN = Math.log(10.0); - private static final double SQUARE_ROOT_OF_TWO_TIMES_PI = Math.sqrt(2.0 * Math.PI); - - static { - log10Cache = new double[LOG10_CACHE_SIZE]; - log10FactorialCache = new double[LOG10_CACHE_SIZE]; - jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; - - log10Cache[0] = Double.NEGATIVE_INFINITY; - log10FactorialCache[0] = 0.0; - for (int k = 1; k < LOG10_CACHE_SIZE; k++) { - log10Cache[k] = Math.log10(k); - log10FactorialCache[k] = log10FactorialCache[k-1] + log10Cache[k]; - } - - for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { - jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP)); - - } - } - - /** - * Get a random int between min and max (inclusive) using the global GATK random number generator - * - * @param min lower bound of the range - * @param max upper bound of the range - 
* @return a random int >= min and <= max - */ - public static int randomIntegerInRange( final int min, final int max ) { - return GenomeAnalysisEngine.getRandomGenerator().nextInt(max - min + 1) + min; - } - - // A fast implementation of the Math.round() method. This method does not perform - // under/overflow checking, so this shouldn't be used in the general case (but is fine - // if one is already make those checks before calling in to the rounding). - public static int fastRound(final double d) { - return (d > 0.0) ? (int) (d + 0.5d) : (int) (d - 0.5d); - } - - public static double approximateLog10SumLog10(final double[] vals) { - return approximateLog10SumLog10(vals, vals.length); - } - - public static double approximateLog10SumLog10(final double[] vals, final int endIndex) { - - final int maxElementIndex = MathUtils.maxElementIndex(vals, endIndex); - double approxSum = vals[maxElementIndex]; - - for (int i = 0; i < endIndex; i++) { - if (i == maxElementIndex || vals[i] == Double.NEGATIVE_INFINITY) - continue; - - final double diff = approxSum - vals[i]; - if (diff < MathUtils.MAX_JACOBIAN_TOLERANCE) { - // See notes from the 2-inout implementation below - final int ind = fastRound(diff * MathUtils.JACOBIAN_LOG_TABLE_INV_STEP); // hard rounding - approxSum += MathUtils.jacobianLogTable[ind]; - } - } - - return approxSum; - } - - public static double approximateLog10SumLog10(final double a, final double b, final double c) { - return approximateLog10SumLog10(a, approximateLog10SumLog10(b, c)); - } - - public static double approximateLog10SumLog10(double small, double big) { - // make sure small is really the smaller value - if (small > big) { - final double t = big; - big = small; - small = t; - } - - if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY) - return big; - - final double diff = big - small; - if (diff >= MathUtils.MAX_JACOBIAN_TOLERANCE) - return big; - - // OK, so |y-x| < tol: we use the following identity then: - // we need to 
compute log10(10^x + 10^y) - // By Jacobian logarithm identity, this is equal to - // max(x,y) + log10(1+10^-abs(x-y)) - // we compute the second term as a table lookup with integer quantization - // we have pre-stored correction for 0,0.1,0.2,... 10.0 - final int ind = fastRound(diff * MathUtils.JACOBIAN_LOG_TABLE_INV_STEP); // hard rounding - return big + MathUtils.jacobianLogTable[ind]; - } - - public static double sum(final double[] values) { - double s = 0.0; - for (double v : values) - s += v; - return s; - } - - public static long sum(final int[] x) { - long total = 0; - for (int v : x) - total += v; - return total; - } - - public static int sum(final byte[] x) { - int total = 0; - for (byte v : x) - total += (int)v; - return total; - } - - public static double percentage(int x, int base) { - return (base > 0 ? ((double) x / (double) base) * 100.0 : 0); - } - - public static double ratio(final int num, final int denom) { - if ( denom > 0 ) { - return ((double) num)/denom; - } else { - if ( num == 0 && denom == 0) { - return 0.0; - } else { - throw new ReviewedStingException(String.format("The denominator of a ratio cannot be zero or less than zero: %d/%d",num,denom)); - } - } - } - - public static double ratio(final long num, final long denom) { - if ( denom > 0L ) { - return ((double) num)/denom; - } else { - if ( num == 0L && denom == 0L ) { - return 0.0; - } else { - throw new ReviewedStingException(String.format("The denominator of a ratio cannot be zero or less than zero: %d/%d",num,denom)); - } - } - } - - /** - * Converts a real space array of numbers (typically probabilities) into a log10 array - * - * @param prRealSpace - * @return - */ - public static double[] toLog10(final double[] prRealSpace) { - double[] log10s = new double[prRealSpace.length]; - for (int i = 0; i < prRealSpace.length; i++) { - log10s[i] = Math.log10(prRealSpace[i]); - } - return log10s; - } - - public static double log10sumLog10(final double[] log10p, final int start) { - 
return log10sumLog10(log10p, start, log10p.length); - } - - public static double log10sumLog10(final double[] log10p,final int start,final int finish) { - double sum = 0.0; - - double maxValue = arrayMax(log10p, finish); - if(maxValue == Double.NEGATIVE_INFINITY) - return maxValue; - - for (int i = start; i < finish; i++) { - if ( Double.isNaN(log10p[i]) || log10p[i] == Double.POSITIVE_INFINITY ) { - throw new IllegalArgumentException("log10p: Values must be non-infinite and non-NAN"); - } - sum += Math.pow(10.0, log10p[i] - maxValue); - } - - return Math.log10(sum) + maxValue; - } - - public static double sumLog10(final double[] log10values) { - return Math.pow(10.0, log10sumLog10(log10values)); - } - - public static double log10sumLog10(final double[] log10values) { - return log10sumLog10(log10values, 0); - } - - public static boolean wellFormedDouble(final double val) { - return !Double.isInfinite(val) && !Double.isNaN(val); - } - - public static double bound(final double value, final double minBoundary, final double maxBoundary) { - return Math.max(Math.min(value, maxBoundary), minBoundary); - } - - public static boolean isBounded(final double val, final double lower, final double upper) { - return val >= lower && val <= upper; - } - - public static boolean isPositive(final double val) { - return !isNegativeOrZero(val); - } - - public static boolean isPositiveOrZero(final double val) { - return isBounded(val, 0.0, Double.POSITIVE_INFINITY); - } - - public static boolean isNegativeOrZero(final double val) { - return isBounded(val, Double.NEGATIVE_INFINITY, 0.0); - } - - public static boolean isNegative(final double val) { - return !isPositiveOrZero(val); - } - - /** - * Compares double values for equality (within 1e-6), or inequality. - * - * @param a the first double value - * @param b the second double value - * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. 
- */ - public static byte compareDoubles(final double a, final double b) { - return compareDoubles(a, b, 1e-6); - } - - /** - * Compares double values for equality (within epsilon), or inequality. - * - * @param a the first double value - * @param b the second double value - * @param epsilon the precision within which two double values will be considered equal - * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. - */ - public static byte compareDoubles(final double a, final double b, final double epsilon) { - if (Math.abs(a - b) < epsilon) { - return 0; - } - if (a > b) { - return -1; - } - return 1; - } - - /** - * Calculate f(x) = Normal(x | mu = mean, sigma = sd) - * @param mean the desired mean of the Normal distribution - * @param sd the desired standard deviation of the Normal distribution - * @param x the value to evaluate - * @return a well-formed double - */ - public static double normalDistribution(final double mean, final double sd, final double x) { - if( sd < 0 ) - throw new IllegalArgumentException("sd: Standard deviation of normal must be >0"); - if ( ! wellFormedDouble(mean) || ! wellFormedDouble(sd) || ! wellFormedDouble(x) ) - throw new IllegalArgumentException("mean, sd, or, x : Normal parameters must be well formatted (non-INF, non-NAN)"); - double a = 1.0 / (sd * Math.sqrt(2.0 * Math.PI)); - double b = Math.exp(-1.0 * (Math.pow(x - mean, 2.0) / (2.0 * sd * sd))); - return a * b; - } - - /** - * Calculate f(x) = log10 ( Normal(x | mu = mean, sigma = sd) ) - * @param mean the desired mean of the Normal distribution - * @param sd the desired standard deviation of the Normal distribution - * @param x the value to evaluate - * @return a well-formed double - */ - - public static double normalDistributionLog10(final double mean, final double sd, final double x) { - if( sd < 0 ) - throw new IllegalArgumentException("sd: Standard deviation of normal must be >0"); - if ( ! wellFormedDouble(mean) || ! 
wellFormedDouble(sd) || ! wellFormedDouble(x) ) - throw new IllegalArgumentException("mean, sd, or, x : Normal parameters must be well formatted (non-INF, non-NAN)"); - final double a = -1.0 * Math.log10(sd * SQUARE_ROOT_OF_TWO_TIMES_PI); - final double b = -1.0 * (square(x - mean) / (2.0 * square(sd))) / NATURAL_LOG_OF_TEN; - return a + b; - } - - /** - * Calculate f(x) = x^2 - * @param x the value to square - * @return x * x - */ - public static double square(final double x) { - return x * x; - } - - /** - * Calculates the log10 of the binomial coefficient. Designed to prevent - * overflows even with very large numbers. - * - * @param n total number of trials - * @param k number of successes - * @return the log10 of the binomial coefficient - */ - public static double binomialCoefficient(final int n, final int k) { - return Math.pow(10, log10BinomialCoefficient(n, k)); - } - - /** - * @see #binomialCoefficient(int, int) with log10 applied to result - */ - public static double log10BinomialCoefficient(final int n, final int k) { - if ( n < 0 ) { - throw new IllegalArgumentException("n: Must have non-negative number of trials"); - } - if ( k > n || k < 0 ) { - throw new IllegalArgumentException("k: Must have non-negative number of successes, and no more successes than number of trials"); - } - - return log10Factorial(n) - log10Factorial(k) - log10Factorial(n - k); - } - - /** - * Computes a binomial probability. This is computed using the formula - *

- * B(k; n; p) = [ n! / ( k! (n - k)! ) ] (p^k)( (1-p)^k ) - *

- * where n is the number of trials, k is the number of successes, and p is the probability of success - * - * @param n number of Bernoulli trials - * @param k number of successes - * @param p probability of success - * @return the binomial probability of the specified configuration. Computes values down to about 1e-237. - */ - public static double binomialProbability(final int n, final int k, final double p) { - return Math.pow(10, log10BinomialProbability(n, k, Math.log10(p))); - } - - /** - * @see #binomialProbability(int, int, double) with log10 applied to result - */ - public static double log10BinomialProbability(final int n, final int k, final double log10p) { - if ( log10p > 1e-18 ) - throw new IllegalArgumentException("log10p: Log-probability must be 0 or less"); - double log10OneMinusP = Math.log10(1 - Math.pow(10, log10p)); - return log10BinomialCoefficient(n, k) + log10p * k + log10OneMinusP * (n - k); - } - - /** - * @see #binomialProbability(int, int, double) with p=0.5 - */ - public static double binomialProbability(final int n, final int k) { - return Math.pow(10, log10BinomialProbability(n, k)); - } - - /** - * @see #binomialProbability(int, int, double) with p=0.5 and log10 applied to result - */ - public static double log10BinomialProbability(final int n, final int k) { - return log10BinomialCoefficient(n, k) + (n * FAIR_BINOMIAL_PROB_LOG10_0_5); - } - - /** A memoization container for {@link #binomialCumulativeProbability(int, int, int)}. Synchronized to accomodate multithreading. */ - private static final Map BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE = - Collections.synchronizedMap(new LRUCache(10_000)); - - /** - * Primitive integer-triplet bijection into long. Returns null when the bijection function fails (in lieu of an exception), which will - * happen when: any value is negative or larger than a short. This method is optimized for speed; it is not intended to serve as a - * utility function. 
- */ - static Long fastGenerateUniqueHashFromThreeIntegers(final int one, final int two, final int three) { - if (one < 0 || two < 0 || three < 0 || Short.MAX_VALUE < one || Short.MAX_VALUE < two || Short.MAX_VALUE < three) { - return null; - } else { - long result = 0; - result += (short) one; - result <<= 16; - result += (short) two; - result <<= 16; - result += (short) three; - return result; - } - } - - /** - * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space. - * Assumes that the probability of a successful hit is fair (i.e. 0.5). - * - * This pure function is memoized because of its expensive BigDecimal calculations. - * - * @param n number of attempts for the number of hits - * @param k_start start (inclusive) of the cumulant sum (over hits) - * @param k_end end (inclusive) of the cumulant sum (over hits) - * @return - returns the cumulative probability - */ - public static double binomialCumulativeProbability(final int n, final int k_start, final int k_end) { - if ( k_end > n ) - throw new IllegalArgumentException(String.format("Value for k_end (%d) is greater than n (%d)", k_end, n)); - - // Fetch cached value, if applicable. 
- final Long memoizationKey = fastGenerateUniqueHashFromThreeIntegers(n, k_start, k_end); - final Double memoizationCacheResult; - if (memoizationKey != null) { - memoizationCacheResult = BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE.get(memoizationKey); - } else { - memoizationCacheResult = null; - } - - final double result; - if (memoizationCacheResult != null) { - result = memoizationCacheResult; - } else { - double cumProb = 0.0; - double prevProb; - BigDecimal probCache = BigDecimal.ZERO; - - for (int hits = k_start; hits <= k_end; hits++) { - prevProb = cumProb; - final double probability = binomialProbability(n, hits); - cumProb += probability; - if (probability > 0 && cumProb - prevProb < probability / 2) { // loss of precision - probCache = probCache.add(new BigDecimal(prevProb)); - cumProb = 0.0; - hits--; // repeat loop - // prevProb changes at start of loop - } - } - - result = probCache.add(new BigDecimal(cumProb)).doubleValue(); - if (memoizationKey != null) { - BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE.put(memoizationKey, result); - } - } - return result; - } - - /** - * Calculates the log10 of the multinomial coefficient. Designed to prevent - * overflows even with very large numbers. 
- * - * @param n total number of trials - * @param k array of any size with the number of successes for each grouping (k1, k2, k3, ..., km) - * @return - */ - public static double log10MultinomialCoefficient(final int n, final int[] k) { - if ( n < 0 ) - throw new IllegalArgumentException("n: Must have non-negative number of trials"); - double denominator = 0.0; - int sum = 0; - for (int x : k) { - if ( x < 0 ) - throw new IllegalArgumentException("x element of k: Must have non-negative observations of group"); - if ( x > n ) - throw new IllegalArgumentException("x element of k, n: Group observations must be bounded by k"); - denominator += log10Factorial(x); - sum += x; - } - if ( sum != n ) - throw new IllegalArgumentException("k and n: Sum of observations in multinomial must sum to total number of trials"); - return log10Factorial(n) - denominator; - } - - /** - * Computes the log10 of the multinomial distribution probability given a vector - * of log10 probabilities. Designed to prevent overflows even with very large numbers. - * - * @param n number of trials - * @param k array of number of successes for each possibility - * @param log10p array of log10 probabilities - * @return - */ - public static double log10MultinomialProbability(final int n, final int[] k, final double[] log10p) { - if (log10p.length != k.length) - throw new IllegalArgumentException("p and k: Array of log10 probabilities must have the same size as the array of number of sucesses: " + log10p.length + ", " + k.length); - double log10Prod = 0.0; - for (int i = 0; i < log10p.length; i++) { - if ( log10p[i] > 1e-18 ) - throw new IllegalArgumentException("log10p: Log-probability must be <= 0"); - log10Prod += log10p[i] * k[i]; - } - return log10MultinomialCoefficient(n, k) + log10Prod; - } - - /** - * Computes a multinomial coefficient efficiently avoiding overflow even for large numbers. - * This is computed using the formula: - *

- * M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ] - *

- * where xi represents the number of times outcome i was observed, n is the number of total observations. - * In this implementation, the value of n is inferred as the sum over i of xi. - * - * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed - * @return the multinomial of the specified configuration. - */ - public static double multinomialCoefficient(final int[] k) { - int n = 0; - for (int xi : k) { - n += xi; - } - - return Math.pow(10, log10MultinomialCoefficient(n, k)); - } - - /** - * Computes a multinomial probability efficiently avoiding overflow even for large numbers. - * This is computed using the formula: - *

- * M(x1,x2,...,xk; n; p1,p2,...,pk) = [ n! / (x1! x2! ... xk!) ] (p1^x1)(p2^x2)(...)(pk^xk) - *

- * where xi represents the number of times outcome i was observed, n is the number of total observations, and - * pi represents the probability of the i-th outcome to occur. In this implementation, the value of n is - * inferred as the sum over i of xi. - * - * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed - * @param p a double[] of probabilities, where each element represents the probability a given outcome can occur - * @return the multinomial probability of the specified configuration. - */ - public static double multinomialProbability(final int[] k, final double[] p) { - if (p.length != k.length) - throw new IllegalArgumentException("p and k: Array of log10 probabilities must have the same size as the array of number of sucesses: " + p.length + ", " + k.length); - - int n = 0; - double[] log10P = new double[p.length]; - for (int i = 0; i < p.length; i++) { - log10P[i] = Math.log10(p[i]); - n += k[i]; - } - return Math.pow(10, log10MultinomialProbability(n, k, log10P)); - } - - /** - * calculate the Root Mean Square of an array of integers - * - * @param x an byte[] of numbers - * @return the RMS of the specified numbers. - */ - public static double rms(final byte[] x) { - if (x.length == 0) - return 0.0; - - double rms = 0.0; - for (int i : x) - rms += i * i; - rms /= x.length; - return Math.sqrt(rms); - } - - /** - * calculate the Root Mean Square of an array of integers - * - * @param x an int[] of numbers - * @return the RMS of the specified numbers. - */ - public static double rms(final int[] x) { - if (x.length == 0) - return 0.0; - - double rms = 0.0; - for (int i : x) - rms += i * i; - rms /= x.length; - return Math.sqrt(rms); - } - - /** - * calculate the Root Mean Square of an array of doubles - * - * @param x a double[] of numbers - * @return the RMS of the specified numbers. 
- */ - public static double rms(final Double[] x) { - if (x.length == 0) - return 0.0; - - double rms = 0.0; - for (Double i : x) - rms += i * i; - rms /= x.length; - return Math.sqrt(rms); - } - - public static double rms(final Collection l) { - if (l.size() == 0) - return 0.0; - - double rms = 0.0; - for (int i : l) - rms += i * i; - rms /= l.size(); - return Math.sqrt(rms); - } - - public static double distanceSquared(final double[] x, final double[] y) { - double dist = 0.0; - for (int iii = 0; iii < x.length; iii++) { - dist += (x[iii] - y[iii]) * (x[iii] - y[iii]); - } - return dist; - } - - public static double round(final double num, final int digits) { - double result = num * Math.pow(10.0, (double) digits); - result = Math.round(result); - result = result / Math.pow(10.0, (double) digits); - return result; - } - - /** - * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). - * - * @param array the array to be normalized - * @param takeLog10OfOutput if true, the output will be transformed back into log10 units - * @return a newly allocated array corresponding the normalized values in array, maybe log10 transformed - */ - public static double[] normalizeFromLog10(final double[] array, final boolean takeLog10OfOutput) { - return normalizeFromLog10(array, takeLog10OfOutput, false); - } - - /** - * See #normalizeFromLog10 but with the additional option to use an approximation that keeps the calculation always in log-space - * - * @param array - * @param takeLog10OfOutput - * @param keepInLogSpace - * - * @return - */ - public static double[] normalizeFromLog10(final double[] array, final boolean takeLog10OfOutput, final boolean keepInLogSpace) { - // for precision purposes, we need to add (or really subtract, since they're - // all negative) the largest value; also, we need to convert to normal-space. 
- double maxValue = arrayMax(array); - - // we may decide to just normalize in log space without converting to linear space - if (keepInLogSpace) { - for (int i = 0; i < array.length; i++) { - array[i] -= maxValue; - } - return array; - } - - // default case: go to linear space - double[] normalized = new double[array.length]; - - for (int i = 0; i < array.length; i++) - normalized[i] = Math.pow(10, array[i] - maxValue); - - // normalize - double sum = 0.0; - for (int i = 0; i < array.length; i++) - sum += normalized[i]; - for (int i = 0; i < array.length; i++) { - double x = normalized[i] / sum; - if (takeLog10OfOutput) { - x = Math.log10(x); - if ( x < LOG10_P_OF_ZERO || Double.isInfinite(x) ) - x = array[i] - maxValue; - } - - normalized[i] = x; - } - - return normalized; - } - - /** - * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). - * - * @param array the array to be normalized - * @return a newly allocated array corresponding the normalized values in array - */ - public static double[] normalizeFromLog10(final double[] array) { - return normalizeFromLog10(array, false); - } - - /** - * normalizes the real-space probability array. - * - * Does not assume anything about the values in the array, beyond that no elements are below 0. It's ok - * to have values in the array of > 1, or have the sum go above 0. 
- * - * @param array the array to be normalized - * @return a newly allocated array corresponding the normalized values in array - */ - @Requires("array != null") - @Ensures({"result != null"}) - public static double[] normalizeFromRealSpace(final double[] array) { - if ( array.length == 0 ) - return array; - - final double sum = sum(array); - final double[] normalized = new double[array.length]; - if ( sum < 0.0 ) throw new IllegalArgumentException("Values in probability array sum to a negative number " + sum); - for ( int i = 0; i < array.length; i++ ) { - normalized[i] = array[i] / sum; - } - return normalized; - } - - public static int maxElementIndex(final double[] array) { - return maxElementIndex(array, array.length); - } - - public static int maxElementIndex(final double[] array, final int endIndex) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int maxI = 0; - for (int i = 1; i < endIndex; i++) { - if (array[i] > array[maxI]) - maxI = i; - } - - return maxI; - } - - public static int maxElementIndex(final int[] array) { - return maxElementIndex(array, array.length); - } - - public static int maxElementIndex(final byte[] array) { - return maxElementIndex(array, array.length); - } - - public static int maxElementIndex(final int[] array, final int endIndex) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int maxI = 0; - for (int i = 1; i < endIndex; i++) { - if (array[i] > array[maxI]) - maxI = i; - } - - return maxI; - } - - public static int maxElementIndex(final byte[] array, final int endIndex) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int maxI = 0; - for (int i = 1; i < endIndex; i++) { - if (array[i] > array[maxI]) - maxI = i; - } - - return maxI; - } - - public static int arrayMax(final int[] array) { - return array[maxElementIndex(array)]; - } - - - public 
static double arrayMax(final double[] array) { - return array[maxElementIndex(array)]; - } - - public static double arrayMax(final double[] array, final int endIndex) { - return array[maxElementIndex(array, endIndex)]; - } - - public static double arrayMin(final double[] array) { - return array[minElementIndex(array)]; - } - - public static int arrayMin(final int[] array) { - return array[minElementIndex(array)]; - } - - public static byte arrayMin(final byte[] array) { - return array[minElementIndex(array)]; - } - - /** - * Compute the min element of a List - * @param array a non-empty list of integer - * @return the min - */ - public static int arrayMin(final List array) { - if ( array == null || array.isEmpty() ) throw new IllegalArgumentException("Array must be non-null and non-empty"); - int min = array.get(0); - for ( final int i : array ) - if ( i < min ) min = i; - return min; - } - - /** - * Compute the median element of the list of integers - * @param array a list of integers - * @return the median element - */ - public static > T median(final List array) { - /* TODO -- from Valentin - the current implementation is not the usual median when the input is of even length. More concretely it returns the ith element of the list where i = floor(input.size() / 2). - - But actually that is not the "usual" definition of a median, as it is supposed to return the average of the two middle values when the sample length is an even number (i.e. median(1,2,3,4,5,6) == 3.5). [Sources: R and wikipedia] - - My suggestion for a solution is then: - - unify median and medianDoubles to public static T median(Collection) - check on null elements and throw an exception if there are any or perhaps return a null; documented in the javadoc. - relocate, rename and refactor MathUtils.median(X) to Utils.ithElement(X,X.size()/2) - In addition, the current median implementation sorts the whole input list witch is O(n log n). 
However find out the ith element (thus calculate the median) can be done in O(n) - */ - if ( array == null ) throw new IllegalArgumentException("Array must be non-null"); - final int size = array.size(); - if ( size == 0 ) throw new IllegalArgumentException("Array cannot have size 0"); - else if ( size == 1 ) return array.get(0); - else { - final ArrayList sorted = new ArrayList<>(array); - Collections.sort(sorted); - return sorted.get(size / 2); - } - } - - public static int minElementIndex(final double[] array) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int minI = 0; - for (int i = 1; i < array.length; i++) { - if (array[i] < array[minI]) - minI = i; - } - - return minI; - } - - public static int minElementIndex(final byte[] array) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int minI = 0; - for (int i = 1; i < array.length; i++) { - if (array[i] < array[minI]) - minI = i; - } - - return minI; - } - - public static int minElementIndex(final int[] array) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int minI = 0; - for (int i = 1; i < array.length; i++) { - if (array[i] < array[minI]) - minI = i; - } - - return minI; - } - - public static int arrayMaxInt(final List array) { - if (array == null) - throw new IllegalArgumentException("Array cannot be null!"); - if (array.size() == 0) - throw new IllegalArgumentException("Array size cannot be 0!"); - - int m = array.get(0); - for (int e : array) - m = Math.max(m, e); - return m; - } - - public static int sum(final List list ) { - int sum = 0; - for ( Integer i : list ) { - sum += i; - } - return sum; - } - - public static double average(final List vals, final int maxI) { - long sum = 0L; - - int i = 0; - for (long x : vals) { - if (i > maxI) - break; - sum += x; - i++; - } - - return (1.0 * sum) / i; - } - - public static 
double average(final List vals) { - return average(vals, vals.size()); - } - - public static int countOccurrences(final char c, final String s) { - int count = 0; - for (int i = 0; i < s.length(); i++) { - count += s.charAt(i) == c ? 1 : 0; - } - return count; - } - - public static int countOccurrences(T x, List l) { - int count = 0; - for (T y : l) { - if (x.equals(y)) - count++; - } - - return count; - } - - public static int countOccurrences(byte element, byte[] array) { - int count = 0; - for (byte y : array) { - if (element == y) - count++; - } - - return count; - } - - public static int countOccurrences(final boolean element, final boolean[] array) { - int count = 0; - for (final boolean b : array) { - if (element == b) - count++; - } - - return count; - } - - - /** - * Returns n random indices drawn with replacement from the range 0..(k-1) - * - * @param n the total number of indices sampled from - * @param k the number of random indices to draw (with replacement) - * @return a list of k random indices ranging from 0 to (n-1) with possible duplicates - */ - static public ArrayList sampleIndicesWithReplacement(final int n, final int k) { - - ArrayList chosen_balls = new ArrayList(k); - for (int i = 0; i < k; i++) { - //Integer chosen_ball = balls[rand.nextInt(k)]; - chosen_balls.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(n)); - //balls.remove(chosen_ball); - } - - return chosen_balls; - } - - /** - * Returns n random indices drawn without replacement from the range 0..(k-1) - * - * @param n the total number of indices sampled from - * @param k the number of random indices to draw (without replacement) - * @return a list of k random indices ranging from 0 to (n-1) without duplicates - */ - static public ArrayList sampleIndicesWithoutReplacement(final int n, final int k) { - ArrayList chosen_balls = new ArrayList(k); - - for (int i = 0; i < n; i++) { - chosen_balls.add(i); - } - - Collections.shuffle(chosen_balls, 
GenomeAnalysisEngine.getRandomGenerator()); - - //return (ArrayList) chosen_balls.subList(0, k); - return new ArrayList(chosen_balls.subList(0, k)); - } - - /** - * Given a list of indices into a list, return those elements of the list with the possibility of drawing list elements multiple times - * - * @param indices the list of indices for elements to extract - * @param list the list from which the elements should be extracted - * @param the template type of the ArrayList - * @return a new ArrayList consisting of the elements at the specified indices - */ - static public ArrayList sliceListByIndices(final List indices, final List list) { - ArrayList subset = new ArrayList(); - - for (int i : indices) { - subset.add(list.get(i)); - } - - return subset; - } - - /** - * Given two log-probability vectors, compute log of vector product of them: - * in Matlab notation, return log10(10.*x'*10.^y) - * @param x vector 1 - * @param y vector 2 - * @return a double representing log (dotProd(10.^x,10.^y) - */ - public static double logDotProduct(final double [] x, final double[] y) { - if (x.length != y.length) - throw new ReviewedStingException("BUG: Vectors of different lengths"); - - double tmpVec[] = new double[x.length]; - - for (int k=0; k < tmpVec.length; k++ ) { - tmpVec[k] = x[k]+y[k]; - } - - return log10sumLog10(tmpVec); - - - - } - - /** - * Check that the log10 prob vector vector is well formed - * - * @param vector - * @param expectedSize - * @param shouldSumToOne - * - * @return true if vector is well-formed, false otherwise - */ - public static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { - if ( vector.length != expectedSize ) return false; - - for ( final double pr : vector ) { - if ( ! 
goodLog10Probability(pr) ) - return false; - } - - if ( shouldSumToOne && compareDoubles(sumLog10(vector), 1.0, 1e-4) != 0 ) - return false; - - return true; // everything is good - } - - /** - * Checks that the result is a well-formed log10 probability - * - * @param result a supposedly well-formed log10 probability value. By default allows - * -Infinity values, as log10(0.0) == -Infinity. - * @return true if result is really well formed - */ - public static boolean goodLog10Probability(final double result) { - return goodLog10Probability(result, true); - } - - /** - * Checks that the result is a well-formed log10 probability - * - * @param result a supposedly well-formed log10 probability value - * @param allowNegativeInfinity should we consider a -Infinity value ok? - * @return true if result is really well formed - */ - public static boolean goodLog10Probability(final double result, final boolean allowNegativeInfinity) { - return result <= 0.0 && result != Double.POSITIVE_INFINITY && (allowNegativeInfinity || result != Double.NEGATIVE_INFINITY) && ! Double.isNaN(result); - } - - /** - * Checks that the result is a well-formed probability - * - * @param result a supposedly well-formed probability value - * @return true if result is really well formed - */ - public static boolean goodProbability(final double result) { - return result >= 0.0 && result <= 1.0 && ! Double.isInfinite(result) && ! Double.isNaN(result); - } - - /** - * A utility class that computes on the fly average and standard deviation for a stream of numbers. - * The number of observations does not have to be known in advance, and can be also very big (so that - * it could overflow any naive summation-based scheme or cause loss of precision). - * Instead, adding a new number observed - * to a sample with add(observed) immediately updates the instance of this object so that - * it contains correct mean and standard deviation for all the numbers seen so far. Source: Knuth, vol.2 - * (see also e.g. 
http://www.johndcook.com/standard_deviation.html for online reference). - */ - public static class RunningAverage { - private double mean = 0.0; - private double s = 0.0; - private long obs_count = 0; - - public void add(double obs) { - obs_count++; - double oldMean = mean; - mean += (obs - mean) / obs_count; // update mean - s += (obs - oldMean) * (obs - mean); - } - - public void addAll(Collection col) { - for (Number o : col) { - add(o.doubleValue()); - } - } - - public double mean() { - return mean; - } - - public double stddev() { - return Math.sqrt(s / (obs_count - 1)); - } - - public double var() { - return s / (obs_count - 1); - } - - public long observationCount() { - return obs_count; - } - - public RunningAverage clone() { - RunningAverage ra = new RunningAverage(); - ra.mean = this.mean; - ra.s = this.s; - ra.obs_count = this.obs_count; - return ra; - } - - public void merge(RunningAverage other) { - if (this.obs_count > 0 || other.obs_count > 0) { // if we have any observations at all - this.mean = (this.mean * this.obs_count + other.mean * other.obs_count) / (this.obs_count + other.obs_count); - this.s += other.s; - } - this.obs_count += other.obs_count; - } - } - - // - // useful common utility routines - // - - static public double max(double x0, double x1, double x2) { - double a = Math.max(x0, x1); - return Math.max(a, x2); - } - - /** - * Converts LN to LOG10 - * - * @param ln log(x) - * @return log10(x) - */ - public static double lnToLog10(final double ln) { - return ln * Math.log10(Math.E); - } - - /** - * Constants to simplify the log gamma function calculation. 
- */ - private static final double zero = 0.0, one = 1.0, half = .5, a0 = 7.72156649015328655494e-02, a1 = 3.22467033424113591611e-01, a2 = 6.73523010531292681824e-02, a3 = 2.05808084325167332806e-02, a4 = 7.38555086081402883957e-03, a5 = 2.89051383673415629091e-03, a6 = 1.19270763183362067845e-03, a7 = 5.10069792153511336608e-04, a8 = 2.20862790713908385557e-04, a9 = 1.08011567247583939954e-04, a10 = 2.52144565451257326939e-05, a11 = 4.48640949618915160150e-05, tc = 1.46163214496836224576e+00, tf = -1.21486290535849611461e-01, tt = -3.63867699703950536541e-18, t0 = 4.83836122723810047042e-01, t1 = -1.47587722994593911752e-01, t2 = 6.46249402391333854778e-02, t3 = -3.27885410759859649565e-02, t4 = 1.79706750811820387126e-02, t5 = -1.03142241298341437450e-02, t6 = 6.10053870246291332635e-03, t7 = -3.68452016781138256760e-03, t8 = 2.25964780900612472250e-03, t9 = -1.40346469989232843813e-03, t10 = 8.81081882437654011382e-04, t11 = -5.38595305356740546715e-04, t12 = 3.15632070903625950361e-04, t13 = -3.12754168375120860518e-04, t14 = 3.35529192635519073543e-04, u0 = -7.72156649015328655494e-02, u1 = 6.32827064025093366517e-01, u2 = 1.45492250137234768737e+00, u3 = 9.77717527963372745603e-01, u4 = 2.28963728064692451092e-01, u5 = 1.33810918536787660377e-02, v1 = 2.45597793713041134822e+00, v2 = 2.12848976379893395361e+00, v3 = 7.69285150456672783825e-01, v4 = 1.04222645593369134254e-01, v5 = 3.21709242282423911810e-03, s0 = -7.72156649015328655494e-02, s1 = 2.14982415960608852501e-01, s2 = 3.25778796408930981787e-01, s3 = 1.46350472652464452805e-01, s4 = 2.66422703033638609560e-02, s5 = 1.84028451407337715652e-03, s6 = 3.19475326584100867617e-05, r1 = 1.39200533467621045958e+00, r2 = 7.21935547567138069525e-01, r3 = 1.71933865632803078993e-01, r4 = 1.86459191715652901344e-02, r5 = 7.77942496381893596434e-04, r6 = 7.32668430744625636189e-06, w0 = 4.18938533204672725052e-01, w1 = 8.33333333333329678849e-02, w2 = -2.77777777728775536470e-03, w3 = 
7.93650558643019558500e-04, w4 = -5.95187557450339963135e-04, w5 = 8.36339918996282139126e-04, w6 = -1.63092934096575273989e-03; - - /** - * Efficient rounding functions to simplify the log gamma function calculation - * double to long with 32 bit shift - */ - private static final int HI(final double x) { - return (int) (Double.doubleToLongBits(x) >> 32); - } - - /** - * Efficient rounding functions to simplify the log gamma function calculation - * double to long without shift - */ - private static final int LO(final double x) { - return (int) Double.doubleToLongBits(x); - } - - /** - * Most efficent implementation of the lnGamma (FDLIBM) - * Use via the log10Gamma wrapper method. - */ - private static double lnGamma(final double x) { - double t, y, z, p, p1, p2, p3, q, r, w; - int i; - - int hx = HI(x); - int lx = LO(x); - - /* purge off +-inf, NaN, +-0, and negative arguments */ - int ix = hx & 0x7fffffff; - if (ix >= 0x7ff00000) - return Double.POSITIVE_INFINITY; - if ((ix | lx) == 0 || hx < 0) - return Double.NaN; - if (ix < 0x3b900000) { /* |x|<2**-70, return -log(|x|) */ - return -Math.log(x); - } - - /* purge off 1 and 2 */ - if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0)) - r = 0; - /* for x < 2.0 */ - else if (ix < 0x40000000) { - if (ix <= 0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ - r = -Math.log(x); - if (ix >= 0x3FE76944) { - y = one - x; - i = 0; - } - else if (ix >= 0x3FCDA661) { - y = x - (tc - one); - i = 1; - } - else { - y = x; - i = 2; - } - } - else { - r = zero; - if (ix >= 0x3FFBB4C3) { - y = 2.0 - x; - i = 0; - } /* [1.7316,2] */ - else if (ix >= 0x3FF3B4C4) { - y = x - tc; - i = 1; - } /* [1.23,1.73] */ - else { - y = x - one; - i = 2; - } - } - - switch (i) { - case 0: - z = y * y; - p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); - p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); - p = y * p1 + p2; - r += (p - 0.5 * y); - break; - case 1: - z = y * y; - w = z * y; - p1 = 
t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); /* parallel comp */ - p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); - p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); - p = z * p1 - (tt - w * (p2 + y * p3)); - r += (tf + p); - break; - case 2: - p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); - p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); - r += (-0.5 * y + p1 / p2); - } - } - else if (ix < 0x40200000) { /* x < 8.0 */ - i = (int) x; - t = zero; - y = x - (double) i; - p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); - q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); - r = half * y + p / q; - z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ - switch (i) { - case 7: - z *= (y + 6.0); /* FALLTHRU */ - case 6: - z *= (y + 5.0); /* FALLTHRU */ - case 5: - z *= (y + 4.0); /* FALLTHRU */ - case 4: - z *= (y + 3.0); /* FALLTHRU */ - case 3: - z *= (y + 2.0); /* FALLTHRU */ - r += Math.log(z); - break; - } - /* 8.0 <= x < 2**58 */ - } - else if (ix < 0x43900000) { - t = Math.log(x); - z = one / x; - y = z * z; - w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); - r = (x - half) * (t - one) + w; - } - else - /* 2**58 <= x <= inf */ - r = x * (Math.log(x) - one); - return r; - } - - /** - * Calculates the log10 of the gamma function for x using the efficient FDLIBM - * implementation to avoid overflows and guarantees high accuracy even for large - * numbers. - * - * @param x the x parameter - * @return the log10 of the gamma function at x. 
- */ - public static double log10Gamma(final double x) { - return lnToLog10(lnGamma(x)); - } - - public static double factorial(final int x) { - // avoid rounding errors caused by fact that 10^log(x) might be slightly lower than x and flooring may produce 1 less than real value - return (double)Math.round(Math.pow(10, log10Factorial(x))); - } - - public static double log10Factorial(final int x) { - if (x >= log10FactorialCache.length || x < 0) - return log10Gamma(x + 1); - else - return log10FactorialCache[x]; - } - - /** - * Adds two arrays together and returns a new array with the sum. - * - * @param a one array - * @param b another array - * @return a new array with the sum of a and b - */ - @Requires("a.length == b.length") - @Ensures("result.length == a.length") - public static int[] addArrays(final int[] a, final int[] b) { - int[] c = new int[a.length]; - for (int i = 0; i < a.length; i++) - c[i] = a[i] + b[i]; - return c; - } - - /** Same routine, unboxed types for efficiency - * - * @param x First vector - * @param y Second vector - * @return Vector of same length as x and y so that z[k] = x[k]+y[k] - */ - public static double[] vectorSum(final double[]x, final double[] y) { - if (x.length != y.length) - throw new ReviewedStingException("BUG: Lengths of x and y must be the same"); - - double[] result = new double[x.length]; - for (int k=0; k log10LinearRange(final int start, final int stop, final double eps) { - final LinkedList values = new LinkedList<>(); - final double log10range = Math.log10(stop - start); - - if ( start == 0 ) - values.add(0); - - double i = 0.0; - while ( i <= log10range ) { - final int index = (int)Math.round(Math.pow(10, i)) + start; - if ( index < stop && (values.peekLast() == null || values.peekLast() != index ) ) - values.add(index); - i += eps; - } - - if ( values.peekLast() == null || values.peekLast() != stop ) - values.add(stop); - - return values; - } - - /** - * Compute in a numerical correct way the quantity log10(1-x) - 
* - * Uses the approximation log10(1-x) = log10(1/x - 1) + log10(x) to avoid very quick underflow - * in 1-x when x is very small - * - * @param x a positive double value between 0.0 and 1.0 - * @return an estimate of log10(1-x) - */ - @Requires("x >= 0.0 && x <= 1.0") - @Ensures("result <= 0.0") - public static double log10OneMinusX(final double x) { - if ( x == 1.0 ) - return Double.NEGATIVE_INFINITY; - else if ( x == 0.0 ) - return 0.0; - else { - final double d = Math.log10(1 / x - 1) + Math.log10(x); - return Double.isInfinite(d) || d > 0.0 ? 0.0 : d; - } - } - - /** - * Draw N random elements from list - * @param list - the list from which to draw randomly - * @param N - the number of elements to draw - */ - public static List randomSubset(final List list, final int N) { - if (list.size() <= N) { - return list; - } - - return sliceListByIndices(sampleIndicesWithoutReplacement(list.size(),N),list); - } - - /** - * Return the likelihood of observing the counts of categories having sampled a population - * whose categorial frequencies are distributed according to a Dirichlet distribution - * @param dirichletParams - params of the prior dirichlet distribution - * @param dirichletSum - the sum of those parameters - * @param counts - the counts of observation in each category - * @param countSum - the sum of counts (number of trials) - * @return - associated likelihood - */ - public static double dirichletMultinomial(final double[] dirichletParams, final double dirichletSum, - final int[] counts, final int countSum) { - if ( dirichletParams.length != counts.length ) { - throw new IllegalStateException("The number of dirichlet parameters must match the number of categories"); - } - // todo -- lots of lnGammas here. 
At some point we can safely switch to x * ( ln(x) - 1) - double likelihood = log10MultinomialCoefficient(countSum,counts); - likelihood += log10Gamma(dirichletSum); - likelihood -= log10Gamma(dirichletSum+countSum); - for ( int idx = 0; idx < counts.length; idx++ ) { - likelihood += log10Gamma(counts[idx] + dirichletParams[idx]); - likelihood -= log10Gamma(dirichletParams[idx]); - } - - return likelihood; - } - - public static double dirichletMultinomial(double[] params, int[] counts) { - return dirichletMultinomial(params,sum(params),counts,(int) sum(counts)); - } - - public static ExponentialDistribution exponentialDistribution( final double mean ) { - return new ExponentialDistributionImpl(mean); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java deleted file mode 100644 index c0d1df09d..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ /dev/null @@ -1,389 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils; - -import com.google.java.contract.Ensures; -import net.sf.samtools.SAMUtils; - -/** - * QualityUtils is a static class (no instantiation allowed!) with some utility methods for manipulating - * quality scores. - * - * @author Kiran Garimella, Mark DePristo - * @since Way back - */ -public class QualityUtils { - /** - * Maximum quality score that can be encoded in a SAM/BAM file - */ - public final static byte MAX_SAM_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; - - - private final static double RAW_MIN_PHRED_SCALED_QUAL = Math.log10(Double.MIN_VALUE); - protected final static double MIN_PHRED_SCALED_QUAL = -10.0 * RAW_MIN_PHRED_SCALED_QUAL; - - /** - * bams containing quals above this value are extremely suspicious and we should warn the user - */ - public final static byte MAX_REASONABLE_Q_SCORE = 60; - - /** - * The lowest quality score for a base that is considered reasonable for statistical analysis. This is - * because Q 6 => you stand a 25% of being right, which means all bases are equally likely - */ - public final static byte MIN_USABLE_Q_SCORE = 6; - public final static int MAPPING_QUALITY_UNAVAILABLE = 255; - - /** - * Cached values for qual as byte calculations so they are very fast - */ - private static double qualToErrorProbCache[] = new double[256]; - private static double qualToProbLog10Cache[] = new double[256]; - - - static { - for (int i = 0; i < 256; i++) { - qualToErrorProbCache[i] = qualToErrorProb((double) i); - qualToProbLog10Cache[i] = Math.log10(1.0 - qualToErrorProbCache[i]); - } - } - - /** - * Private constructor. No instantiating this class! 
- */ - private QualityUtils() {} - - // ---------------------------------------------------------------------- - // - // These are all functions to convert a phred-scaled quality score to a probability - // - // ---------------------------------------------------------------------- - - /** - * Convert a phred-scaled quality score to its probability of being true (Q30 => 0.999) - * - * This is the Phred-style conversion, *not* the Illumina-style conversion. - * - * Because the input is a discretized byte value, this function uses a cache so is very efficient - * - * WARNING -- because this function takes a byte for maxQual, you must be careful in converting - * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) - * - * @param qual a quality score (0-255) - * @return a probability (0.0-1.0) - */ - @Ensures("result >= 0.0 && result <= 1.0") - public static double qualToProb(final byte qual) { - return 1.0 - qualToErrorProb(qual); - } - - /** - * Convert a phred-scaled quality score to its probability of being true (Q30 => 0.999) - * - * This is the Phred-style conversion, *not* the Illumina-style conversion. - * - * Because the input is a double value, this function must call Math.pow so can be quite expensive - * - * @param qual a phred-scaled quality score encoded as a double. Can be non-integer values (30.5) - * @return a probability (0.0-1.0) - */ - @Ensures("result >= 0.0 && result <= 1.0") - public static double qualToProb(final double qual) { - if ( qual < 0.0 ) throw new IllegalArgumentException("qual must be >= 0.0 but got " + qual); - return 1.0 - qualToErrorProb(qual); - } - - /** - * Convert a phred-scaled quality score to its log10 probability of being true (Q30 => log10(0.999)) - * - * This is the Phred-style conversion, *not* the Illumina-style conversion. 
- * - * Because the input is a double value, this function must call Math.pow so can be quite expensive - * - * WARNING -- because this function takes a byte for maxQual, you must be careful in converting - * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) - * - * @param qual a phred-scaled quality score encoded as a double. Can be non-integer values (30.5) - * @return a probability (0.0-1.0) - */ - @Ensures("result <= 0.0") - public static double qualToProbLog10(final byte qual) { - return qualToProbLog10Cache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. - } - - /** - * Convert a phred-scaled quality score to its probability of being wrong (Q30 => 0.001) - * - * This is the Phred-style conversion, *not* the Illumina-style conversion. - * - * Because the input is a double value, this function must call Math.pow so can be quite expensive - * - * @param qual a phred-scaled quality score encoded as a double. Can be non-integer values (30.5) - * @return a probability (0.0-1.0) - */ - @Ensures("result >= 0.0 && result <= 1.0") - public static double qualToErrorProb(final double qual) { - if ( qual < 0.0 ) throw new IllegalArgumentException("qual must be >= 0.0 but got " + qual); - return Math.pow(10.0, qual / -10.0); - } - - /** - * Convert a phred-scaled quality score to its probability of being wrong (Q30 => 0.001) - * - * This is the Phred-style conversion, *not* the Illumina-style conversion. - * - * Because the input is a byte value, this function uses a cache so is very efficient - * - * WARNING -- because this function takes a byte for maxQual, you must be careful in converting - * integers to byte. 
The appropriate way to do this is ((byte)(myInt & 0xFF)) - * - * @param qual a phred-scaled quality score encoded as a byte - * @return a probability (0.0-1.0) - */ - @Ensures("result >= 0.0 && result <= 1.0") - public static double qualToErrorProb(final byte qual) { - return qualToErrorProbCache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. - } - - - /** - * Convert a phred-scaled quality score to its log10 probability of being wrong (Q30 => log10(0.001)) - * - * This is the Phred-style conversion, *not* the Illumina-style conversion. - * - * The calculation is extremely efficient - * - * WARNING -- because this function takes a byte for maxQual, you must be careful in converting - * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) - * - * @param qual a phred-scaled quality score encoded as a byte - * @return a probability (0.0-1.0) - */ - @Ensures("result <= 0.0") - public static double qualToErrorProbLog10(final byte qual) { - return qualToErrorProbLog10((double)(qual & 0xFF)); - } - - /** - * Convert a phred-scaled quality score to its log10 probability of being wrong (Q30 => log10(0.001)) - * - * This is the Phred-style conversion, *not* the Illumina-style conversion. - * - * The calculation is extremely efficient - * - * @param qual a phred-scaled quality score encoded as a double - * @return a probability (0.0-1.0) - */ - @Ensures("result <= 0.0") - public static double qualToErrorProbLog10(final double qual) { - if ( qual < 0.0 ) throw new IllegalArgumentException("qual must be >= 0.0 but got " + qual); - return qual / -10.0; - } - - // ---------------------------------------------------------------------- - // - // Functions to convert a probability to a phred-scaled quality score - // - // ---------------------------------------------------------------------- - - /** - * Convert a probability of being wrong to a phred-scaled quality score (0.01 => 20). 
- * - * Note, this function caps the resulting quality score by the public static value MAX_SAM_QUAL_SCORE - * and by 1 at the low-end. - * - * @param errorRate a probability (0.0-1.0) of being wrong (i.e., 0.01 is 1% change of being wrong) - * @return a quality score (0-MAX_SAM_QUAL_SCORE) - */ - public static byte errorProbToQual(final double errorRate) { - return errorProbToQual(errorRate, MAX_SAM_QUAL_SCORE); - } - - /** - * Convert a probability of being wrong to a phred-scaled quality score (0.01 => 20). - * - * Note, this function caps the resulting quality score by the public static value MIN_REASONABLE_ERROR - * and by 1 at the low-end. - * - * WARNING -- because this function takes a byte for maxQual, you must be careful in converting - * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) - * - * @param errorRate a probability (0.0-1.0) of being wrong (i.e., 0.01 is 1% change of being wrong) - * @return a quality score (0-maxQual) - */ - public static byte errorProbToQual(final double errorRate, final byte maxQual) { - if ( ! MathUtils.goodProbability(errorRate) ) throw new IllegalArgumentException("errorRate must be good probability but got " + errorRate); - final double d = Math.round(-10.0*Math.log10(errorRate)); - return boundQual((int)d, maxQual); - } - - /** - * @see #errorProbToQual(double, byte) with proper conversion of maxQual integer to a byte - */ - public static byte errorProbToQual(final double prob, final int maxQual) { - if ( maxQual < 0 || maxQual > 255 ) throw new IllegalArgumentException("maxQual must be between 0-255 but got " + maxQual); - return errorProbToQual(prob, (byte)(maxQual & 0xFF)); - } - - /** - * Convert a probability of being right to a phred-scaled quality score (0.99 => 20). - * - * Note, this function caps the resulting quality score by the public static value MAX_SAM_QUAL_SCORE - * and by 1 at the low-end. 
- * - * @param prob a probability (0.0-1.0) of being right - * @return a quality score (0-MAX_SAM_QUAL_SCORE) - */ - public static byte trueProbToQual(final double prob) { - return trueProbToQual(prob, MAX_SAM_QUAL_SCORE); - } - - /** - * Convert a probability of being right to a phred-scaled quality score (0.99 => 20). - * - * Note, this function caps the resulting quality score by the min probability allowed (EPS). - * So for example, if prob is 1e-6, which would imply a Q-score of 60, and EPS is 1e-4, - * the result of this function is actually Q40. - * - * Note that the resulting quality score, regardless of EPS, is capped by MAX_SAM_QUAL_SCORE and - * bounded on the low-side by 1. - * - * WARNING -- because this function takes a byte for maxQual, you must be careful in converting - * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) - * - * @param trueProb a probability (0.0-1.0) of being right - * @param maxQual the maximum quality score we are allowed to emit here, regardless of the error rate - * @return a phred-scaled quality score (0-maxQualScore) as a byte - */ - @Ensures("(result & 0xFF) >= 1 && (result & 0xFF) <= (maxQual & 0xFF)") - public static byte trueProbToQual(final double trueProb, final byte maxQual) { - if ( ! 
MathUtils.goodProbability(trueProb) ) throw new IllegalArgumentException("trueProb must be good probability but got " + trueProb); - final double lp = Math.round(-10.0*MathUtils.log10OneMinusX(trueProb)); - return boundQual((int)lp, maxQual); - } - - /** - * @see #trueProbToQual(double, byte) with proper conversion of maxQual to a byte - */ - public static byte trueProbToQual(final double prob, final int maxQual) { - if ( maxQual < 0 || maxQual > 255 ) throw new IllegalArgumentException("maxQual must be between 0-255 but got " + maxQual); - return trueProbToQual(prob, (byte)(maxQual & 0xFF)); - } - - /** - * Convert a probability of being right to a phred-scaled quality score of being wrong as a double - * - * This is a very generic method, that simply computes a phred-scaled double quality - * score given an error rate. It has the same precision as a normal double operation - * - * @param trueRate the probability of being right (0.0-1.0) - * @return a phred-scaled version of the error rate implied by trueRate - */ - @Ensures("result >= 0.0") - public static double phredScaleCorrectRate(final double trueRate) { - return phredScaleLog10ErrorRate(MathUtils.log10OneMinusX(trueRate)); - } - - /** - * Convert a log10 probability of being right to a phred-scaled quality score of being wrong as a double - * - * This is a very generic method, that simply computes a phred-scaled double quality - * score given an error rate. It has the same precision as a normal double operation - * - * @param trueRateLog10 the log10 probability of being right (0.0-1.0). 
Can be -Infinity to indicate - * that the result is impossible in which MIN_PHRED_SCALED_QUAL is returned - * @return a phred-scaled version of the error rate implied by trueRate - */ - @Ensures("result >= 0.0") - public static double phredScaleLog10CorrectRate(final double trueRateLog10) { - return phredScaleCorrectRate(Math.pow(10.0, trueRateLog10)); - } - - /** - * Convert a probability of being wrong to a phred-scaled quality score of being wrong as a double - * - * This is a very generic method, that simply computes a phred-scaled double quality - * score given an error rate. It has the same precision as a normal double operation - * - * @param errorRate the probability of being wrong (0.0-1.0) - * @return a phred-scaled version of the error rate - */ - @Ensures("result >= 0.0") - public static double phredScaleErrorRate(final double errorRate) { - return phredScaleLog10ErrorRate(Math.log10(errorRate)); - } - - /** - * Convert a log10 probability of being wrong to a phred-scaled quality score of being wrong as a double - * - * This is a very generic method, that simply computes a phred-scaled double quality - * score given an error rate. It has the same precision as a normal double operation - * - * @param errorRateLog10 the log10 probability of being wrong (0.0-1.0). Can be -Infinity, in which case - * the result is MIN_PHRED_SCALED_QUAL - * @return a phred-scaled version of the error rate - */ - @Ensures("result >= 0.0") - public static double phredScaleLog10ErrorRate(final double errorRateLog10) { - if ( ! 
MathUtils.goodLog10Probability(errorRateLog10) ) throw new IllegalArgumentException("errorRateLog10 must be good probability but got " + errorRateLog10); - // abs is necessary for edge base with errorRateLog10 = 0 producing -0.0 doubles - return Math.abs(-10.0 * Math.max(errorRateLog10, RAW_MIN_PHRED_SCALED_QUAL)); - } - - // ---------------------------------------------------------------------- - // - // Routines to bound a quality score to a reasonable range - // - // ---------------------------------------------------------------------- - - /** - * Return a quality score that bounds qual by MAX_SAM_QUAL_SCORE and 1 - * - * @param qual the uncapped quality score as an integer - * @return the bounded quality score - */ - @Ensures("(result & 0xFF) >= 1 && (result & 0xFF) <= (MAX_SAM_QUAL_SCORE & 0xFF)") - public static byte boundQual(int qual) { - return boundQual(qual, MAX_SAM_QUAL_SCORE); - } - - /** - * Return a quality score that bounds qual by maxQual and 1 - * - * WARNING -- because this function takes a byte for maxQual, you must be careful in converting - * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) - * - * @param qual the uncapped quality score as an integer. 
Can be < 0 (which may indicate an error in the - * client code), which will be brought back to 1, but this isn't an error, as some - * routines may use this functionality (BaseRecalibrator, for example) - * @param maxQual the maximum quality score, must be less < 255 - * @return the bounded quality score - */ - @Ensures("(result & 0xFF) >= 1 && (result & 0xFF) <= (maxQual & 0xFF)") - public static byte boundQual(final int qual, final byte maxQual) { - return (byte) (Math.max(Math.min(qual, maxQual & 0xFF), 1) & 0xFF); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java b/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java deleted file mode 100644 index 69a2f0c8e..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java +++ /dev/null @@ -1,182 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils; - - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - -import java.util.concurrent.TimeUnit; - -/** - * A useful simple system for timing code with nano second resolution - * - * Note that this code is not thread-safe. If you have a single timer - * being started and stopped by multiple threads you will need to protect the - * calls to avoid meaningless results of having multiple starts and stops - * called sequentially. - * - * User: depristo - * Date: Dec 10, 2010 - * Time: 9:07:44 AM - */ -public class SimpleTimer { - protected static final double NANO_TO_SECOND_DOUBLE = 1.0 / TimeUnit.SECONDS.toNanos(1); - private final String name; - - /** - * The elapsedTimeNano time in nanoSeconds of this timer. The elapsedTimeNano time is the - * sum of times between starts/restrats and stops. - */ - private long elapsedTimeNano = 0l; - - /** - * The start time of the last start/restart in nanoSeconds - */ - private long startTimeNano = 0l; - - /** - * Is this timer currently running (i.e., the last call was start/restart) - */ - private boolean running = false; - - /** - * Creates an anonymous simple timer - */ - public SimpleTimer() { - this("Anonymous"); - } - - /** - * Creates a simple timer named name - * @param name of the timer, must not be null - */ - public SimpleTimer(final String name) { - if ( name == null ) throw new IllegalArgumentException("SimpleTimer name cannot be null"); - this.name = name; - } - - /** - * @return the name associated with this timer - */ - public synchronized String getName() { - return name; - } - - /** - * Starts the timer running, and sets the elapsedTimeNano time to 0. This is equivalent to - * resetting the time to have no history at all. 
- * - * @return this object, for programming convenience - */ - @Ensures("elapsedTimeNano == 0l") - public synchronized SimpleTimer start() { - elapsedTimeNano = 0l; - return restart(); - } - - /** - * Starts the timer running, without resetting the elapsedTimeNano time. This function may be - * called without first calling start(). The only difference between start and restart - * is that start resets the elapsedTimeNano time, while restart does not. - * - * @return this object, for programming convenience - */ - public synchronized SimpleTimer restart() { - running = true; - startTimeNano = currentTimeNano(); - return this; - } - - /** - * @return is this timer running? - */ - public synchronized boolean isRunning() { - return running; - } - - /** - * @return A convenience function to obtain the current time in milliseconds from this timer - */ - public long currentTime() { - return System.currentTimeMillis(); - } - - /** - * @return A convenience function to obtain the current time in nanoSeconds from this timer - */ - public long currentTimeNano() { - return System.nanoTime(); - } - - /** - * Stops the timer. Increases the elapsedTimeNano time by difference between start and now. - * - * It's ok to call stop on a timer that's not running. It has no effect on the timer. - * - * @return this object, for programming convenience - */ - @Requires("startTimeNano != 0l") - public synchronized SimpleTimer stop() { - if ( running ) { - running = false; - elapsedTimeNano += currentTimeNano() - startTimeNano; - } - return this; - } - - /** - * Returns the total elapsedTimeNano time of all start/stops of this timer. 
If the timer is currently - * running, includes the difference from currentTime() and the start as well - * - * @return this time, in seconds - */ - public synchronized double getElapsedTime() { - return nanoToSecondsAsDouble(getElapsedTimeNano()); - } - - protected static double nanoToSecondsAsDouble(final long nano) { - return nano * NANO_TO_SECOND_DOUBLE; - } - - /** - * @see #getElapsedTime() but returns the result in nanoseconds - * - * @return the elapsed time in nanoseconds - */ - public synchronized long getElapsedTimeNano() { - return running ? (currentTimeNano() - startTimeNano + elapsedTimeNano) : elapsedTimeNano; - } - - /** - * Add the elapsed time from toAdd to this elapsed time - * - * @param toAdd the timer whose elapsed time we want to add to this timer - */ - public synchronized void addElapsed(final SimpleTimer toAdd) { - elapsedTimeNano += toAdd.getElapsedTimeNano(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java deleted file mode 100644 index 8f6af0158..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ /dev/null @@ -1,466 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.activeregion; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.HasGenomeLocation; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.*; - -/** - * Represents a single active region created by the Active Region Traversal for processing - * - * An active region is a single contiguous span of bases on the genome that should be operated - * on as a single unit for the active region traversal. The action may contains a list of - * reads that overlap the region (may because there may be no reads in the region). The region - * is tagged as being either active or inactive, depending on the probabilities provided by - * the isActiveProb results from the ART walker. Each region carries with it the - * exact span of the region (bases which are the core of the isActiveProbs from the walker) as - * well as an extended size, that includes the ART walker's extension size. Reads in the region - * provided by ART include all reads overlapping the extended span, not the raw span. 
- * - * User: rpoplin - * Date: 1/4/12 - */ -@Invariant({ - "extension >= 0", - "activeRegionLoc != null", - "genomeLocParser != null", - "spanIncludingReads != null", - "extendedLoc != null" -}) -public class ActiveRegion implements HasGenomeLocation { - /** - * The reads included in this active region. May be empty upon creation, and expand / contract - * as reads are added or removed from this region. - */ - private final List reads = new ArrayList(); - - /** - * An ordered list (by genomic coordinate) of the ActivityProfileStates that went - * into this active region. May be empty, which says that no supporting states were - * provided when this region was created. - */ - private final List supportingStates; - - /** - * The raw span of this active region, not including the active region extension - */ - private final GenomeLoc activeRegionLoc; - - /** - * The span of this active region on the genome, including the active region extension - */ - private final GenomeLoc extendedLoc; - - /** - * The extension, in bp, of this active region. - */ - private final int extension; - - /** - * A genomeLocParser so we can create genomeLocs - */ - private final GenomeLocParser genomeLocParser; - - /** - * Does this region represent an active region (all isActiveProbs above threshold) or - * an inactive region (all isActiveProbs below threshold)? - */ - private final boolean isActive; - - /** - * The span of this active region, including the bp covered by all reads in this - * region. This union of extensionLoc and the loc of all reads in this region. - * - * Must be at least as large as extendedLoc, but may be larger when reads - * partially overlap this region. 
- */ - private GenomeLoc spanIncludingReads; - - - /** - * Indicates whether the active region has been finalized - */ - private boolean hasBeenFinalized; - - /** - * Create a new ActiveRegion containing no reads - * - * @param activeRegionLoc the span of this active region - * @param supportingStates the states that went into creating this region, or null / empty if none are available. - * If not empty, must have exactly one state for each bp in activeRegionLoc - * @param isActive indicates whether this is an active region, or an inactve one - * @param genomeLocParser a non-null parser to let us create new genome locs - * @param extension the active region extension to use for this active region - */ - public ActiveRegion( final GenomeLoc activeRegionLoc, final List supportingStates, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) { - if ( activeRegionLoc == null ) throw new IllegalArgumentException("activeRegionLoc cannot be null"); - if ( activeRegionLoc.size() == 0 ) throw new IllegalArgumentException("Active region cannot be of zero size, but got " + activeRegionLoc); - if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null"); - if ( extension < 0 ) throw new IllegalArgumentException("extension cannot be < 0 but got " + extension); - - this.activeRegionLoc = activeRegionLoc; - this.supportingStates = supportingStates == null ? Collections.emptyList() : new ArrayList(supportingStates); - this.isActive = isActive; - this.genomeLocParser = genomeLocParser; - this.extension = extension; - this.extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); - this.spanIncludingReads = extendedLoc; - - if ( ! 
this.supportingStates.isEmpty() ) { - if ( this.supportingStates.size() != activeRegionLoc.size() ) - throw new IllegalArgumentException("Supporting states wasn't empty but it doesn't have exactly one state per bp in the active region: states " + this.supportingStates.size() + " vs. bp in region = " + activeRegionLoc.size()); - GenomeLoc lastStateLoc = null; - for ( final ActivityProfileState state : this.supportingStates ) { - if ( lastStateLoc != null ) { - if ( state.getLoc().getStart() != lastStateLoc.getStart() + 1 || state.getLoc().getContigIndex() != lastStateLoc.getContigIndex()) - throw new IllegalArgumentException("Supporting state has an invalid sequence: last state was " + lastStateLoc + " but next state was " + state); - } - lastStateLoc = state.getLoc(); - } - } - } - - /** - * Simple interface to create an active region that isActive without any profile state - */ - public ActiveRegion( final GenomeLoc activeRegionLoc, final GenomeLocParser genomeLocParser, final int extension ) { - this(activeRegionLoc, Collections.emptyList(), true, genomeLocParser, extension); - } - - @Override - public String toString() { - return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive() + " nReads=" + reads.size(); - } - - /** - * See #getActiveRegionReference but with padding == 0 - */ - public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) { - return getActiveRegionReference(referenceReader, 0); - } - - /** - * Get the reference bases from referenceReader spanned by the extended location of this active region, - * including additional padding bp on either side. 
If this expanded region would exceed the boundaries - * of the active region's contig, the returned result will be truncated to only include on-genome reference - * bases - * @param referenceReader the source of the reference genome bases - * @param padding the padding, in BP, we want to add to either side of this active region extended region - * @return a non-null array of bytes holding the reference bases in referenceReader - */ - @Ensures("result != null") - public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { - return getReference(referenceReader, padding, extendedLoc); - } - - /** - * See #getActiveRegionReference but using the span including regions not the extended span - */ - public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) { - return getFullReference(referenceReader, 0); - } - - /** - * See #getActiveRegionReference but using the span including regions not the extended span - */ - public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { - return getReference(referenceReader, padding, spanIncludingReads); - } - - /** - * Get the reference bases from referenceReader spanned by the extended location of this active region, - * including additional padding bp on either side. 
If this expanded region would exceed the boundaries - * of the active region's contig, the returned result will be truncated to only include on-genome reference - * bases - * @param referenceReader the source of the reference genome bases - * @param padding the padding, in BP, we want to add to either side of this active region extended region - * @param genomeLoc a non-null genome loc indicating the base span of the bp we'd like to get the reference for - * @return a non-null array of bytes holding the reference bases in referenceReader - */ - @Ensures("result != null") - public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { - if ( referenceReader == null ) throw new IllegalArgumentException("referenceReader cannot be null"); - if ( padding < 0 ) throw new IllegalArgumentException("padding must be a positive integer but got " + padding); - if ( genomeLoc == null ) throw new IllegalArgumentException("genomeLoc cannot be null"); - if ( genomeLoc.size() == 0 ) throw new IllegalArgumentException("GenomeLoc must have size > 0 but got " + genomeLoc); - - final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(), - Math.max(1, genomeLoc.getStart() - padding), - Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases(); - - return reference; - } - - /** - * Get the raw span of this active region (excluding the extension) - * @return a non-null genome loc - */ - @Override - @Ensures("result != null") - public GenomeLoc getLocation() { return activeRegionLoc; } - - /** - * Get the span of this active region including the extension value - * @return a non-null GenomeLoc - */ - @Ensures("result != null") - public GenomeLoc getExtendedLoc() { return extendedLoc; } - - /** - * Get the span of this active region including the extension and the projects on the - * genome of all reads in this active 
region. That is, returns the bp covered by this - * region and all reads in the region. - * @return a non-null genome loc - */ - @Ensures("result != null") - public GenomeLoc getReadSpanLoc() { return spanIncludingReads; } - - /** - * Get the active profile states that went into creating this region, if possible - * @return an unmodifiable list of states that led to the creation of this region, or an empty - * list if none were provided - */ - @Ensures("result != null") - public List getSupportingStates() { - return Collections.unmodifiableList(supportingStates); - } - - /** - * Get the active region extension applied to this region - * - * The extension is >= 0 bp in size, and indicates how much padding this art walker wanted for its regions - * - * @return the size in bp of the region extension - */ - @Ensures("result >= 0") - public int getExtension() { return extension; } - - /** - * Get an unmodifiable list of reads currently in this active region. - * - * The reads are sorted by their coordinate position - * - * @return an unmodifiable list of reads in this active region - */ - @Ensures("result != null") - public List getReads() { - return Collections.unmodifiableList(reads); - } - - /** - * Get the number of reads currently in this active region - * @return an integer >= 0 - */ - @Ensures("result >= 0") - public int size() { return reads.size(); } - - /** - * Add read to this active region - * - * Read must have alignment start >= than the last read currently in this active region. - * - * @throws IllegalArgumentException if read doesn't overlap the extended region of this active region - * - * @param read a non-null GATKSAMRecord - */ - @Ensures("reads.size() == old(reads.size()) + 1") - public void add( final GATKSAMRecord read ) { - if ( read == null ) throw new IllegalArgumentException("Read cannot be null"); - - final GenomeLoc readLoc = genomeLocParser.createGenomeLoc( read ); - if ( ! 
readOverlapsRegion(read) ) - throw new IllegalArgumentException("Read location " + readLoc + " doesn't overlap with active region extended span " + extendedLoc); - - spanIncludingReads = spanIncludingReads.union( readLoc ); - - if ( ! reads.isEmpty() ) { - final GATKSAMRecord lastRead = reads.get(size() - 1); - if ( ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) ) - throw new IllegalArgumentException("Attempting to add a read to ActiveRegion not on the same contig as other reads: lastRead " + lastRead + " attempting to add " + read); - - if ( read.getAlignmentStart() < lastRead.getAlignmentStart() ) - throw new IllegalArgumentException("Attempting to add a read to ActiveRegion out of order w.r.t. other reads: lastRead " + lastRead + " at " + lastRead.getAlignmentStart() + " attempting to add " + read + " at " + read.getAlignmentStart()); - } - - reads.add( read ); - } - - /** - * Returns true if read would overlap the extended extent of this region - * @param read the read we want to test - * @return true if read can be added to this region, false otherwise - */ - public boolean readOverlapsRegion(final GATKSAMRecord read) { - final GenomeLoc readLoc = genomeLocParser.createGenomeLoc( read ); - return readLoc.overlapsP(extendedLoc); - } - - /** - * Add all reads to this active region - * @param reads a collection of reads to add to this active region - */ - public void addAll(final Collection reads) { - if ( reads == null ) throw new IllegalArgumentException("reads cannot be null"); - for ( final GATKSAMRecord read : reads ) - add(read); - } - - /** - * Clear all of the reads currently in this active region - */ - @Ensures("size() == 0") - public void clearReads() { - spanIncludingReads = extendedLoc; - reads.clear(); - } - - /** - * Remove all of the reads in readsToRemove from this active region - * @param readsToRemove the set of reads we want to remove - */ - public void removeAll( final Set readsToRemove ) { - final Iterator it = 
reads.iterator(); - spanIncludingReads = extendedLoc; - while ( it.hasNext() ) { - final GATKSAMRecord read = it.next(); - if ( readsToRemove.contains(read) ) - it.remove(); - else - spanIncludingReads = spanIncludingReads.union( genomeLocParser.createGenomeLoc(read) ); - } - } - - /** - * Is this region equal to other, excluding any reads in either region in the comparison - * @param other the other active region we want to test - * @return true if this region is equal, excluding any reads and derived values, to other - */ - protected boolean equalExceptReads(final ActiveRegion other) { - if ( activeRegionLoc.compareTo(other.activeRegionLoc) != 0 ) return false; - if ( isActive() != other.isActive()) return false; - if ( genomeLocParser != other.genomeLocParser ) return false; - if ( extension != other.extension ) return false; - if ( extendedLoc.compareTo(other.extendedLoc) != 0 ) return false; - return true; - } - - /** - * Does this region represent an active region (all isActiveProbs above threshold) or - * an inactive region (all isActiveProbs below threshold)? 
- */ - public boolean isActive() { - return isActive; - } - - /** - * Intersect this active region with the allowed intervals, returning a list of active regions - * that only contain locations present in intervals - * - * Note that the returned list may be empty, if this active region doesn't overlap the set at all - * - * Note that the resulting regions are all empty, regardless of whether the current active region has reads - * - * @param intervals a non-null set of intervals that are allowed - * @return an ordered list of active region where each interval is contained within intervals - */ - @Ensures("result != null") - protected List splitAndTrimToIntervals(final GenomeLocSortedSet intervals) { - final List allOverlapping = intervals.getOverlapping(getLocation()); - final List clippedRegions = new LinkedList(); - - for ( final GenomeLoc overlapping : allOverlapping ) { - clippedRegions.add(trim(overlapping, extension)); - } - - return clippedRegions; - } - - /** - * Trim this active to just the newExtent, producing a new active region without any reads that has only - * the extent of newExtend intersected with the current extent - * @param newExtent the new extend of the active region we want - * @param newExtension the extension size we want for the newly trimmed active region - * @return a non-null, empty active region - */ - public ActiveRegion trim(final GenomeLoc newExtent, final int newExtension) { - if ( newExtent == null ) throw new IllegalArgumentException("Active region extent cannot be null"); - - final GenomeLoc subLoc = getLocation().intersect(newExtent); - final int subStart = subLoc.getStart() - getLocation().getStart(); - final int subEnd = subStart + subLoc.size(); - final List subStates = supportingStates.isEmpty() ? 
supportingStates : supportingStates.subList(subStart, subEnd); - return new ActiveRegion( subLoc, subStates, isActive, genomeLocParser, newExtension ); - } - - /** - * Trim this active to no more than the newExtent, producing a new active region without any reads that - * attempts to provide the best possible representation of this active region covering the newExtent. - * - * The challenge here is that newExtent may (1) be larger than can be represented by this active region - * + its original extension and (2) the extension must be symmetric on both sides. This algorithm - * therefore determines how best to represent newExtent as a subset of the span of this - * region with a padding value that captures as much of the newExtent as possible. - * - * For example, suppose this active region is - * - * Active: 100-200 with extension of 50, so that the true span is 50-250 - * NewExtent: 150-225 saying that we'd ideally like to just have bases 150-225 - * - * Here we represent the active region as a active region from 150-200 with 25 bp of padding. 
- * - * The overall constraint is that the active region can never exceed the original active region, and - * the extension is chosen to maximize overlap with the desired region - * - * @param newExtent the new extend of the active region we want - * @return a non-null, empty active region - */ - public ActiveRegion trim(final GenomeLoc newExtent) { - if ( newExtent == null ) throw new IllegalArgumentException("Active region extent cannot be null"); - - final GenomeLoc subActive = getLocation().intersect(newExtent); - final int requiredOnRight = Math.max(newExtent.getStop() - subActive.getStop(), 0); - final int requiredOnLeft = Math.max(subActive.getStart() - newExtent.getStart(), 0); - final int requiredExtension = Math.min(Math.max(requiredOnLeft, requiredOnRight), getExtension()); - - return new ActiveRegion( subActive, Collections.emptyList(), isActive, genomeLocParser, requiredExtension ); - } - - public void setFinalized(final boolean value) { - hasBeenFinalized = value; - } - - public boolean isFinalized() { - return hasBeenFinalized; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java deleted file mode 100644 index 836c16a7e..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ /dev/null @@ -1,642 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* 
included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.clipping; - -import com.google.java.contract.Requires; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.utils.recalibration.EventType; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.Iterator; -import java.util.List; -import java.util.Stack; -import java.util.Vector; - -/** - * Represents a clip on a read. It has a type (see the enum) along with a start and stop in the bases - * of the read, plus an option extraInfo (useful for carrying info where needed). - *

- * Also holds the critical apply function that actually execute the clipping operation on a provided read, - * according to the wishes of the supplied ClippingAlgorithm enum. - */ -public class ClippingOp { - public final int start, stop; // inclusive - - public ClippingOp(int start, int stop) { - this.start = start; - this.stop = stop; - } - - - public int getLength() { - return stop - start + 1; - } - - /** - * Clips the bases in read according to this operation's start and stop. Uses the clipping - * representation used is the one provided by algorithm argument. - * - * @param algorithm clipping algorithm to use - * @param originalRead the read to be clipped - */ - public GATKSAMRecord apply(ClippingRepresentation algorithm, GATKSAMRecord originalRead) { - GATKSAMRecord read; - try { - read = (GATKSAMRecord) originalRead.clone(); - } catch (CloneNotSupportedException e) { - throw new ReviewedStingException("Where did the clone go?"); - } - byte[] quals = read.getBaseQualities(); - byte[] bases = read.getReadBases(); - byte[] newBases = new byte[bases.length]; - byte[] newQuals = new byte[quals.length]; - - switch (algorithm) { - // important note: - // it's not safe to call read.getReadBases()[i] = 'N' or read.getBaseQualities()[i] = 0 - // because you're not guaranteed to get a pointer to the actual array of bytes in the GATKSAMRecord - case WRITE_NS: - for (int i = 0; i < bases.length; i++) { - if (i >= start && i <= stop) { - newBases[i] = 'N'; - } - else { - newBases[i] = bases[i]; - } - } - read.setReadBases(newBases); - break; - case WRITE_Q0S: - for (int i = 0; i < quals.length; i++) { - if (i >= start && i <= stop) { - newQuals[i] = 0; - } - else { - newQuals[i] = quals[i]; - } - } - read.setBaseQualities(newQuals); - break; - case WRITE_NS_Q0S: - for (int i = 0; i < bases.length; i++) { - if (i >= start && i <= stop) { - newQuals[i] = 0; - newBases[i] = 'N'; - } - else { - newQuals[i] = quals[i]; - newBases[i] = bases[i]; - } - } - 
read.setBaseQualities(newBases); - read.setReadBases(newBases); - break; - case HARDCLIP_BASES: - read = hardClip(read, start, stop); - break; - - case SOFTCLIP_BASES: - if (read.getReadUnmappedFlag()) { - // we can't process unmapped reads - throw new UserException("Read Clipper cannot soft clip unmapped reads"); - } - - //System.out.printf("%d %d %d%n", stop, start, read.getReadLength()); - int myStop = stop; - if ((stop + 1 - start) == read.getReadLength()) { - // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone - //Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", read.getReadName())); - //break; - myStop--; // just decrement stop - } - - if (start > 0 && myStop != read.getReadLength() - 1) - throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", read.getReadName(), start, myStop)); - - Cigar oldCigar = read.getCigar(); - - int scLeft = 0, scRight = read.getReadLength(); - if (start == 0) - scLeft = myStop + 1; - else - scRight = start; - - Cigar newCigar = softClip(oldCigar, scLeft, scRight); - read.setCigar(newCigar); - - int newClippedStart = getNewAlignmentStartOffset(newCigar, oldCigar); - int newStart = read.getAlignmentStart() + newClippedStart; - read.setAlignmentStart(newStart); - - break; - - case REVERT_SOFTCLIPPED_BASES: - read = revertSoftClippedBases(read); - break; - - default: - throw new IllegalStateException("Unexpected Clipping operator type " + algorithm); - } - - return read; - } - - private GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) { - GATKSAMRecord unclipped; - - // shallow copy of the read bases and quals should be fine here because they are immutable in the original read - try { - unclipped = (GATKSAMRecord) read.clone(); - } catch (CloneNotSupportedException e) { - throw new ReviewedStingException("Where did 
the clone go?"); - } - - Cigar unclippedCigar = new Cigar(); - int matchesCount = 0; - for (CigarElement element : read.getCigar().getCigarElements()) { - if (element.getOperator() == CigarOperator.SOFT_CLIP || element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) - matchesCount += element.getLength(); - else if (matchesCount > 0) { - unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); - matchesCount = 0; - unclippedCigar.add(element); - } else - unclippedCigar.add(element); - } - if (matchesCount > 0) - unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); - - unclipped.setCigar(unclippedCigar); - final int newStart = read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), unclippedCigar); - unclipped.setAlignmentStart(newStart); - - if ( newStart <= 0 ) { - // if the start of the unclipped read occurs before the contig, - // we must hard clip away the bases since we cannot represent reads with - // negative or 0 alignment start values in the SAMRecord (e.g., 0 means unaligned) - return hardClip(unclipped, 0, - newStart); - } else { - return unclipped; - } - } - - /** - * Given a cigar string, get the number of bases hard or soft clipped at the start - */ - private int getNewAlignmentStartOffset(final Cigar __cigar, final Cigar __oldCigar) { - int num = 0; - for (CigarElement e : __cigar.getCigarElements()) { - if (!e.getOperator().consumesReferenceBases()) { - if (e.getOperator().consumesReadBases()) { - num += e.getLength(); - } - } else { - break; - } - } - - int oldNum = 0; - int curReadCounter = 0; - - for (CigarElement e : __oldCigar.getCigarElements()) { - int curRefLength = e.getLength(); - int curReadLength = e.getLength(); - if (!e.getOperator().consumesReadBases()) { - curReadLength = 0; - } - - boolean truncated = false; - if (curReadCounter + curReadLength > num) { - curReadLength = num - curReadCounter; - curRefLength = num - curReadCounter; - truncated = true; - } - 
- if (!e.getOperator().consumesReferenceBases()) { - curRefLength = 0; - } - - curReadCounter += curReadLength; - oldNum += curRefLength; - - if (curReadCounter > num || truncated) { - break; - } - } - - return oldNum; - } - - /** - * Given a cigar string, soft clip up to startClipEnd and soft clip starting at endClipBegin - */ - private Cigar softClip(final Cigar __cigar, final int __startClipEnd, final int __endClipBegin) { - if (__endClipBegin <= __startClipEnd) { - //whole thing should be soft clipped - int cigarLength = 0; - for (CigarElement e : __cigar.getCigarElements()) { - cigarLength += e.getLength(); - } - - Cigar newCigar = new Cigar(); - newCigar.add(new CigarElement(cigarLength, CigarOperator.SOFT_CLIP)); - assert newCigar.isValid(null, -1) == null; - return newCigar; - } - - int curLength = 0; - Vector newElements = new Vector(); - for (CigarElement curElem : __cigar.getCigarElements()) { - if (!curElem.getOperator().consumesReadBases()) { - if (curElem.getOperator() == CigarOperator.HARD_CLIP || curLength > __startClipEnd && curLength < __endClipBegin) { - newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator())); - } - continue; - } - - int s = curLength; - int e = curLength + curElem.getLength(); - if (e <= __startClipEnd || s >= __endClipBegin) { - //must turn this entire thing into a clip - newElements.add(new CigarElement(curElem.getLength(), CigarOperator.SOFT_CLIP)); - } else if (s >= __startClipEnd && e <= __endClipBegin) { - //same thing - newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator())); - } else { - //we are clipping in the middle of this guy - CigarElement newStart = null; - CigarElement newMid = null; - CigarElement newEnd = null; - - int midLength = curElem.getLength(); - if (s < __startClipEnd) { - newStart = new CigarElement(__startClipEnd - s, CigarOperator.SOFT_CLIP); - midLength -= newStart.getLength(); - } - - if (e > __endClipBegin) { - newEnd = new CigarElement(e - 
__endClipBegin, CigarOperator.SOFT_CLIP); - midLength -= newEnd.getLength(); - } - assert midLength >= 0; - if (midLength > 0) { - newMid = new CigarElement(midLength, curElem.getOperator()); - } - if (newStart != null) { - newElements.add(newStart); - } - if (newMid != null) { - newElements.add(newMid); - } - if (newEnd != null) { - newElements.add(newEnd); - } - } - curLength += curElem.getLength(); - } - - Vector finalNewElements = new Vector(); - CigarElement lastElement = null; - for (CigarElement elem : newElements) { - if (lastElement == null || lastElement.getOperator() != elem.getOperator()) { - if (lastElement != null) { - finalNewElements.add(lastElement); - } - lastElement = elem; - } else { - lastElement = new CigarElement(lastElement.getLength() + elem.getLength(), lastElement.getOperator()); - } - } - if (lastElement != null) { - finalNewElements.add(lastElement); - } - - Cigar newCigar = new Cigar(finalNewElements); - assert newCigar.isValid(null, -1) == null; - return newCigar; - } - - /** - * Hard clip bases from read, from start to stop in base coordinates - * - * If start == 0, then we will clip from the front of the read, otherwise we clip - * from the right. If start == 0 and stop == 10, this would clip out the first - * 10 bases of the read. - * - * Note that this function works with reads with negative alignment starts, in order to - * allow us to hardClip reads that have had their soft clips reverted and so might have - * negative alignment starts - * - * Works properly with reduced reads and insertion/deletion base qualities - * - * @param read a non-null read - * @param start a start >= 0 and < read.length - * @param stop a stop >= 0 and < read.length. 
- * @return a cloned version of read that has been properly trimmed down - */ - private GATKSAMRecord hardClip(GATKSAMRecord read, int start, int stop) { - final int firstBaseAfterSoftClips = read.getAlignmentStart() - read.getSoftStart(); - final int lastBaseBeforeSoftClips = read.getSoftEnd() - read.getSoftStart(); - - if (start == firstBaseAfterSoftClips && stop == lastBaseBeforeSoftClips) // note that if the read has no soft clips, these constants will be 0 and read length - 1 (beauty of math). - return GATKSAMRecord.emptyRead(read); - - // If the read is unmapped there is no Cigar string and neither should we create a new cigar string - CigarShift cigarShift = (read.getReadUnmappedFlag()) ? new CigarShift(new Cigar(), 0, 0) : hardClipCigar(read.getCigar(), start, stop); - - // the cigar may force a shift left or right (or both) in case we are left with insertions - // starting or ending the read after applying the hard clip on start/stop. - int newLength = read.getReadLength() - (stop - start + 1) - cigarShift.shiftFromStart - cigarShift.shiftFromEnd; - byte[] newBases = new byte[newLength]; - byte[] newQuals = new byte[newLength]; - int copyStart = (start == 0) ? stop + 1 + cigarShift.shiftFromStart : cigarShift.shiftFromStart; - - System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength); - System.arraycopy(read.getBaseQualities(), copyStart, newQuals, 0, newLength); - - final GATKSAMRecord hardClippedRead; - try { - hardClippedRead = (GATKSAMRecord) read.clone(); - } catch (CloneNotSupportedException e) { - throw new ReviewedStingException("Where did the clone go?"); - } - - hardClippedRead.resetSoftStartAndEnd(); // reset the cached soft start and end because they may have changed now that the read was hard clipped. No need to calculate them now. 
They'll be lazily calculated on the next call to getSoftStart()/End() - hardClippedRead.setBaseQualities(newQuals); - hardClippedRead.setReadBases(newBases); - hardClippedRead.setCigar(cigarShift.cigar); - if (start == 0) - hardClippedRead.setAlignmentStart(read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), cigarShift.cigar)); - - if (read.hasBaseIndelQualities()) { - byte[] newBaseInsertionQuals = new byte[newLength]; - byte[] newBaseDeletionQuals = new byte[newLength]; - System.arraycopy(read.getBaseInsertionQualities(), copyStart, newBaseInsertionQuals, 0, newLength); - System.arraycopy(read.getBaseDeletionQualities(), copyStart, newBaseDeletionQuals, 0, newLength); - hardClippedRead.setBaseQualities(newBaseInsertionQuals, EventType.BASE_INSERTION); - hardClippedRead.setBaseQualities(newBaseDeletionQuals, EventType.BASE_DELETION); - } - - if (read.isReducedRead()) { - final int[] reducedCounts = new int[newLength]; - System.arraycopy(read.getReducedReadCounts(), copyStart, reducedCounts, 0, newLength); - hardClippedRead.setReducedReadCounts(reducedCounts); - } - - return hardClippedRead; - - } - - @Requires({"!cigar.isEmpty()"}) - private CigarShift hardClipCigar(Cigar cigar, int start, int stop) { - Cigar newCigar = new Cigar(); - int index = 0; - int totalHardClipCount = stop - start + 1; - int alignmentShift = 0; // caused by hard clipping deletions - - // hard clip the beginning of the cigar string - if (start == 0) { - Iterator cigarElementIterator = cigar.getCigarElements().iterator(); - CigarElement cigarElement = cigarElementIterator.next(); - // Skip all leading hard clips - while (cigarElement.getOperator() == CigarOperator.HARD_CLIP) { - totalHardClipCount += cigarElement.getLength(); - if (cigarElementIterator.hasNext()) - cigarElement = cigarElementIterator.next(); - else - throw new ReviewedStingException("Read is entirely hardclipped, shouldn't be trying to clip it's cigar string"); - } - // keep clipping until we hit stop - 
while (index <= stop) { - int shift = 0; - if (cigarElement.getOperator().consumesReadBases()) - shift = cigarElement.getLength(); - - // we're still clipping or just finished perfectly - if (index + shift == stop + 1) { - alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength()); - newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP)); - } - // element goes beyond what we need to clip - else if (index + shift > stop + 1) { - int elementLengthAfterChopping = cigarElement.getLength() - (stop - index + 1); - alignmentShift += calculateHardClippingAlignmentShift(cigarElement, stop - index + 1); - newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP)); - newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator())); - } - index += shift; - alignmentShift += calculateHardClippingAlignmentShift(cigarElement, shift); - - if (index <= stop && cigarElementIterator.hasNext()) - cigarElement = cigarElementIterator.next(); - else - break; - } - - // add the remaining cigar elements - while (cigarElementIterator.hasNext()) { - cigarElement = cigarElementIterator.next(); - newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator())); - } - } - - // hard clip the end of the cigar string - else { - Iterator cigarElementIterator = cigar.getCigarElements().iterator(); - CigarElement cigarElement = cigarElementIterator.next(); - - // Keep marching on until we find the start - while (index < start) { - int shift = 0; - if (cigarElement.getOperator().consumesReadBases()) - shift = cigarElement.getLength(); - - // we haven't gotten to the start yet, keep everything as is. 
- if (index + shift < start) - newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator())); - - // element goes beyond our clip starting position - else { - int elementLengthAfterChopping = start - index; - alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength() - (start - index)); - - // if this last element is a HARD CLIP operator, just merge it with our hard clip operator to be added later - if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) - totalHardClipCount += elementLengthAfterChopping; - // otherwise, maintain what's left of this last operator - else - newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator())); - } - index += shift; - if (index < start && cigarElementIterator.hasNext()) - cigarElement = cigarElementIterator.next(); - else - break; - } - - // check if we are hard clipping indels - while (cigarElementIterator.hasNext()) { - cigarElement = cigarElementIterator.next(); - alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength()); - - // if the read had a HardClip operator in the end, combine it with the Hard Clip we are adding - if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) - totalHardClipCount += cigarElement.getLength(); - } - newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP)); - } - return cleanHardClippedCigar(newCigar); - } - - /** - * Checks if a hard clipped cigar left a read starting or ending with insertions/deletions - * and cleans it up accordingly. 
- * - * @param cigar the original cigar - * @return an object with the shifts (see CigarShift class) - */ - private CigarShift cleanHardClippedCigar(Cigar cigar) { - Cigar cleanCigar = new Cigar(); - int shiftFromStart = 0; - int shiftFromEnd = 0; - Stack cigarStack = new Stack(); - Stack inverseCigarStack = new Stack(); - - for (CigarElement cigarElement : cigar.getCigarElements()) - cigarStack.push(cigarElement); - - for (int i = 1; i <= 2; i++) { - int shift = 0; - int totalHardClip = 0; - boolean readHasStarted = false; - boolean addedHardClips = false; - - while (!cigarStack.empty()) { - CigarElement cigarElement = cigarStack.pop(); - - if (!readHasStarted && -// cigarElement.getOperator() != CigarOperator.INSERTION && - cigarElement.getOperator() != CigarOperator.DELETION && - cigarElement.getOperator() != CigarOperator.HARD_CLIP) - readHasStarted = true; - - else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.HARD_CLIP) - totalHardClip += cigarElement.getLength(); - - else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION) - totalHardClip += cigarElement.getLength(); - - if (readHasStarted) { - if (i == 1) { - if (!addedHardClips) { - if (totalHardClip > 0) - inverseCigarStack.push(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP)); - addedHardClips = true; - } - inverseCigarStack.push(cigarElement); - } else { - if (!addedHardClips) { - if (totalHardClip > 0) - cleanCigar.add(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP)); - addedHardClips = true; - } - cleanCigar.add(cigarElement); - } - } - } - // first pass (i=1) is from end to start of the cigar elements - if (i == 1) { - shiftFromEnd = shift; - cigarStack = inverseCigarStack; - } - // second pass (i=2) is from start to end with the end already cleaned - else { - shiftFromStart = shift; - } - } - return new CigarShift(cleanCigar, shiftFromStart, shiftFromEnd); - } - - /** - * Compute the offset of the first "real" position in the cigar on 
the genome - * - * This is defined as a first position after a run of Hs followed by a run of Ss - * - * @param cigar A non-null cigar - * @return the offset (from 0) of the first on-genome base - */ - private int calcHardSoftOffset(final Cigar cigar) { - final List elements = cigar.getCigarElements(); - - int size = 0; - int i = 0; - while ( i < elements.size() && elements.get(i).getOperator() == CigarOperator.HARD_CLIP ) { - size += elements.get(i).getLength(); - i++; - } - while ( i < elements.size() && elements.get(i).getOperator() == CigarOperator.SOFT_CLIP ) { - size += elements.get(i).getLength(); - i++; - } - - return size; - } - - private int calculateAlignmentStartShift(Cigar oldCigar, Cigar newCigar) { - final int newShift = calcHardSoftOffset(newCigar); - final int oldShift = calcHardSoftOffset(oldCigar); - return newShift - oldShift; - } - - private int calculateHardClippingAlignmentShift(CigarElement cigarElement, int clippedLength) { - // Insertions should be discounted from the total hard clip count - if (cigarElement.getOperator() == CigarOperator.INSERTION) - return -clippedLength; - - // Deletions and Ns should be added to the total hard clip count (because we want to maintain the original alignment start) - else if (cigarElement.getOperator() == CigarOperator.DELETION || cigarElement.getOperator() == CigarOperator.SKIPPED_REGION) - return cigarElement.getLength(); - - // There is no shift if we are not clipping an indel - return 0; - } - - private static class CigarShift { - private Cigar cigar; - private int shiftFromStart; - private int shiftFromEnd; - - private CigarShift(Cigar cigar, int shiftFromStart, int shiftFromEnd) { - this.cigar = cigar; - this.shiftFromStart = shiftFromStart; - this.shiftFromEnd = shiftFromEnd; - } - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java deleted file mode 100644 
index eaefa3aba..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ /dev/null @@ -1,549 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.clipping; - -import com.google.java.contract.Requires; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.recalibration.EventType; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -import java.util.ArrayList; -import java.util.List; - -/** - * A comprehensive clipping tool. - * - * General Contract: - * - All clipping operations return a new read with the clipped bases requested, it never modifies the original read. 
- * - If a read is fully clipped, return an empty GATKSAMRecord, never null. - * - When hard clipping, add cigar operator H for every *reference base* removed (i.e. Matches, SoftClips and Deletions, but *not* insertions). See Hard Clipping notes for details. - * - * - * There are several types of clipping to use: - * - * Write N's: - * Change the bases to N's in the desired region. This can be applied anywhere in the read. - * - * Write Q0's: - * Change the quality of the bases in the desired region to Q0. This can be applied anywhere in the read. - * - * Write both N's and Q0's: - * Same as the two independent operations, put together. - * - * Soft Clipping: - * Do not change the read, just mark the reads as soft clipped in the Cigar String - * and adjust the alignment start and end of the read. - * - * Hard Clipping: - * Creates a new read without the hard clipped bases (and base qualities). The cigar string - * will be updated with the cigar operator H for every reference base removed (i.e. Matches, - * Soft clipped bases and deletions, but *not* insertions). This contract with the cigar - * is necessary to allow read.getUnclippedStart() / End() to recover the original alignment - * of the read (before clipping). - * - */ -public class ReadClipper { - final GATKSAMRecord read; - boolean wasClipped; - List ops = null; - - /** - * Initializes a ReadClipper object. - * - * You can set up your clipping operations using the addOp method. When you're ready to - * generate a new read with all the clipping operations, use clipRead(). - * - * Note: Use this if you want to set up multiple operations on the read using the ClippingOp - * class. If you just want to apply one of the typical modes of clipping, use the static - * clipping functions available in this class instead. - * - * @param read the read to clip - */ - public ReadClipper(final GATKSAMRecord read) { - this.read = read; - this.wasClipped = false; - } - - /** - * Add clipping operation to the read. 
- * - * You can add as many operations as necessary to this read before clipping. Beware that the - * order in which you add these operations matter. For example, if you hard clip the beginning - * of a read first then try to hard clip the end, the indices will have changed. Make sure you - * know what you're doing, otherwise just use the static functions below that take care of the - * ordering for you. - * - * Note: You only choose the clipping mode when you use clipRead() - * - * @param op a ClippingOp object describing the area you want to clip. - */ - public void addOp(ClippingOp op) { - if (ops == null) ops = new ArrayList(); - ops.add(op); - } - - /** - * Check the list of operations set up for this read. - * - * @return a list of the operations set up for this read. - */ - public List getOps() { - return ops; - } - - /** - * Check whether or not this read has been clipped. - * @return true if this read has produced a clipped read, false otherwise. - */ - public boolean wasClipped() { - return wasClipped; - } - - /** - * The original read. - * - * @return returns the read to be clipped (original) - */ - public GATKSAMRecord getRead() { - return read; - } - - /** - * Clips a read according to ops and the chosen algorithm. - * - * @param algorithm What mode of clipping do you want to apply for the stacked operations. - * @return the read with the clipping applied. 
- */ - public GATKSAMRecord clipRead(ClippingRepresentation algorithm) { - if (ops == null) - return getRead(); - - GATKSAMRecord clippedRead = read; - for (ClippingOp op : getOps()) { - final int readLength = clippedRead.getReadLength(); - //check if the clipped read can still be clipped in the range requested - if (op.start < readLength) { - ClippingOp fixedOperation = op; - if (op.stop >= readLength) - fixedOperation = new ClippingOp(op.start, readLength - 1); - - clippedRead = fixedOperation.apply(algorithm, clippedRead); - } - } - wasClipped = true; - ops.clear(); - if ( clippedRead.isEmpty() ) - return GATKSAMRecord.emptyRead(clippedRead); - return clippedRead; - } - - - /** - * Hard clips the left tail of a read up to (and including) refStop using reference - * coordinates. - * - * @param refStop the last base to be hard clipped in the left tail of the read. - * @return a new read, without the left tail. - */ - @Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip - private GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(int refStop) { - return hardClipByReferenceCoordinates(-1, refStop); - } - public static GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(GATKSAMRecord read, int refStop) { - return (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, refStop); - } - - - - /** - * Hard clips the right tail of a read starting at (and including) refStart using reference - * coordinates. - * - * @param refStart refStop the first base to be hard clipped in the right tail of the read. - * @return a new read, without the right tail. 
- */ - @Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip - private GATKSAMRecord hardClipByReferenceCoordinatesRightTail(int refStart) { - return hardClipByReferenceCoordinates(refStart, -1); - } - public static GATKSAMRecord hardClipByReferenceCoordinatesRightTail(GATKSAMRecord read, int refStart) { - return (new ReadClipper(read)).hardClipByReferenceCoordinates(refStart, -1); - } - - /** - * Hard clips a read using read coordinates. - * - * @param start the first base to clip (inclusive) - * @param stop the last base to clip (inclusive) - * @return a new read, without the clipped bases - */ - @Requires({"start >= 0 && stop <= read.getReadLength() - 1", // start and stop have to be within the read - "start == 0 || stop == read.getReadLength() - 1"}) // cannot clip the middle of the read - private GATKSAMRecord hardClipByReadCoordinates(int start, int stop) { - if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1)) - return GATKSAMRecord.emptyRead(read); - - this.addOp(new ClippingOp(start, stop)); - return clipRead(ClippingRepresentation.HARDCLIP_BASES); - } - public static GATKSAMRecord hardClipByReadCoordinates(GATKSAMRecord read, int start, int stop) { - return (new ReadClipper(read)).hardClipByReadCoordinates(start, stop); - } - - - /** - * Hard clips both tails of a read. 
- * Left tail goes from the beginning to the 'left' coordinate (inclusive) - * Right tail goes from the 'right' coordinate (inclusive) until the end of the read - * - * @param left the coordinate of the last base to be clipped in the left tail (inclusive) - * @param right the coordinate of the first base to be clipped in the right tail (inclusive) - * @return a new read, without the clipped bases - */ - @Requires({"left <= right", // tails cannot overlap - "left >= read.getAlignmentStart()", // coordinate has to be within the mapped read - "right <= read.getAlignmentEnd()"}) // coordinate has to be within the mapped read - private GATKSAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) { - if (read.isEmpty() || left == right) - return GATKSAMRecord.emptyRead(read); - GATKSAMRecord leftTailRead = hardClipByReferenceCoordinates(right, -1); - - // after clipping one tail, it is possible that the consequent hard clipping of adjacent deletions - // make the left cut index no longer part of the read. In that case, clip the read entirely. - if (left > leftTailRead.getAlignmentEnd()) - return GATKSAMRecord.emptyRead(read); - - ReadClipper clipper = new ReadClipper(leftTailRead); - return clipper.hardClipByReferenceCoordinatesLeftTail(left); - } - public static GATKSAMRecord hardClipBothEndsByReferenceCoordinates(GATKSAMRecord read, int left, int right) { - return (new ReadClipper(read)).hardClipBothEndsByReferenceCoordinates(left, right); - } - - - /** - * Clips any contiguous tail (left, right or both) with base quality lower than lowQual using the desired algorithm. - * - * This function will look for low quality tails and hard clip them away. A low quality tail - * ends when a base has base quality greater than lowQual. - * - * @param algorithm the algorithm to use (HardClip, SoftClip, Write N's,...) 
- * @param lowQual every base quality lower than or equal to this in the tail of the read will be hard clipped - * @return a new read without low quality tails - */ - private GATKSAMRecord clipLowQualEnds(ClippingRepresentation algorithm, byte lowQual) { - if (read.isEmpty()) - return read; - - final byte [] quals = read.getBaseQualities(); - final int readLength = read.getReadLength(); - int leftClipIndex = 0; - int rightClipIndex = readLength - 1; - - // check how far we can clip both sides - while (rightClipIndex >= 0 && quals[rightClipIndex] <= lowQual) rightClipIndex--; - while (leftClipIndex < readLength && quals[leftClipIndex] <= lowQual) leftClipIndex++; - - // if the entire read should be clipped, then return an empty read. - if (leftClipIndex > rightClipIndex) - return GATKSAMRecord.emptyRead(read); - - if (rightClipIndex < readLength - 1) { - this.addOp(new ClippingOp(rightClipIndex + 1, readLength - 1)); - } - if (leftClipIndex > 0 ) { - this.addOp(new ClippingOp(0, leftClipIndex - 1)); - } - return this.clipRead(algorithm); - } - - private GATKSAMRecord hardClipLowQualEnds(byte lowQual) { - return this.clipLowQualEnds(ClippingRepresentation.HARDCLIP_BASES, lowQual); - } - public static GATKSAMRecord hardClipLowQualEnds(GATKSAMRecord read, byte lowQual) { - return (new ReadClipper(read)).hardClipLowQualEnds(lowQual); - } - public static GATKSAMRecord clipLowQualEnds(GATKSAMRecord read, byte lowQual, ClippingRepresentation algorithm) { - return (new ReadClipper(read)).clipLowQualEnds(algorithm, lowQual); - } - - - /** - * Will hard clip every soft clipped bases in the read. 
- * - * @return a new read without the soft clipped bases - */ - private GATKSAMRecord hardClipSoftClippedBases () { - if (read.isEmpty()) - return read; - - int readIndex = 0; - int cutLeft = -1; // first position to hard clip (inclusive) - int cutRight = -1; // first position to hard clip (inclusive) - boolean rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail - - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - if (cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { - if (rightTail) { - cutRight = readIndex; - } - else { - cutLeft = readIndex + cigarElement.getLength() - 1; - } - } - else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP) - rightTail = true; - - if (cigarElement.getOperator().consumesReadBases()) - readIndex += cigarElement.getLength(); - } - - // It is extremely important that we cut the end first otherwise the read coordinates change. - if (cutRight >= 0) - this.addOp(new ClippingOp(cutRight, read.getReadLength() - 1)); - if (cutLeft >= 0) - this.addOp(new ClippingOp(0, cutLeft)); - - return clipRead(ClippingRepresentation.HARDCLIP_BASES); - } - public static GATKSAMRecord hardClipSoftClippedBases (GATKSAMRecord read) { - return (new ReadClipper(read)).hardClipSoftClippedBases(); - } - - - /** - * Hard clip the read to the variable region (from refStart to refStop) - * - * @param read the read to be clipped - * @param refStart the beginning of the variant region (inclusive) - * @param refStop the end of the variant region (inclusive) - * @return the read hard clipped to the variant region - */ - public static GATKSAMRecord hardClipToRegion( final GATKSAMRecord read, final int refStart, final int refStop ) { - final int start = read.getAlignmentStart(); - final int stop = read.getAlignmentEnd(); - - // check if the read is contained in region - if (start <= refStop && stop >= refStart) { - if (start < refStart && stop > refStop) - return 
hardClipBothEndsByReferenceCoordinates(read, refStart - 1, refStop + 1); - else if (start < refStart) - return hardClipByReferenceCoordinatesLeftTail(read, refStart - 1); - else if (stop > refStop) - return hardClipByReferenceCoordinatesRightTail(read, refStop + 1); - return read; - } else - return GATKSAMRecord.emptyRead(read); - - } - public static List hardClipToRegion( final List reads, final int refStart, final int refStop ) { - final List returnList = new ArrayList( reads.size() ); - for( final GATKSAMRecord read : reads ) { - final GATKSAMRecord clippedRead = hardClipToRegion( read, refStart, refStop ); - if( !clippedRead.isEmpty() ) { - returnList.add( clippedRead ); - } - } - return returnList; - } - - /** - * Checks if a read contains adaptor sequences. If it does, hard clips them out. - * - * Note: To see how a read is checked for adaptor sequence see ReadUtils.getAdaptorBoundary() - * - * @return a new read without adaptor sequence - */ - private GATKSAMRecord hardClipAdaptorSequence () { - final int adaptorBoundary = ReadUtils.getAdaptorBoundary(read); - - if (adaptorBoundary == ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY || !ReadUtils.isInsideRead(read, adaptorBoundary)) - return read; - - return read.getReadNegativeStrandFlag() ? hardClipByReferenceCoordinatesLeftTail(adaptorBoundary) : hardClipByReferenceCoordinatesRightTail(adaptorBoundary); - } - public static GATKSAMRecord hardClipAdaptorSequence (GATKSAMRecord read) { - return (new ReadClipper(read)).hardClipAdaptorSequence(); - } - - - /** - * Hard clips any leading insertions in the read. Only looks at the beginning of the read, not the end. 
- * - * @return a new read without leading insertions - */ - private GATKSAMRecord hardClipLeadingInsertions() { - if (read.isEmpty()) - return read; - - for(CigarElement cigarElement : read.getCigar().getCigarElements()) { - if (cigarElement.getOperator() != CigarOperator.HARD_CLIP && cigarElement.getOperator() != CigarOperator.SOFT_CLIP && - cigarElement.getOperator() != CigarOperator.INSERTION) - break; - - else if (cigarElement.getOperator() == CigarOperator.INSERTION) - this.addOp(new ClippingOp(0, cigarElement.getLength() - 1)); - - } - return clipRead(ClippingRepresentation.HARDCLIP_BASES); - } - public static GATKSAMRecord hardClipLeadingInsertions(GATKSAMRecord read) { - return (new ReadClipper(read)).hardClipLeadingInsertions(); - } - - - /** - * Turns soft clipped bases into matches - * @return a new read with every soft clip turned into a match - */ - private GATKSAMRecord revertSoftClippedBases() { - if (read.isEmpty()) - return read; - - this.addOp(new ClippingOp(0, 0)); - return this.clipRead(ClippingRepresentation.REVERT_SOFTCLIPPED_BASES); - } - - /** - * Reverts ALL soft-clipped bases - * - * @param read the read - * @return the read with all soft-clipped bases turned into matches - */ - public static GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) { - return (new ReadClipper(read)).revertSoftClippedBases(); - } - - /** - * Reverts only soft clipped bases with quality score greater than or equal to minQual - * - * todo -- Note: Will write a temporary field with the number of soft clips that were undone on each side (left: 'SL', right: 'SR') -- THIS HAS BEEN REMOVED TEMPORARILY SHOULD HAPPEN INSIDE THE CLIPPING ROUTINE! 
- * - * @param read the read - * @param minQual the mininum base quality score to revert the base (inclusive) - * @return a new read with high quality soft clips reverted - */ - public static GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read, byte minQual) { - return revertSoftClippedBases(hardClipLowQualitySoftClips(read, minQual)); - } - - /** - * Hard clips away soft clipped bases that are below the given quality threshold - * - * @param read the read - * @param minQual the mininum base quality score to revert the base (inclusive) - * @return a new read without low quality soft clipped bases - */ - public static GATKSAMRecord hardClipLowQualitySoftClips(GATKSAMRecord read, byte minQual) { - int nLeadingSoftClips = read.getAlignmentStart() - read.getSoftStart(); - if (read.isEmpty() || nLeadingSoftClips > read.getReadLength()) - return GATKSAMRecord.emptyRead(read); - - byte [] quals = read.getBaseQualities(EventType.BASE_SUBSTITUTION); - int left = -1; - - if (nLeadingSoftClips > 0) { - for (int i = nLeadingSoftClips - 1; i >= 0; i--) { - if (quals[i] >= minQual) - left = i; - else - break; - } - } - - int right = -1; - int nTailingSoftClips = read.getSoftEnd() - read.getAlignmentEnd(); - if (nTailingSoftClips > 0) { - for (int i = read.getReadLength() - nTailingSoftClips; i < read.getReadLength() ; i++) { - if (quals[i] >= minQual) - right = i; - else - break; - } - } - - GATKSAMRecord clippedRead = read; - if (right >= 0 && right + 1 < clippedRead.getReadLength()) // only clip if there are softclipped bases (right >= 0) and the first high quality soft clip is not the last base (right+1 < readlength) - clippedRead = hardClipByReadCoordinates(clippedRead, right+1, clippedRead.getReadLength()-1); // first we hard clip the low quality soft clips on the right tail - if (left >= 0 && left - 1 > 0) // only clip if there are softclipped bases (left >= 0) and the first high quality soft clip is not the last base (left-1 > 0) - clippedRead = 
hardClipByReadCoordinates(clippedRead, 0, left-1); // then we hard clip the low quality soft clips on the left tail - - return clippedRead; - } - - /** - * Generic functionality to hard clip a read, used internally by hardClipByReferenceCoordinatesLeftTail - * and hardClipByReferenceCoordinatesRightTail. Should not be used directly. - * - * Note, it REQUIRES you to give the directionality of your hard clip (i.e. whether you're clipping the - * left of right tail) by specifying either refStart < 0 or refStop < 0. - * - * @param refStart first base to clip (inclusive) - * @param refStop last base to clip (inclusive) - * @return a new read, without the clipped bases - */ - @Requires({"!read.getReadUnmappedFlag()", "refStart < 0 || refStop < 0"}) // can't handle unmapped reads, as we're using reference coordinates to clip - protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { - if (read.isEmpty()) - return read; - - int start; - int stop; - - // Determine the read coordinate to start and stop hard clipping - if (refStart < 0) { - if (refStop < 0) - throw new ReviewedStingException("Only one of refStart or refStop must be < 0, not both (" + refStart + ", " + refStop + ")"); - start = 0; - stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); - } - else { - if (refStop >= 0) - throw new ReviewedStingException("Either refStart or refStop must be < 0 (" + refStart + ", " + refStop + ")"); - start = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); - stop = read.getReadLength() - 1; - } - - if (start < 0 || stop > read.getReadLength() - 1) - throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); - - if ( start > stop ) - throw new ReviewedStingException(String.format("START (%d) > (%d) STOP -- this should never happen, please check read: %s (CIGAR: %s)", start, stop, read, read.getCigarString())); - 
- if ( start > 0 && stop < read.getReadLength() - 1) - throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString())); - - this.addOp(new ClippingOp(start, stop)); - GATKSAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES); - this.ops = null; - return clippedRead; - } - - -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java deleted file mode 100644 index 34705c4c9..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java +++ /dev/null @@ -1,277 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils.codecs.sampileup; - -import org.broad.tribble.AsciiFeatureCodec; -import org.broad.tribble.exception.CodecLineParsingException; -import org.broad.tribble.readers.LineIterator; -import org.broad.tribble.util.ParsingUtils; - -import java.util.ArrayList; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.VariantType; - -/** - * Decoder for SAM pileup data. For GATK validation purposes only - * - *

- * Pileup format is first used by Tony Cox and Zemin Ning at the Sanger Institute. - * It desribes the base-pair information at each chromosomal position. This format - * facilitates SNP/indel calling and brief alignment viewing by eyes. - *

- *

- * Each line consists of chromosome, 1-based coordinate, reference base, the - * number of reads covering the site, read bases and base qualities. At the - * read base column, a dot stands for a match to the reference base on the - * forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch - * on the forward strand and `acgtn' for a mismatch on the reverse strand. - * A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between - * this reference position and the next reference position. The length of the - * insertion is given by the integer in the pattern, followed by the inserted sequence. - *

- * - *

- *
See also: @see SAMTools project
- *
See also: @see Pileup format
- *

- * - *

File format example

- *
- *     seq1 272 T 24  ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
- *     seq1 273 T 23  ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
- *     seq1 274 T 23  ,.$....,,.,.,...,,,.,...    7<7;<;<<<<<<<<<=<;<;<<6
- *     seq1 275 A 23  ,$....,,.,.,...,,,.,...^l.  <+;9*<<<<<<<<<=<<:;<<<<
- *     seq1 276 G 22  ...T,,.,.,...,,,.,....  33;+<<7=7<<7<&<<1;<<6<
- *     seq1 277 T 22  ....,,.,.,.C.,,,.,..G.  +7<;<<<<<<<&<=<<:;<<&<
- *     seq1 278 G 23  ....,,.,.,...,,,.,....^k.   %38*<<;<7<<7<=<<<;<<<<<
- *     seq1 279 C 23  A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
- * 
- * - * @author Matt Hanna - * @since 2009 - */ -public class SAMPileupCodec extends AsciiFeatureCodec { - // the number of tokens we expect to parse from a pileup line - private static final int expectedTokenCount = 10; - private static final char fldDelim = '\t'; - - // allocate once and don't ever bother creating them again: - private static final String baseA = "A"; - private static final String baseC = "C"; - private static final String baseG = "G"; - private static final String baseT = "T"; - private static final String emptyStr = ""; // we will use this for "reference" allele in insertions - - public SAMPileupCodec() { - super(SAMPileupFeature.class); - } - - public SAMPileupFeature decode(String line) { -// 0 1 2 3 4 5 6 7 -//* chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow) -//* chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6 - String[] tokens = new String[expectedTokenCount]; - - // split the line - int count = ParsingUtils.split(line,tokens,fldDelim); - - // check to see if we've parsed the string into the right number of tokens (expectedTokenCount) - if (count != expectedTokenCount) - throw new CodecLineParsingException("the SAM pileup line didn't have the expected number of tokens " + - "(expected = " + expectedTokenCount + ", saw = " + count + " on " + - "line = " + line + ")"); - - SAMPileupFeature feature = new SAMPileupFeature(); - - feature.setChr(tokens[0]); - feature.setStart(Integer.parseInt(tokens[1])); - - if(tokens[2].length() != 1) - throw new CodecLineParsingException("The SAM pileup line had unexpected base " + tokens[2] + " on line = " + line); - feature.setRef(Character.toUpperCase(tokens[2].charAt(0))); - - String observedString = tokens[3].toUpperCase(); // field 3 - feature.setFWDAlleles(new ArrayList(2)); - - feature.setConsensusConfidence(Double.parseDouble(tokens[4])); - feature.setVariantConfidence(Double.parseDouble(tokens[5])); - - if ( feature.getRef() == '*' ) { - parseIndels(observedString,feature) ; 
- if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1); - else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!! - } else { - parseBasesAndQuals(feature,tokens[8],tokens[9]); - // if the variant is a SNP or a reference base (i.e. no variant at all) - if ( observedString.length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter"); - feature.setRefBases(tokens[2].toUpperCase()); - feature.setEnd(feature.getStart()); - - char ch = observedString.charAt(0); - - switch ( ch ) { - case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break; - case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break; - case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break; - case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break; - case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break; - case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break; - case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break; - case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break; - case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break; - case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break; - } - if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE); - else { - // we know that at least one allele is non-ref; - // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e. - // homozygous non-ref), we still have 2 allelic variants at the site (e.g. 
one ref and one nonref) - feature.setVariantType(VariantType.SNP); - if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() || - feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() || - feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1)) - ) feature.setNumNonRef(1); - else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site... - } - } - - return feature; - } - - @Override - public Object readActualHeader(LineIterator lineIterator) { - // No header for this format - return null; - } - - private void parseIndels(String genotype,SAMPileupFeature feature) { - String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit - - // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference; - // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases - // from what we have recorded for the deletion allele (e.g. 
"-CAC") - boolean hasRefAllele = false; - - for ( int i = 0 ; i < obs.length ; i++ ) { - if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) { - hasRefAllele = true; - feature.getFWDAlleles().add(emptyStr); - continue; - } - - String varBases = obs[i].toUpperCase(); - - switch ( obs[i].charAt(0) ) { - case '+': - if (!feature.isReference() && !feature.isInsertion()) feature.setVariantType(VariantType.INDEL); - else feature.setVariantType(VariantType.INSERTION); - feature.setRefBases(emptyStr); - break; - case '-' : - if (!feature.isReference() && !feature.isDeletion()) feature.setVariantType(VariantType.INDEL); - else feature.setVariantType(VariantType.DELETION); - feature.setRefBases(varBases); // remember what was deleted, this will be saved as "reference allele" - break; - default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype); - } - feature.getFWDAlleles().add(varBases); - feature.setLength(obs[i].length()-1); // inconsistent for non-biallelic indels!! - } - if ( hasRefAllele ) { - // we got at least one ref. allele (out of two recorded) - if (feature.isReference()) { // both top theories are actually ref allele; - feature.setNumNonRef(0); // no observations of non-reference allele at all - feature.setRefBases(emptyStr); - } else { - feature.setNumNonRef(1); // hasRefAllele = true, so one allele was definitely ref, hence there is only one left - } - } else { - // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we - // fail to set data in a consistent way. - if ( feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))) feature.setNumNonRef(1); - else feature.setNumNonRef(2); - } - // DONE with indels - - } - - private void parseBasesAndQuals(SAMPileupFeature feature, final String bases, final String quals) - { - //System.out.printf("%s%n%s%n", bases, quals); - - // needs to convert the base string with it's . 
and , to the ref base - StringBuilder baseBuilder = new StringBuilder(); - StringBuilder qualBuilder = new StringBuilder(); - boolean done = false; - for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) { - //System.out.printf("%d %d%n", i, j); - char c = (char)bases.charAt(i); - - switch ( c ) { - case '.': // matches reference - case ',': // matches reference - baseBuilder.append(feature.getRef()); - qualBuilder.append(quals.charAt(j++)); - break; - case '$': // end of read - break; - case '*': // end of indel? - j++; - break; - case '^': // mapping quality - i++; - break; - case '+': // start of indel - case '-': // start of indel - final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1 - final String rest = bases.substring(i+1); - //System.out.printf("sub is %s%n", rest); - Matcher match = regex.matcher(rest); - if ( ! match.matches() ) { - if ( feature.getRef() != '*' ) - throw new RuntimeException("Bad pileup format: " + bases + " at position " + i); - done = true; - } - else { - String g = match.group(1); - //System.out.printf("group is %d, match is %s%n", match.groupCount(), g); - int l = Integer.parseInt(g); - i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++) - //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1)); - } - break; - default: // non reference base - baseBuilder.append(c); - qualBuilder.append(quals.charAt(j++)); - } - } - - feature.setPileupBases(baseBuilder.toString()); - feature.setPileupQuals(qualBuilder.toString()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java deleted file mode 100644 index a6fd996fd..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java +++ /dev/null @@ -1,272 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is 
hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.codecs.sampileup; - -import net.sf.samtools.util.StringUtil; -import org.broad.tribble.Feature; - -import java.util.List; - -/** - * A tribble feature representing a SAM pileup. - * - * @author mhanna - * @version 0.1 - */ -public class SAMPileupFeature implements Feature { - public enum VariantType { NONE, SNP, INSERTION, DELETION, INDEL }; - - private String contig; // genomic location of this genotyped site - private int start; - private int stop; - - private char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!) 
- private String refBases; // the reference base sequence according to NCBI; single base for point mutations, deleted bases for deletions, empty string for insertions - - private String pileupQuals; // the read base qualities - private String pileupBases; // the read bases themselves - - private List observedAlleles = null; // The sequences of the observed alleles (e.g. {"A","C"} for point mutation or {"","+CC"} for het. insertion - private VariantType varType = VariantType.NONE; - private int nNonref = 0; // number of non-reference alleles observed - private int eventLength = 0; // number of inserted or deleted bases - - private double consensusScore = 0; - private double variantScore = 0; - - /** - * create the pileup feature. Default protection so that only other classes in this package can create it. - */ - SAMPileupFeature() {} - - public String getChr() { - return contig; - } - - protected void setChr(String chr) { - this.contig = chr; - } - - public int getStart() { - return start; - } - - protected void setStart(int start) { - this.start = start; - } - - public int getEnd() { - return stop; - } - - protected void setEnd(int end) { - this.stop = end; - } - - public String getQualsAsString() { return pileupQuals; } - - protected void setPileupQuals(String pileupQuals) { - this.pileupQuals = pileupQuals; - } - - /** Returns reference base for point genotypes or '*' for indel genotypes, as a char. - * - */ - public char getRef() { return refBaseChar; } - - protected void setRef(char ref) { - this.refBaseChar = ref; - } - - public int size() { return pileupQuals.length(); } - - /** Returns pile of observed bases over the current genomic location. 
- * - */ - public String getBasesAsString() { return pileupBases; } - - protected void setPileupBases(String pileupBases) { - this.pileupBases = pileupBases; - } - - /** Returns formatted pileup string for the current genomic location as - * "location: reference_base observed_base_pile observed_qual_pile" - */ - public String getPileupString() - { - if(start == stop) - return String.format("%s:%d: %s %s %s", getChr(), getStart(), getRef(), getBasesAsString(), getQualsAsString()); - else - return String.format("%s:%d-%d: %s %s %s", getChr(), getStart(), getEnd(), getRef(), getBasesAsString(), getQualsAsString()); - } - - /** - * Gets the bases in byte array form. - * @return byte array of the available bases. - */ - public byte[] getBases() { - return StringUtil.stringToBytes(getBasesAsString()); - } - - /** - * Gets the Phred base qualities without ASCII offset. - * @return Phred base qualities. - */ - public byte[] getQuals() { - byte[] quals = StringUtil.stringToBytes(getQualsAsString()); - for(int i = 0; i < quals.length; i++) quals[i] -= 33; - return quals; - } - - /** Returns bases in the reference allele as a String. For point genotypes, the string consists of a single - * character (reference base). For indel genotypes, the string is empty for insertions into - * the reference, or consists of deleted bases for deletions. - * - * @return reference allele, forward strand - */ - public String getFWDRefBases() { - return refBases; - } - - protected void setRefBases(String refBases) { - this.refBases = refBases; - } - - public List getFWDAlleles() { - return observedAlleles; - } - - protected void setFWDAlleles(List alleles) { - this.observedAlleles = alleles; - } - - // ---------------------------------------------------------------------- - // - // What kind of variant are we? 
- // - // ---------------------------------------------------------------------- - public boolean isSNP() { return varType == VariantType.SNP; } - public boolean isInsertion() { return varType == VariantType.INSERTION; } - public boolean isDeletion() { return varType == VariantType.DELETION ; } - public boolean isIndel() { return isInsertion() || isDeletion() || varType == VariantType.INDEL; } - public boolean isReference() { return varType == VariantType.NONE; } - - protected void setVariantType(VariantType variantType) { - this.varType = variantType; - } - - public boolean isHom() { - // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype - if ( ! isIndel() ) return ( observedAlleles.get(0).equals(observedAlleles.get(1)) ); - return ( isInsertion() || isDeletion() ) && observedAlleles.get(0).equals(observedAlleles.get(1) ); - } - - public boolean isHet() { - // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype - if ( ! isIndel() ) return ( !(observedAlleles.get(0).equals(observedAlleles.get(1))) ); - return isIndel() || ( ! 
observedAlleles.get(0).equals(observedAlleles.get(1) ) ); - } - - public double getVariantConfidence() { - return variantScore; - } - - protected void setVariantConfidence(double variantScore) { - this.variantScore = variantScore; - } - - public boolean isBiallelic() { - return nNonref < 2; - } - - protected void setNumNonRef(int nNonref) { - this.nNonref = nNonref; - } - - public double getConsensusConfidence() { - return consensusScore; - } - - protected void setConsensusConfidence(double consensusScore) { - this.consensusScore = consensusScore; - } - - public int length() { - return eventLength; - } - - protected void setLength(int eventLength) { - this.eventLength = eventLength; - } - - public boolean isIndelGenotype() { - return refBaseChar == '*'; - } - - - public boolean isPointGenotype() { - return ! isIndelGenotype(); - } - - /** Implements method required by GenotypeList interface. If this object represents - * an indel genotype, then it returns itself through this method. If this object is a - * point genotype, this method returns null. - * @return - */ - public SAMPileupFeature getIndelGenotype() { - if ( isIndelGenotype() ) return this; - else return null; - } - - /** Implements method required by GenotypeList interface. If this object represents - * a point genotype, then it returns itself through this method. If this object is an - * indel genotype, this method returns null. - * @return - */ - public SAMPileupFeature getPointGenotype() { - if ( isPointGenotype() ) return this; - else return null; - } - - /** Returns true if this object \em is an indel genotype (and thus - * indel genotype is what it only has). - * @return - */ - public boolean hasIndelGenotype() { - return isIndelGenotype(); - } - - /** Returns true if this object \em is a point genotype (and thus - * point genotype is what it only has. 
- * @return - */ - public boolean hasPointGenotype() { - return isPointGenotype(); - } - - - -} diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java deleted file mode 100644 index 4d280423e..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java +++ /dev/null @@ -1,58 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils.exceptions; - -import java.lang.reflect.InvocationTargetException; - -/** - * Class for handling common failures of dynamic class resolution - * - * User: depristo - * Date: Sep 3, 2010 - * Time: 2:24:09 PM - */ -public class DynamicClassResolutionException extends UserException { - public DynamicClassResolutionException(Class c, Exception ex) { - super(String.format("Could not create module %s because %s caused by exception %s", - c.getSimpleName(), moreInfo(ex), ex.getMessage())); - } - - private static String moreInfo(Exception ex) { - try { - throw ex; - } catch (InstantiationException e) { - return "BUG: cannot instantiate class: must be concrete class"; - } catch (NoSuchMethodException e) { - return "BUG: Cannot find expected constructor for class"; - } catch (IllegalAccessException e) { - return "Cannot instantiate class (Illegal Access)"; - } catch (InvocationTargetException e) { - return "Cannot instantiate class (Invocation failure)"; - } catch ( Exception e ) { - return String.format("an exception of type %s occurred",e.getClass().getSimpleName()); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java deleted file mode 100644 index 40a730029..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ /dev/null @@ -1,489 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above 
copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.exceptions; - -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMSequenceDictionary; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.io.File; - -/** - * Represents the common user errors detected by Sting / GATK - * - * Root class for all GATK user errors, as well as the container for errors themselves - * - * User: depristo - * Date: Sep 3, 2010 - * Time: 2:24:09 PM - */ -@DocumentedGATKFeature( - groupName = HelpConstants.DOCS_CAT_USRERR, - summary = "Errors caused by incorrect user behavior, such as bad files, bad arguments, etc." ) -public class UserException extends ReviewedStingException { - /** - * The URL where people can get help messages. 
Printed when an error occurs - */ - public static final String PHONE_HOME_DOCS_URL = "http://gatkforums.broadinstitute.org/discussion/1250/what-is-phone-home-and-how-does-it-affect-me#latest"; - - public UserException(String msg) { super(msg); } - public UserException(String msg, Throwable e) { super(msg, e); } - private UserException(Throwable e) { super("", e); } // cannot be called, private access - - protected static String getMessage(Throwable t) { - String message = t.getMessage(); - return message != null ? message : t.getClass().getName(); - } - - public static class CommandLineException extends UserException { - public CommandLineException(String message) { - super(String.format("Invalid command line: %s", message)); - } - } - - public static class MalformedReadFilterException extends CommandLineException { - public MalformedReadFilterException(String message) { - super(String.format("Malformed read filter: %s",message)); - } - } - - public static class IncompatibleReadFiltersException extends CommandLineException { - public IncompatibleReadFiltersException(final String filter1, final String filter2) { - super(String.format("Two read filters are enabled that are incompatible and cannot be used simultaneously: %s and %s", filter1, filter2)); - } - } - - public static class MalformedWalkerArgumentsException extends CommandLineException { - public MalformedWalkerArgumentsException(String message) { - super(String.format("Malformed walker argument: %s",message)); - } - } - - public static class UnsupportedCigarOperatorException extends UserException { - public UnsupportedCigarOperatorException(final CigarOperator co, final SAMRecord read, final String message) { - super(String.format( - "Unsupported CIGAR operator %s in read %s at %s:%d. 
%s", - co, - read.getReadName(), - read.getReferenceName(), - read.getAlignmentStart(), - message)); - } - } - - - public static class MalformedGenomeLoc extends UserException { - public MalformedGenomeLoc(String message, GenomeLoc loc) { - super(String.format("Badly formed genome loc: %s: %s", message, loc)); - } - - public MalformedGenomeLoc(String message) { - super(String.format("Badly formed genome loc: %s", message)); - } - } - - public static class BadInput extends UserException { - public BadInput(String message) { - super(String.format("Bad input: %s", message)); - } - } - - // todo -- fix up exception cause passing - public static class MissingArgument extends CommandLineException { - public MissingArgument(String arg, String message) { - super(String.format("Argument %s was missing: %s", arg, message)); - } - } - - public static class BadArgumentValue extends CommandLineException { - public BadArgumentValue(String arg, String message) { - super(String.format("Argument %s has a bad value: %s", arg, message)); - } - } - - public static class UnknownTribbleType extends CommandLineException { - public UnknownTribbleType(String type, String message) { - super(String.format("Unknown tribble type %s: %s", type, message)); - } - } - - - public static class BadTmpDir extends UserException { - public BadTmpDir(String message) { - super(String.format("Failure working with the tmp directory %s. Override with -Djava.io.tmpdir=X on the command line to a bigger/better file system. Exact error was %s", System.getProperties().get("java.io.tmpdir"), message)); - } - } - - public static class TooManyOpenFiles extends UserException { - public TooManyOpenFiles() { - super(String.format("There was a failure because there are too many files open concurrently; your system's open file handle limit is too small. 
See the unix ulimit command to adjust this limit")); - } - } - - public static class LocalParallelizationProblem extends UserException { - public LocalParallelizationProblem(final File file) { - super(String.format("There was a failure because temporary file %s could not be found while running the GATK with more than one thread. Possible causes for this problem include: your system's open file handle limit is too small, your output or temp directories do not have sufficient space, or just an isolated file system blip", file.getAbsolutePath())); - } - } - - public static class NotEnoughMemory extends UserException { - public NotEnoughMemory() { - super(String.format("There was a failure because you did not provide enough memory to run this program. See the -Xmx JVM argument to adjust the maximum heap size provided to Java")); - } - } - - public static class ErrorWritingBamFile extends UserException { - public ErrorWritingBamFile(String message) { - super(String.format("An error occurred when trying to write the BAM file. Usually this happens when there is not enough space in the directory to which the data is being written (generally the temp directory) or when your system's open file handle limit is too small. To tell Java to use a bigger/better file system use -Djava.io.tmpdir=X on the command line. 
The exact error was %s", message)); - } - } - - public static class NoSpaceOnDevice extends UserException { - public NoSpaceOnDevice() { - super("There is no space left on the device, so writing failed"); - } - } - - public static class CouldNotReadInputFile extends UserException { - public CouldNotReadInputFile(String message, Exception e) { - super(String.format("Couldn't read file because %s caused by %s", message, getMessage(e))); - } - - public CouldNotReadInputFile(File file) { - super(String.format("Couldn't read file %s", file.getAbsolutePath())); - } - - public CouldNotReadInputFile(File file, String message) { - super(String.format("Couldn't read file %s because %s", file.getAbsolutePath(), message)); - } - - public CouldNotReadInputFile(String file, String message) { - super(String.format("Couldn't read file %s because %s", file, message)); - } - - public CouldNotReadInputFile(File file, String message, Exception e) { - super(String.format("Couldn't read file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); - } - - public CouldNotReadInputFile(File file, Exception e) { - this(file, getMessage(e)); - } - - public CouldNotReadInputFile(String message) { - super(message); - } - } - - - public static class CouldNotCreateOutputFile extends UserException { - public CouldNotCreateOutputFile(File file, String message, Exception e) { - super(String.format("Couldn't write file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); - } - - public CouldNotCreateOutputFile(File file, String message) { - super(String.format("Couldn't write file %s because %s", file.getAbsolutePath(), message)); - } - - public CouldNotCreateOutputFile(String filename, String message, Exception e) { - super(String.format("Couldn't write file %s because %s with exception %s", filename, message, getMessage(e))); - } - - public CouldNotCreateOutputFile(File file, Exception e) { - super(String.format("Couldn't write file %s 
because exception %s", file.getAbsolutePath(), getMessage(e))); - } - - public CouldNotCreateOutputFile(String message, Exception e) { - super(message, e); - } - } - - public static class MissortedBAM extends UserException { - public MissortedBAM(SAMFileHeader.SortOrder order, File file, SAMFileHeader header) { - super(String.format("Missorted Input SAM/BAM files: %s is must be sorted in %s order but order was: %s", file, order, header.getSortOrder())); - } - - public MissortedBAM(SAMFileHeader.SortOrder order, String message) { - super(String.format("Missorted Input SAM/BAM files: files are not sorted in %s order; %s", order, message)); - } - - public MissortedBAM(SAMFileHeader.SortOrder order, SAMRecord read, String message) { - super(String.format("Missorted Input SAM/BAM file %s: file sorted in %s order but %s is required; %s", - read.getFileSource().getReader(), read.getHeader().getSortOrder(), order, message)); - } - - public MissortedBAM(String message) { - super(String.format("Missorted Input SAM/BAM files: %s", message)); - } - } - - public static class MalformedBAM extends UserException { - public MalformedBAM(SAMRecord read, String message) { - this(read.getFileSource() != null ? read.getFileSource().getReader().toString() : "(none)", message); - } - - public MalformedBAM(File file, String message) { - this(file.toString(), message); - } - - public MalformedBAM(String source, String message) { - super(String.format("SAM/BAM file %s is malformed: %s", source, message)); - } - } - - public static class MisencodedBAM extends UserException { - public MisencodedBAM(SAMRecord read, String message) { - this(read.getFileSource() != null ? 
read.getFileSource().getReader().toString() : "(none)", message); - } - - public MisencodedBAM(String source, String message) { - super(String.format("SAM/BAM file %s appears to be using the wrong encoding for quality scores: %s; please see the GATK --help documentation for options related to this error", source, message)); - } - } - - public static class MalformedVCF extends UserException { - public MalformedVCF(String message, String line) { - super(String.format("The provided VCF file is malformed at line %s: %s", line, message)); - } - - public MalformedVCF(String message) { - super(String.format("The provided VCF file is malformed: %s", message)); - } - - public MalformedVCF(String message, int lineNo) { - super(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); - } - } - - public static class MalformedBCF2 extends UserException { - public MalformedBCF2( String message ) { - super(String.format("Malformed BCF2 file: %s", message)); - } - } - - public static class MalformedVCFHeader extends UserException { - public MalformedVCFHeader(String message) { - super(String.format("The provided VCF file has a malformed header: %s", message)); - } - } - - public static class ReadMissingReadGroup extends MalformedBAM { - public ReadMissingReadGroup(final SAMRecord read) { - super(read, String.format("Read %s is missing the read group (RG) tag, which is required by the GATK. Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName())); - } - } - - public static class ReadHasUndefinedReadGroup extends MalformedBAM { - public ReadHasUndefinedReadGroup(final SAMRecord read, final String rgID) { - super(read, String.format("Read %s uses a read group (%s) that is not defined in the BAM header, which is not valid. 
Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName(), rgID)); - } - } - - public static class VariantContextMissingRequiredField extends UserException { - public VariantContextMissingRequiredField(String field, VariantContext vc) { - super(String.format("Variant at %s:%d is is missing the required field %s", vc.getChr(), vc.getStart(), field)); - } - } - - public static class MissortedFile extends UserException { - public MissortedFile(File file, String message, Exception e) { - super(String.format("Missorted Input file: %s is must be sorted in coordinate order. %s and got error %s", file, message, getMessage(e))); - } - } - - public static class FailsStrictValidation extends UserException { - public FailsStrictValidation(File f, String message) { - super(String.format("File %s fails strict validation: %s", f.getAbsolutePath(), message)); - } - } - - public static class MalformedFile extends UserException { - public MalformedFile(String message) { - super(String.format("Unknown file is malformed: %s", message)); - } - - public MalformedFile(String message, Exception e) { - super(String.format("Unknown file is malformed: %s caused by %s", message, getMessage(e))); - } - - public MalformedFile(File f, String message) { - super(String.format("File %s is malformed: %s", f.getAbsolutePath(), message)); - } - - public MalformedFile(File f, String message, Exception e) { - super(String.format("File %s is malformed: %s caused by %s", f.getAbsolutePath(), message, getMessage(e))); - } - - public MalformedFile(String name, String message) { - super(String.format("File associated with name %s is malformed: %s", name, message)); - } - - public MalformedFile(String name, String message, Exception e) { - super(String.format("File associated with name %s is malformed: %s caused by %s", name, message, getMessage(e))); - } - } - - public static class CannotExecuteRScript extends UserException { - 
public CannotExecuteRScript(String message) { - super(String.format("Unable to execute RScript command: " + message)); - } - public CannotExecuteRScript(String message, Exception e) { - super(String.format("Unable to execute RScript command: " + message), e); - } - } - - public static class DeprecatedArgument extends CommandLineException { - public DeprecatedArgument(String param, String doc) { - super(String.format("The parameter %s is deprecated. %s",param,doc)); - } - } - - - public static class IncompatibleSequenceDictionaries extends UserException { - public IncompatibleSequenceDictionaries(String message, String name1, SAMSequenceDictionary dict1, String name2, SAMSequenceDictionary dict2) { - super(String.format("Input files %s and %s have incompatible contigs: %s.\n %s contigs = %s\n %s contigs = %s", - name1, name2, message, name1, ReadUtils.prettyPrintSequenceRecords(dict1), name2, ReadUtils.prettyPrintSequenceRecords(dict2))); - } - } - - public static class LexicographicallySortedSequenceDictionary extends UserException { - public LexicographicallySortedSequenceDictionary(String name, SAMSequenceDictionary dict) { - super(String.format("Lexicographically sorted human genome sequence detected in %s." - + "\nFor safety's sake the GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs." - + "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files." 
- + "\nYou can use the ReorderSam utility to fix this problem: " + HelpConstants.forumPost("discussion/58/companion-utilities-reordersam") - + "\n %s contigs = %s", - name, name, ReadUtils.prettyPrintSequenceRecords(dict))); - } - } - - public static class DeprecatedWalker extends UserException { - public DeprecatedWalker(String walkerName, String version) { - super(String.format("Walker %s is no longer available in the GATK; it has been deprecated since version %s", walkerName, version)); - } - } - - public static class DeprecatedAnnotation extends UserException { - public DeprecatedAnnotation(String annotationName, String version) { - super(String.format("Annotation %s is no longer available in the GATK; it has been deprecated since version %s", annotationName, version)); - } - } - - public static class CannotExecuteQScript extends UserException { - public CannotExecuteQScript(String message) { - super(String.format("Unable to execute QScript: " + message)); - } - public CannotExecuteQScript(String message, Exception e) { - super(String.format("Unable to execute QScript: " + message), e); - } - } - - public static class CannotHandleGzippedRef extends UserException { - public CannotHandleGzippedRef() { - super("The GATK cannot process compressed (.gz) reference sequences. Please unzip the file and try again. Sorry for the inconvenience."); - } - } - - public static class MissingReferenceFaiFile extends UserException { - public MissingReferenceFaiFile( final File indexFile, final File fastaFile ) { - super(String.format("Fasta index file %s for reference %s does not exist. 
Please see %s for help creating it.", - indexFile.getAbsolutePath(), fastaFile.getAbsolutePath(), - HelpConstants.forumPost("discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference"))); - } - } - - public static class MissingReferenceDictFile extends UserException { - public MissingReferenceDictFile( final File dictFile, final File fastaFile ) { - super(String.format("Fasta dict file %s for reference %s does not exist. Please see %s for help creating it.", - dictFile.getAbsolutePath(), fastaFile.getAbsolutePath(), - HelpConstants.forumPost("discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference"))); - } - } - - public static class UnreadableKeyException extends UserException { - public UnreadableKeyException ( File f, Exception e ) { - super(String.format("Key file %s cannot be read (possibly the key file is corrupt?). Error was: %s. " + - "Please see %s for help.", - f.getAbsolutePath(), getMessage(e), PHONE_HOME_DOCS_URL)); - } - - public UnreadableKeyException ( String message, Exception e ) { - this(String.format("%s. Error was: %s", message, getMessage(e))); - } - - public UnreadableKeyException ( String message ) { - super(String.format("Key file cannot be read (possibly the key file is corrupt?): %s. " + - "Please see %s for help.", - message, PHONE_HOME_DOCS_URL)); - } - } - - public static class KeySignatureVerificationException extends UserException { - public KeySignatureVerificationException ( File f ) { - super(String.format("The signature in key file %s failed cryptographic verification. " + - "If this key was valid in the past, it's likely been revoked. " + - "Please see %s for help.", - f.getAbsolutePath(), PHONE_HOME_DOCS_URL)); - } - } - - public static class GVCFIndexException extends UserException { - public GVCFIndexException (GATKVCFIndexType indexType, int indexParameter) { - super(String.format("GVCF output requires a specific indexing strategy. 
Please re-run including the arguments " + - "-variant_index_type %s -variant_index_parameter %d.", - indexType, indexParameter)); - } - } - - /** - * A special exception that happens only in the case where - * the filesystem, by design or configuration, is completely unable - * to handle locking. This exception will specifically NOT be thrown - * in the case where the filesystem handles locking but is unable to - * acquire a lock due to concurrency. - */ - public static class FileSystemInabilityToLockException extends UserException { - public FileSystemInabilityToLockException( String message ) { - super(message); - } - - public FileSystemInabilityToLockException( String message, Exception innerException ) { - super(message,innerException); - } - } - - public static class IncompatibleRecalibrationTableParameters extends UserException { - public IncompatibleRecalibrationTableParameters(String s) { - super(s); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java deleted file mode 100644 index b7cd03919..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ /dev/null @@ -1,390 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.genotyper; - - -import com.google.java.contract.Ensures; -import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.variant.variantcontext.Allele; - -import java.util.*; - -/** - * Wrapper class that holds a set of maps of the form (Read -> Map(Allele->Double)) - * For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood. - */ -public class PerReadAlleleLikelihoodMap { - /** A set of all of the allele, so we can efficiently determine if an allele is already present */ - private final Set allelesSet = new HashSet<>(); - /** A list of the unique allele, as an ArrayList so we can call get(i) efficiently */ - protected final List alleles = new ArrayList<>(); - protected final Map> likelihoodReadMap = new LinkedHashMap<>(); - - public PerReadAlleleLikelihoodMap() { } - - /** - * Add a new entry into the Read -> ( Allele -> Likelihood ) map of maps. 
- * @param read - the GATKSAMRecord that was evaluated - * @param a - the Allele against which the GATKSAMRecord was evaluated - * @param likelihood - the likelihood score resulting from the evaluation of "read" against "a" - */ - public void add(final GATKSAMRecord read, final Allele a, final Double likelihood) { - if ( read == null ) throw new IllegalArgumentException("Cannot add a null read to the allele likelihood map"); - if ( a == null ) throw new IllegalArgumentException("Cannot add a null allele to the allele likelihood map"); - if ( likelihood == null ) throw new IllegalArgumentException("Likelihood cannot be null"); - if ( likelihood > 0.0 ) throw new IllegalArgumentException("Likelihood must be negative (L = log(p))"); - - Map likelihoodMap = likelihoodReadMap.get(read); - if (likelihoodMap == null){ - // LinkedHashMap will ensure iterating through alleles will be in consistent order - likelihoodMap = new LinkedHashMap<>(); - likelihoodReadMap.put(read,likelihoodMap); - } - - likelihoodMap.put(a,likelihood); - - if (!allelesSet.contains(a)) { - allelesSet.add(a); - alleles.add(a); - } - } - - public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) { - return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction); - } - - /** - * For each allele "a" , identify those reads whose most likely allele is "a", and remove a "downsamplingFraction" proportion - * of those reads from the "likelihoodReadMap". This is used for e.g. sample contamination - * @param downsamplingFraction - the fraction of supporting reads to remove from each allele. If <=0 all reads kept, if >=1 all reads tossed. 
- */ - public void performPerAlleleDownsampling(final double downsamplingFraction) { - // special case removal of all or no reads - if ( downsamplingFraction <= 0.0 ) - return; - if ( downsamplingFraction >= 1.0 ) { - likelihoodReadMap.clear(); - return; - } - - // start by stratifying the reads by the alleles they represent at this position - final Map> alleleReadMap = getAlleleStratifiedReadMap(); - - // compute the reads to remove and actually remove them - final List readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction); - for ( final GATKSAMRecord read : readsToRemove ) - likelihoodReadMap.remove(read); - } - - /** - * Convert the @likelihoodReadMap to a map of alleles to reads, where each read is mapped uniquely to the allele - * for which it has the greatest associated likelihood - * @return a map from each allele to a list of reads that 'support' the allele - */ - protected Map> getAlleleStratifiedReadMap() { - final Map> alleleReadMap = new HashMap<>(alleles.size()); - for ( final Allele allele : alleles ) - alleleReadMap.put(allele, new ArrayList()); - - for ( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { - // TODO -- come up with a strategy for down-sampling reduced reads - // Currently we are unable to remove reduced reads because their representative base count differs throughout the read - if ( !entry.getKey().isReducedRead() ) { - final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue()); - if ( bestAllele.isInformative() ) - alleleReadMap.get(bestAllele.getMostLikelyAllele()).add(entry.getKey()); - } - } - - return alleleReadMap; - } - - @Ensures("result >=0") - public int size() { - return likelihoodReadMap.size(); - } - - /** - * Helper function to add the read underneath a pileup element to the map - * @param p Pileup element - * @param a Corresponding allele - * @param likelihood Allele likelihood - */ - public void add(PileupElement p, Allele a, Double likelihood) 
{ - if (p==null) - throw new IllegalArgumentException("Pileup element cannot be null"); - if ( p.getRead()==null ) - throw new IllegalArgumentException("Read underlying pileup element cannot be null"); - if ( a == null ) - throw new IllegalArgumentException("Allele for add() cannot be null"); - - add(p.getRead(), a, likelihood); - } - - /** - * Does the current map contain the key associated with a particular SAM record in pileup? - * @param p Pileup element - * @return true if the map contains pileup element, else false - */ - public boolean containsPileupElement(final PileupElement p) { - return likelihoodReadMap.containsKey(p.getRead()); - } - - public boolean isEmpty() { - return likelihoodReadMap.isEmpty(); - } - - public Map> getLikelihoodReadMap() { - return likelihoodReadMap; - } - - public void clear() { - allelesSet.clear(); - alleles.clear(); - likelihoodReadMap.clear(); - } - - public Set getStoredElements() { - return likelihoodReadMap.keySet(); - } - -// public Collection> getLikelihoodMapValues() { -// return likelihoodReadMap.values(); -// } - - public int getNumberOfStoredElements() { - return likelihoodReadMap.size(); - } - - public Map getLikelihoodsAssociatedWithPileupElement(final PileupElement p) { - if (!likelihoodReadMap.containsKey(p.getRead())) - return null; - - return likelihoodReadMap.get(p.getRead()); - } - - - /** - * Get the log10 likelihood associated with an individual read/allele - * - * @param read the read whose likelihood we want - * @param allele the allele whose likelihood we want - * @return the log10 likelihood that this read matches this allele - */ - public double getLikelihoodAssociatedWithReadAndAllele(final GATKSAMRecord read, final Allele allele){ - if (!allelesSet.contains(allele) || !likelihoodReadMap.containsKey(read)) - return 0.0; - - return likelihoodReadMap.get(read).get(allele); - } - - /** - * Get the most likely alleles estimated across all reads in this object - * - * Takes the most likely two alleles 
according to their diploid genotype likelihoods. That is, for - * each allele i and j we compute p(D | i,j) where D is the read likelihoods. We track the maximum - * i,j likelihood and return an object that contains the alleles i and j as well as the max likelihood. - * - * Note that the second most likely diploid genotype is not tracked so the resulting MostLikelyAllele - * doesn't have a meaningful get best likelihood. - * - * @return a MostLikelyAllele object, or null if this map is empty - */ - public MostLikelyAllele getMostLikelyDiploidAlleles() { - if ( isEmpty() ) return null; - - int hap1 = 0; - int hap2 = 0; - double maxElement = Double.NEGATIVE_INFINITY; - for( int iii = 0; iii < alleles.size(); iii++ ) { - final Allele iii_allele = alleles.get(iii); - for( int jjj = 0; jjj <= iii; jjj++ ) { - final Allele jjj_allele = alleles.get(jjj); - - double haplotypeLikelihood = 0.0; - for( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { - // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) - final GATKSAMRecord read = entry.getKey(); - final int count = ReadUtils.getMeanRepresentativeReadCount(read); - final double likelihood_iii = entry.getValue().get(iii_allele); - final double likelihood_jjj = entry.getValue().get(jjj_allele); - haplotypeLikelihood += count * (MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + MathUtils.LOG_ONE_HALF); - - // fast exit. 
If this diploid pair is already worse than the max, just stop and look at the next pair - if ( haplotypeLikelihood < maxElement ) break; - } - - // keep track of the max element and associated indices - if ( haplotypeLikelihood > maxElement ) { - hap1 = iii; - hap2 = jjj; - maxElement = haplotypeLikelihood; - } - } - } - - if ( maxElement == Double.NEGATIVE_INFINITY ) - throw new IllegalStateException("max likelihood is " + maxElement + " indicating something has gone wrong"); - - return new MostLikelyAllele(alleles.get(hap1), alleles.get(hap2), maxElement, maxElement); - } - - /** - * Given a map from alleles to likelihoods, find the allele with the largest likelihood. - * - * @param alleleMap - a map from alleles to likelihoods - * @return - a MostLikelyAllele object - */ - @Ensures("result != null") - public static MostLikelyAllele getMostLikelyAllele( final Map alleleMap ) { - return getMostLikelyAllele(alleleMap, null); - } - - /** - * Given a map from alleles to likelihoods, find the allele with the largest likelihood. - * - * @param alleleMap - a map from alleles to likelihoods - * @param onlyConsiderTheseAlleles if not null, we will only consider alleles in this set for being one of the best. - * this is useful for the case where you've selected a subset of the alleles that - * the reads have been computed for further analysis. If null totally ignored - * @return - a MostLikelyAllele object - */ - public static MostLikelyAllele getMostLikelyAllele( final Map alleleMap, final Set onlyConsiderTheseAlleles ) { - if ( alleleMap == null ) throw new IllegalArgumentException("The allele to likelihood map cannot be null"); - double maxLike = Double.NEGATIVE_INFINITY; - double prevMaxLike = Double.NEGATIVE_INFINITY; - Allele mostLikelyAllele = Allele.NO_CALL; - Allele secondMostLikely = null; - - for (final Map.Entry el : alleleMap.entrySet()) { - if ( onlyConsiderTheseAlleles != null && ! 
onlyConsiderTheseAlleles.contains(el.getKey()) ) - continue; - - if (el.getValue() > maxLike) { - prevMaxLike = maxLike; - maxLike = el.getValue(); - secondMostLikely = mostLikelyAllele; - mostLikelyAllele = el.getKey(); - } else if( el.getValue() > prevMaxLike ) { - secondMostLikely = el.getKey(); - prevMaxLike = el.getValue(); - } - } - - return new MostLikelyAllele(mostLikelyAllele, secondMostLikely, maxLike, prevMaxLike); - } - - /** - * Debug method to dump contents of object into string for display - */ - public String toString() { - final StringBuilder sb = new StringBuilder(); - - sb.append("Alelles in map:"); - for (final Allele a:alleles) { - sb.append(a.getDisplayString()+","); - } - sb.append("\n"); - for (final Map.Entry > el : getLikelihoodReadMap().entrySet() ) { - for (final Map.Entry eli : el.getValue().entrySet()) { - sb.append("Read "+el.getKey().getReadName()+". Allele:"+eli.getKey().getDisplayString()+" has likelihood="+Double.toString(eli.getValue())+"\n"); - } - - } - return sb.toString(); - } - - /** - * Remove reads from this map that are poorly modelled w.r.t. their per allele likelihoods - * - * Goes through each read in this map, and if it is poorly modelled removes it from the map. - * - * @see #readIsPoorlyModelled(org.broadinstitute.sting.utils.sam.GATKSAMRecord, java.util.Collection, double) - * for more information about the poorly modelled test. 
- * - * @param maxErrorRatePerBase see equivalent parameter in #readIsPoorlyModelled - * @return the list of reads removed from this map because they are poorly modelled - */ - public List filterPoorlyModelledReads(final double maxErrorRatePerBase) { - final List removedReads = new LinkedList<>(); - final Iterator>> it = likelihoodReadMap.entrySet().iterator(); - while ( it.hasNext() ) { - final Map.Entry> record = it.next(); - if ( readIsPoorlyModelled(record.getKey(), record.getValue().values(), maxErrorRatePerBase) ) { - it.remove(); - removedReads.add(record.getKey()); - } - } - - return removedReads; - } - - /** - * Is this read poorly modelled by all of the alleles in this map? - * - * A read is poorly modeled when it's likelihood is below what would be expected for a read - * originating from one of the alleles given the maxErrorRatePerBase of the reads in general. - * - * This function makes a number of key assumptions. First, that the likelihoods reflect the total likelihood - * of the read. In other words, that the read would be fully explained by one of the alleles. This means - * that the allele should be something like the full haplotype from which the read might originate. - * - * It further assumes that each error in the read occurs with likelihood of -3 (Q30 confidence per base). So - * a read with a 10% error rate with Q30 bases that's 100 bp long we'd expect to see 10 real Q30 errors - * even against the true haplotype. So for this read to be well modelled by at least one allele we'd expect - * a likelihood to be >= 10 * -3. - * - * @param read the read we want to evaluate - * @param log10Likelihoods a list of the log10 likelihoods of the read against a set of haplotypes. - * @param maxErrorRatePerBase the maximum error rate we'd expect for this read per base, in real space. 
So - * 0.01 means a 1% error rate - * @return true if none of the log10 likelihoods imply that the read truly originated from one of the haplotypes - */ - protected boolean readIsPoorlyModelled(final GATKSAMRecord read, final Collection log10Likelihoods, final double maxErrorRatePerBase) { - final double maxErrorsForRead = Math.min(2.0, Math.ceil(read.getReadLength() * maxErrorRatePerBase)); - final double log10QualPerBase = -4.0; - final double log10MaxLikelihoodForTrueAllele = maxErrorsForRead * log10QualPerBase; - - for ( final double log10Likelihood : log10Likelihoods ) - if ( log10Likelihood >= log10MaxLikelihoodForTrueAllele ) - return false; - - return true; - } - - /** - * Get an unmodifiable set of the unique alleles in this PerReadAlleleLikelihoodMap - * @return a non-null unmodifiable map - */ - public Set getAllelesSet() { - return Collections.unmodifiableSet(allelesSet); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java deleted file mode 100644 index 0390e32d7..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java +++ /dev/null @@ -1,48 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.help; - -import java.lang.annotation.*; - -/** - * An annotation to identify a class as a GATK capability for documentation - * - * @author depristo - */ -@Documented -@Inherited -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.TYPE) -public @interface DocumentedGATKFeature { - /** Should we actually document this feature, even through it's annotated? */ - public boolean enable() default true; - /** The overall group name (walkers, readfilters) this feature is associated with */ - public String groupName(); - /** A human readable summary of the purpose of this group of features */ - public String summary() default ""; - /** Are there links to other docs that we should include? CommandLineGATK.class for walkers, for example? 
*/ - public Class[] extraDocs() default {}; -} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java deleted file mode 100644 index 7d6819f39..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java +++ /dev/null @@ -1,59 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.help; - -/** - * Documentation unit. Effectively a class version of the DocumentedGATKFeature. - * Immutable data structure. - * - * @author depristo - */ -class DocumentedGATKFeatureObject { - /** Which class are we documenting. Specific to each class being documented */ - private final Class classToDoc; - /** Are we enabled? 
*/ - private final boolean enable; - private final String groupName, summary; - private final Class[] extraDocs; - - public DocumentedGATKFeatureObject(Class classToDoc, final boolean enable, final String groupName, final String summary, final Class[] extraDocs) { - this.classToDoc = classToDoc; - this.enable = enable; - this.groupName = groupName; - this.summary = summary; - this.extraDocs = extraDocs; - } - - public DocumentedGATKFeatureObject(Class classToDoc, final String groupName, final String summary) { - this(classToDoc, true, groupName, summary, new Class[]{}); - } - - public Class getClassToDoc() { return classToDoc; } - public boolean enable() { return enable; } - public String groupName() { return groupName; } - public String summary() { return summary; } - public Class[] extraDocs() { return extraDocs; } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java deleted file mode 100644 index 63cb0900a..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java +++ /dev/null @@ -1,519 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.help; - -import com.sun.javadoc.ClassDoc; -import com.sun.javadoc.RootDoc; -import freemarker.template.Configuration; -import freemarker.template.DefaultObjectWrapper; -import freemarker.template.Template; -import freemarker.template.TemplateException; -import org.apache.commons.io.FileUtils; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.broad.tribble.FeatureCodec; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.walkers.qc.DocumentationTest; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.*; -import java.util.*; - -/** - * Javadoc Doclet that combines javadoc, GATK ParsingEngine annotations, and FreeMarker - * templates to produce html formatted GATKDocs for walkers - * and other classes. - *

- * This document has the following workflow: - *

- * 1 -- walk the javadoc hierarchy, looking for class that have the - * DocumentedGATKFeature annotation or are in the type hierarchy in the - * static list of things to document, and are to be documented - * 2 -- construct for each a GATKDocWorkUnit, resulting in the complete - * set of things to document - * 3 -- for each unit, actually generate an html page documenting it - * as well as links to related features via their units. Writing - * of a specific class HTML is accomplished by a generate DocumentationHandler - * 4 -- write out an index of all units, organized by group - *

- * The documented classes are restricted to only those with @DocumentedGATKFeature - * annotation or are in the STATIC_DOCS class. - */ -public class GATKDoclet { - final protected static Logger logger = Logger.getLogger(GATKDoclet.class); - - /** - * Where we find the help FreeMarker templates - */ - final protected static File SETTINGS_DIR = new File("settings/helpTemplates"); - - /** - * Where we write the GATKDoc html directory - */ - final protected static File DESTINATION_DIR = new File("gatkdocs"); - - final private static String FORUM_KEY_FILE = "/local/gsa-engineering/gatkdocs_publisher/forum.key"; - // ---------------------------------------------------------------------- - // - // Global variables that are set on the command line by javadoc - // - // ---------------------------------------------------------------------- - protected static String buildTimestamp = null, absoluteVersion = null; - protected static boolean showHiddenFeatures = false; - - protected static boolean testOnly = false; - - /** - * Any class that's in this list will be included in the documentation - * when the -test argument is provided. Useful for debugging. - */ - private static final List> testOnlyKeepers = Arrays.asList( - DocumentationTest.class, CommandLineGATK.class, UserException.class); - - /** - * The javadoc root doc - */ - RootDoc rootDoc; - - /** - * The set of all things we are going to document - */ - Set myWorkUnits; - - /** - * A static list of DocumentedGATKFeatureObjects. Any class that is as or extends - * one of the DocumentedGATKFeatureObjects.clazz of this collection will also - * be documented, even if it doesn't have the @DocumentedGATKFeature annotation. Useful - * when you want to document things that implement an interface (annotations on java - * interfaces aren't inherited) or whose base class isn't under your control (tribble - * codecs). 
- */ - final static Collection STATIC_DOCS = new ArrayList(); - - static { - STATIC_DOCS.add(new DocumentedGATKFeatureObject(FeatureCodec.class, - HelpConstants.DOCS_CAT_RODCODECS, - "Tribble codecs for reading reference ordered data (ROD) files such as VCF or BED")); - } - - - /** - * Extracts the contents of certain types of javadoc and adds them to an XML file. - * - * @param rootDoc The documentation root. - * @return Whether the JavaDoc run succeeded. - * @throws java.io.IOException if output can't be written. - */ - public static boolean start(RootDoc rootDoc) throws IOException { - logger.setLevel(Level.INFO); - - // load arguments - for (String[] options : rootDoc.options()) { - if (options[0].equals("-build-timestamp")) - buildTimestamp = options[1]; - if (options[0].equals("-absolute-version")) - absoluteVersion = options[1]; - if (options[0].equals("-include -hidden")) - showHiddenFeatures = true; - if (options[0].equals("-test")) - testOnly = true; - } - - // process the docs - new GATKDoclet().processDocs(rootDoc); - - - return true; - } - - /** - * Validate the given options against options supported by this doclet. - * - * @param option Option to validate. - * @return Number of potential parameters; 0 if not supported. - */ - public static int optionLength(String option) { - if (option.equals("-build-timestamp") || - option.equals("-absolute-version") || - option.equals("-include-hidden")) { - return 2; - } else if (option.equals("-test")) - return 1; - else - return 0; - } - - /** - * Are we supposed to include @Hidden annotations in our documented output? 
- * - * @return - */ - public boolean showHiddenFeatures() { - return showHiddenFeatures; - } - - /** - * @param rootDoc - */ - private void processDocs(RootDoc rootDoc) { - // setup the global access to the root - this.rootDoc = rootDoc; - - try { - // basic setup - DESTINATION_DIR.mkdirs(); - FileUtils.copyFile(new File(SETTINGS_DIR + "/bootstrap.min.css"), new File(DESTINATION_DIR + "/bootstrap.min.css")); - FileUtils.copyFile(new File(SETTINGS_DIR + "/bootstrap.min.js"), new File(DESTINATION_DIR + "/bootstrap.min.js")); - FileUtils.copyFile(new File(SETTINGS_DIR + "/jquery.min.js"), new File(DESTINATION_DIR + "/jquery.min.js")); - // print the Version number - FileUtils.writeByteArrayToFile(new File(DESTINATION_DIR + "/current.version.txt"), getSimpleVersion(absoluteVersion).getBytes()); - - /* ------------------------------------------------------------------- */ - /* You should do this ONLY ONCE in the whole application life-cycle: */ - - Configuration cfg = new Configuration(); - // Specify the data source where the template files come from. - cfg.setDirectoryForTemplateLoading(SETTINGS_DIR); - // Specify how templates will see the data-model. This is an advanced topic... 
- cfg.setObjectWrapper(new DefaultObjectWrapper()); - - myWorkUnits = computeWorkUnits(); - - List> groups = new ArrayList>(); - Set seenDocumentationFeatures = new HashSet(); - List> data = new ArrayList>(); - for (GATKDocWorkUnit workUnit : myWorkUnits) { - data.add(workUnit.indexDataMap()); - if (!seenDocumentationFeatures.contains(workUnit.annotation.groupName())) { - groups.add(toMap(workUnit.annotation)); - seenDocumentationFeatures.add(workUnit.annotation.groupName()); - } - } - - for (GATKDocWorkUnit workUnit : myWorkUnits) { - processDocWorkUnit(cfg, workUnit, groups, data); - } - - processIndex(cfg, new ArrayList(myWorkUnits)); - - File forumKeyFile = new File(FORUM_KEY_FILE); - if (forumKeyFile.exists()) { - String forumKey = null; - // Read in a one-line file so we can do a for loop - for (String line : new XReadLines(forumKeyFile)) - forumKey = line; - updateForum(myWorkUnits, forumKey); - } - } catch (FileNotFoundException e) { - throw new RuntimeException(e); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - private void updateForum(Set docWorkUnits, String forumKey) { - //first get list of posts that need to be added - List old = ForumAPIUtils.getPostedTools(forumKey); - - for (String s : old) - System.out.println(s); - - System.out.printf("Forum has %d items%n", old.size()); - System.out.printf("Docs have %d items%n", docWorkUnits.size()); - - List toAdd = new ArrayList(); - for (GATKDocWorkUnit tool : docWorkUnits) { - if (!old.contains(tool.name)) { - System.out.println("WILL POST: " + tool.name + " TO FORUM"); - toAdd.add(tool); - } - } - - //update using list - for (GATKDocWorkUnit tool : toAdd) { - //if ( tool.name.equals("ApplyRecalibration") ) - ForumAPIUtils.postToForum(tool, forumKey); - } - } - - /** - * Returns the set of all GATKDocWorkUnits that we are going to generate docs for. 
- * - * @return - */ - private Set computeWorkUnits() { - TreeSet m = new TreeSet(); - - for (ClassDoc doc : rootDoc.classes()) { - //logger.debug("Considering " + doc); - Class clazz = getClassForClassDoc(doc); - - // don't add anything that's not DocumentationTest if we are in test mode - if (clazz != null && testOnly && !testOnlyKeepers.contains(clazz)) - continue; - - //if ( clazz != null && clazz.getName().equals("org.broadinstitute.sting.gatk.walkers.annotator.AlleleBalance")) - // logger.debug("foo"); - - DocumentedGATKFeatureObject feature = getFeatureForClassDoc(doc); - DocumentedGATKFeatureHandler handler = createHandler(doc, feature); - if (handler != null && handler.includeInDocs(doc)) { - //logger.info("Generating documentation for class " + doc); - String filename = handler.getDestinationFilename(doc, clazz); - GATKDocWorkUnit unit = new GATKDocWorkUnit(doc.name(), - filename, feature.groupName(), feature, handler, doc, clazz, - buildTimestamp, absoluteVersion); - m.add(unit); - } - } - - return m; - } - - /** - * Create a handler capable of documenting the class doc according to feature. Returns - * null if no appropriate handler is found or doc shouldn't be documented at all. - * - * @param doc - * @param feature - * @return - */ - private DocumentedGATKFeatureHandler createHandler(ClassDoc doc, DocumentedGATKFeatureObject feature) { - if (feature != null) { - if (feature.enable()) { - DocumentedGATKFeatureHandler handler = new GenericDocumentationHandler(); - handler.setDoclet(this); - return handler; - } else { - logger.info("Skipping disabled Documentation for " + doc); - } - } - - return null; - } - - /** - * Returns the instantiated DocumentedGATKFeatureObject that describes the GATKDoc - * structure we will apply to Doc. 
- * - * @param doc - * @return null if this proves inappropriate or doc shouldn't be documented - */ - private DocumentedGATKFeatureObject getFeatureForClassDoc(ClassDoc doc) { - Class docClass = getClassForClassDoc(doc); - - if (docClass == null) - return null; // not annotated so it shouldn't be documented - - if (docClass.isAnnotationPresent(DocumentedGATKFeature.class)) { - DocumentedGATKFeature f = docClass.getAnnotation(DocumentedGATKFeature.class); - return new DocumentedGATKFeatureObject(docClass, f.enable(), f.groupName(), f.summary(), f.extraDocs()); - } else { - for (DocumentedGATKFeatureObject staticDocs : STATIC_DOCS) { - if (staticDocs.getClassToDoc().isAssignableFrom(docClass)) { - return new DocumentedGATKFeatureObject(docClass, staticDocs.enable(), staticDocs.groupName(), staticDocs.summary(), staticDocs.extraDocs()); - } - } - return null; - } - } - - /** - * Return the Java class described by the ClassDoc doc - * - * @param doc - * @return - */ - private Class getClassForClassDoc(ClassDoc doc) { - try { - // todo -- what do I need the ? extends Object to pass the compiler? - return (Class) DocletUtils.getClassForDoc(doc); - } catch (ClassNotFoundException e) { - //logger.warn("Couldn't find class for ClassDoc " + doc); - // we got a classdoc for a class we can't find. 
Maybe in a library or something - return null; - } catch (NoClassDefFoundError e) { - return null; - } catch (UnsatisfiedLinkError e) { - return null; // naughty BWA bindings - } - } - - /** - * Create the html index listing all of the GATKDocs features - * - * @param cfg - * @param indexData - * @throws IOException - */ - private void processIndex(Configuration cfg, List indexData) throws IOException { - /* Get or create a template */ - Template temp = cfg.getTemplate("generic.index.template.html"); - - /* Merge data-model with template */ - Writer out = new OutputStreamWriter(new FileOutputStream(new File(DESTINATION_DIR + "/index.html"))); - try { - temp.process(groupIndexData(indexData), out); - out.flush(); - } catch (TemplateException e) { - throw new ReviewedStingException("Failed to create GATK documentation", e); - } - } - - /** - * Helpful function to create the html index. Given all of the already run GATKDocWorkUnits, - * create the high-level grouping data listing individual features by group. - * - * @param indexData - * @return - */ - private Map groupIndexData(List indexData) { - // - // root -> data -> { summary -> y, filename -> z }, etc - // -> groups -> group1, group2, etc. 
- Map root = new HashMap(); - - Collections.sort(indexData); - - List> groups = new ArrayList>(); - Set seenDocumentationFeatures = new HashSet(); - List> data = new ArrayList>(); - for (GATKDocWorkUnit workUnit : indexData) { - data.add(workUnit.indexDataMap()); - if (!seenDocumentationFeatures.contains(workUnit.annotation.groupName())) { - groups.add(toMap(workUnit.annotation)); - seenDocumentationFeatures.add(workUnit.annotation.groupName()); - } - } - - //System.out.printf(groups.toString()); - - root.put("data", data); - root.put("groups", groups); - root.put("timestamp", buildTimestamp); - root.put("version", absoluteVersion); - - return root; - } - - /** - * Trivial helper routine that returns the map of name and summary given the annotation - * AND adds a super-category so that we can custom-order the categories in the index - * - * @param annotation - * @return - */ - private static final Map toMap(DocumentedGATKFeatureObject annotation) { - Map root = new HashMap(); - root.put("id", annotation.groupName().replaceAll("\\W", "")); - root.put("name", annotation.groupName()); - root.put("summary", annotation.summary()); - - /** - * Add-on super-category definitions. The assignments depend on parsing the names - * defined in HelpConstants.java so be careful of changing anything. - * Also, the super-category value strings need to be the same as used in the - * Freemarker template. This is all fairly clunky but the best I could do without - * making major changes to the DocumentedGATKFeatureObject. Doesn't help that - * Freemarker makes any scripting horribly awkward. 
- */ - final String supercatValue; - if (annotation.groupName().endsWith(" Tools")) supercatValue = "tools"; - else if (annotation.groupName().endsWith(" Utilities")) supercatValue = "utilities"; - else if (annotation.groupName().startsWith("Engine ")) supercatValue = "engine"; - else supercatValue = "other"; - - root.put("supercat", supercatValue); - - return root; - } - - /** - * Helper function that finding the GATKDocWorkUnit associated with class from among all of the work units - * - * @param c the class we are looking for - * @return the GATKDocWorkUnit whose .clazz.equals(c), or null if none could be found - */ - public final GATKDocWorkUnit findWorkUnitForClass(Class c) { - for (final GATKDocWorkUnit unit : this.myWorkUnits) - if (unit.clazz.equals(c)) - return unit; - return null; - } - - /** - * Return the ClassDoc associated with clazz - * - * @param clazz - * @return - */ - public ClassDoc getClassDocForClass(Class clazz) { - return rootDoc.classNamed(clazz.getName()); - } - - /** - * High-level function that processes a single DocWorkUnit unit using its handler - * - * @param cfg - * @param unit - * @param data - * @throws IOException - */ - private void processDocWorkUnit(Configuration cfg, GATKDocWorkUnit unit, List> groups, List> data) - throws IOException { - //System.out.printf("Processing documentation for class %s%n", unit.classDoc); - - unit.handler.processOne(unit); - unit.forTemplate.put("groups", groups); - unit.forTemplate.put("data", data); - // Get or create a template - Template temp = cfg.getTemplate(unit.handler.getTemplateName(unit.classDoc)); - - // Merge data-model with template - File outputPath = new File(DESTINATION_DIR + "/" + unit.filename); - try { - Writer out = new OutputStreamWriter(new FileOutputStream(outputPath)); - temp.process(unit.forTemplate, out); - out.flush(); - } catch (TemplateException e) { - throw new ReviewedStingException("Failed to create GATK documentation", e); - } - } - - private static String 
getSimpleVersion(String absoluteVersion) { - String[] parts = absoluteVersion.split("-"); - - // by skipping i=0, there is no trailing separator - for (int i = 1; i < 2; i++) { - parts[0] = parts[0].concat("-"); - parts[0] = parts[0].concat(parts[i]); - } - - return parts[0]; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java deleted file mode 100644 index 893a8349b..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java +++ /dev/null @@ -1,920 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils.help; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import com.sun.javadoc.ClassDoc; -import com.sun.javadoc.FieldDoc; -import com.sun.javadoc.Tag; -import org.apache.commons.lang.StringUtils; -import org.apache.log4j.Logger; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.classloader.JVMUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.StingException; - -import java.io.IOException; -import java.lang.annotation.Annotation; -import java.lang.reflect.*; -import java.util.*; - -/** - * - */ -public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { - private static Logger logger = Logger.getLogger(GenericDocumentationHandler.class); - - /** - * The max. length of the longest of --fullName -shortName argument name - * before we prefer the shorter option. 
- */ - private static final int MAX_DISPLAY_NAME = 30; - - /** - * The Class we are documenting - */ - private GATKDocWorkUnit toProcess; - - @Override - public boolean includeInDocs(ClassDoc doc) { - try { - Class type = DocletUtils.getClassForDoc(doc); - boolean hidden = !getDoclet().showHiddenFeatures() && type.isAnnotationPresent(Hidden.class); - return !hidden && JVMUtils.isConcrete(type); - } catch (ClassNotFoundException e) { - return false; - } - } - - - @Override - public String getTemplateName(ClassDoc doc) throws IOException { - return "generic.template.html"; - } - - @Override - public void processOne(GATKDocWorkUnit toProcessArg) { - this.toProcess = toProcessArg; - - //System.out.printf("%s class %s%n", toProcess.group, toProcess.classDoc); - Map root = new HashMap(); - - addHighLevelBindings(root); - addArgumentBindings(root); - addRelatedBindings(root); - root.put("group", toProcess.group); - - // Adding in retrieval of peripheral info (rf annotations etc) - getClazzAnnotations(toProcess.clazz, root); - - toProcess.setHandlerContent((String) root.get("summary"), root); - } - - /** - * Add high-level summary information about toProcess to root, such as its - * name, summary, description, version, etc. - * - * @param root - */ - protected void addHighLevelBindings(Map root) { - root.put("name", toProcess.classDoc.name()); - - // Extract overrides from the doc tags. 
- StringBuilder summaryBuilder = new StringBuilder(); - for (Tag tag : toProcess.classDoc.firstSentenceTags()) - summaryBuilder.append(tag.text()); - root.put("summary", summaryBuilder.toString()); - root.put("description", toProcess.classDoc.commentText().substring(summaryBuilder.toString().length())); - root.put("timestamp", toProcess.buildTimestamp); - root.put("version", toProcess.absoluteVersion); - - for (Tag tag : toProcess.classDoc.tags()) { - root.put(tag.name(), tag.text()); - } - } - - /** - * Add bindings describing related GATK capabilites to toProcess - * - * @param root - */ - protected void addRelatedBindings(Map root) { - List> extraDocsData = new ArrayList>(); - - // add in all of the explicitly related items - for (final Class extraDocClass : toProcess.annotation.extraDocs()) { - final GATKDocWorkUnit otherUnit = getDoclet().findWorkUnitForClass(extraDocClass); - if (otherUnit == null) - throw new ReviewedStingException("Requested extraDocs for class without any documentation: " + extraDocClass); - extraDocsData.add( - new HashMap() {{ - put("filename", otherUnit.filename); - put("name", otherUnit.name); - }}); - } - root.put("extradocs", extraDocsData); - } - - /** - * Add information about all of the arguments available to toProcess to root - * - * @param root - */ - protected void addArgumentBindings(Map root) { - ParsingEngine parsingEngine = createStandardGATKParsingEngine(); - - Map>> args = createArgumentMap(); - root.put("arguments", args); - try { - // loop over all of the arguments according to the parsing engine - for (final ArgumentSource argumentSource : parsingEngine.extractArgumentSources(DocletUtils.getClassForDoc(toProcess.classDoc))) { - // todo -- why can you have multiple ones? 
- ArgumentDefinition argDef = argumentSource.createArgumentDefinitions().get(0); - FieldDoc fieldDoc = getFieldDoc(toProcess.classDoc, argumentSource.field.getName()); - Map argBindings = docForArgument(fieldDoc, argumentSource, argDef); - if (!argumentSource.isHidden() || getDoclet().showHiddenFeatures()) { - final String kind = docKindOfArg(argumentSource); - - final Object value = argumentValue(toProcess.clazz, argumentSource); - if (value != null) - argBindings.put("defaultValue", prettyPrintValueString(value)); - - args.get(kind).add(argBindings); - args.get("all").add(argBindings); - } - } - - // sort the arguments - for (Map.Entry>> entry : args.entrySet()) { - entry.setValue(sortArguments(entry.getValue())); - } - } catch (ClassNotFoundException e) { - throw new RuntimeException(e); - } - } - - /** - * Return the argument kind (required, advanced, hidden, etc) of this argumentSource - * - * @param argumentSource - * @return - */ - @Requires("argumentSource != null") - @Ensures("result != null") - private String docKindOfArg(ArgumentSource argumentSource) { - if (argumentSource.isRequired()) { - if (argumentSource.isInput()) return "required_in"; - else if (argumentSource.isOutput()) return "required_out"; - else if (argumentSource.isFlag()) return "required_flag"; - else return "required_param"; - } - else if (argumentSource.isAdvanced()) { - if (argumentSource.isInput()) return "advanced_in"; - else if (argumentSource.isOutput()) return "advanced_out"; - else if (argumentSource.isFlag()) return "advanced_flag"; - else return "advanced_param"; - } - else if (argumentSource.isHidden()) return "hidden"; - else if (argumentSource.isDeprecated()) return "deprecated"; - else { - if (argumentSource.isInput()) return "optional_in"; - else if (argumentSource.isOutput()) return "optional_out"; - else if (argumentSource.isFlag()) return "optional_flag"; - else return "optional_param"; - } - } - - /** - * Attempts to determine the value of argumentSource in an 
instantiated version of c - * - * @param c - * @param argumentSource - * @return value of argumentSource, or null if this isn't possible - */ - @Requires({"c != null", "argumentSource != null"}) - private Object argumentValue(Class c, ArgumentSource argumentSource) { - // get the value of the field - // attempt to instantiate the class - final Object instance = makeInstanceIfPossible(toProcess.clazz); - if (instance != null) { - final Object value = getFieldValue(instance, argumentSource.field.getName()); - if (value != null) - return value; - - if (argumentSource.createsTypeDefault()) { - try { // handle the case where there's an implicit default - return argumentSource.typeDefaultDocString(); - } catch (ReviewedStingException e) { - ; // failed to create type default, don't worry about it - } - } - } - - return null; - } - - /** - * Create the argument map for holding class arguments - * - * @return - */ - private Map>> createArgumentMap() { - Map>> args = new HashMap>>(); - args.put("all", new ArrayList>()); - args.put("required_in", new ArrayList>()); - args.put("required_out", new ArrayList>()); - args.put("required_param", new ArrayList>()); - args.put("required_flag", new ArrayList>()); - args.put("optional_in", new ArrayList>()); - args.put("optional_out", new ArrayList>()); - args.put("optional_param", new ArrayList>()); - args.put("optional_flag", new ArrayList>()); - args.put("advanced_in", new ArrayList>()); - args.put("advanced_out", new ArrayList>()); - args.put("advanced_param", new ArrayList>()); - args.put("advanced_flag", new ArrayList>()); - args.put("hidden", new ArrayList>()); - args.put("deprecated", new ArrayList>()); - return args; - } - - - /** - * Sorts the individual argument list in unsorted according to CompareArgumentsByName - * - * @param unsorted - * @return - */ - private List> sortArguments(List> unsorted) { - Collections.sort(unsorted, new CompareArgumentsByName()); - return unsorted; - } - - /** - * Sort arguments by 
case-insensitive comparison ignoring the -- and - prefixes - */ - private class CompareArgumentsByName implements Comparator> { - public int compare(Map x, Map y) { - return elt(x).compareTo(elt(y)); - } - - private String elt(Map m) { - String v = m.get("name").toString().toLowerCase(); - if (v.startsWith("--")) - return v.substring(2); - else if (v.startsWith("-")) - return v.substring(1); - else - throw new RuntimeException("Expect to see arguments beginning with at least one -, but found " + v); - } - } - - /** - * Umbrella function that groups the collection of values for specific annotations applied to an - * instance of class c. Lists of collected values are added directly to the "toProcess" object. - * Requires being able to instantiate the class. - * - * @param classToProcess the object to instantiate and query for the annotation - * @param root the root of the document handler, to which we'll store collected annotations - */ - private void getClazzAnnotations(Class classToProcess, Map root) { - // - // attempt to instantiate the class - final Object instance = makeInstanceIfPossible(classToProcess); - if (instance != null) { - final Class myClass = instance.getClass(); - // Get parallelism options - final HashSet> parallelOptions = getParallelism(myClass, new HashSet>()); - root.put("parallel", parallelOptions); - // Get annotation info (what type of annotation, standard etc.) 
- final HashSet annotInfo = getAnnotInfo(myClass, new HashSet()); - root.put("annotinfo", StringUtils.join(annotInfo, ", ")); - // Get annotation field (whether it goes in INFO or FORMAT) - root.put("annotfield", getAnnotField(myClass)); - // Get walker type if applicable - root.put("walkertype", getWalkerType(myClass)); - // Get partition type if applicable - root.put("partitiontype", getPartitionType(myClass)); - // Get read filter annotations (ReadFilters) if applicable - final HashSet> bucket= getReadFilters(myClass, new HashSet>()); - root.put("readfilters", bucket); - // Get default downsampling settings - final HashMap dsSettings = getDownSamplingSettings(myClass, new HashMap()); - root.put("downsampling", dsSettings); - // Get reference window size settings - final HashMap refwindow = getRefWindow(myClass, new HashMap()); - root.put("refwindow", refwindow); - // Get ActiveRegion size settings - final HashMap activeRegion = getActiveRegion(myClass, new HashMap()); - root.put("activeregion", activeRegion); - // anything else? - } else { - // put empty items to avoid blowups - root.put("parallel", new HashSet()); - root.put("annotinfo", ""); - root.put("annotfield", ""); - root.put("walkertype", ""); - root.put("partitiontype", ""); - root.put("readfilters", new HashSet>()); - root.put("downsampling", new HashMap()); - root.put("refwindow", new HashMap()); - root.put("activeregion", new HashMap()); - } - } - - /** - * Utility function that checks which parallelism options are available for an instance of class c. 
- * - * @param myClass the class to query for the interfaces - * @param parallelOptions an empty HashSet in which to collect the info - * @return a hash set of parallelism options, otherwise an empty set - */ - private HashSet> getParallelism(Class myClass, HashSet> parallelOptions) { - // - // Retrieve interfaces - Class[] implementedInterfaces = myClass.getInterfaces(); - for (Class intfClass : implementedInterfaces) { - final HashMap nugget = new HashMap(); - if (intfClass.getSimpleName().equals("TreeReducible")) { - nugget.put("name", intfClass.getSimpleName()); - nugget.put("arg", HelpConstants.ARG_TREEREDUCIBLE); - nugget.put("link", HelpConstants.CMDLINE_GATK_URL + "#" + HelpConstants.ARG_TREEREDUCIBLE); - } else if (intfClass.getSimpleName().equals("NanoSchedulable")) { - nugget.put("name", intfClass.getSimpleName()); - nugget.put("arg", HelpConstants.ARG_NANOSCHEDULABLE); - nugget.put("link", HelpConstants.CMDLINE_GATK_URL + "#" + HelpConstants.ARG_NANOSCHEDULABLE); - } else { - continue; - } - parallelOptions.add(nugget); - } - // Look up superclasses recursively - final Class mySuperClass = myClass.getSuperclass(); - if (mySuperClass.getSimpleName().equals("Object")) { - return parallelOptions; - } - return getParallelism(mySuperClass, parallelOptions); - } - - /** - * Utility function that looks up whether the annotation goes in INFO or FORMAT field. 
- * - * @param myClass the class to query for the interfaces - * @return a String specifying the annotation field - */ - private final String getAnnotField(Class myClass) { - // - // Look up superclasses recursively until we find either - // GenotypeAnnotation or InfoFieldAnnotation - final Class mySuperClass = myClass.getSuperclass(); - if (mySuperClass == InfoFieldAnnotation.class) { - return "INFO (variant-level)"; - } else if (mySuperClass == GenotypeAnnotation.class) { - return "FORMAT (sample genotype-level)"; - } else if (mySuperClass.getSimpleName().equals("Object")) { - return ""; - } - return getAnnotField(mySuperClass); - } - - /** - * Utility function that determines the annotation type for an instance of class c. - * - * @param myClass the class to query for the interfaces - * @param annotInfo an empty HashSet in which to collect the info - * @return a hash set of the annotation types, otherwise an empty set - */ - private HashSet getAnnotInfo(Class myClass, HashSet annotInfo) { - // - // Retrieve interfaces - Class[] implementedInterfaces = myClass.getInterfaces(); - for (Class intfClass : implementedInterfaces) { - if (intfClass.getName().contains("Annotation")) { - annotInfo.add(intfClass.getSimpleName()); - } - } - // Look up superclasses recursively - final Class mySuperClass = myClass.getSuperclass(); - if (mySuperClass.getSimpleName().equals("Object")) { - return annotInfo; - } - return getAnnotInfo(mySuperClass, annotInfo); - } - - /** - * Utility function that determines the default downsampling settings for an instance of class c. 
- * - * @param myClass the class to query for the settings - * @param dsSettings an empty HashMap in which to collect the info - * @return a hash set of the downsampling settings, otherwise an empty set - */ - private HashMap getDownSamplingSettings(Class myClass, HashMap dsSettings) { - // - // Retrieve annotation - if (myClass.isAnnotationPresent(Downsample.class)) { - final Annotation thisAnnotation = myClass.getAnnotation(Downsample.class); - if(thisAnnotation instanceof Downsample) { - final Downsample dsAnnotation = (Downsample) thisAnnotation; - dsSettings.put("by", dsAnnotation.by().toString()); - dsSettings.put("to_cov", dsAnnotation.toCoverage()); - } - } - return dsSettings; - } - - /** - * Utility function that determines the reference window size for an instance of class c. - * - * @param myClass the class to query for the settings - * @param refWindow an empty HashMap in which to collect the info - * @return a HashMap of the window start and stop, otherwise an empty HashMap - */ - private HashMap getRefWindow(Class myClass, HashMap refWindow) { - // - // Retrieve annotation - if (myClass.isAnnotationPresent(Reference.class)) { - final Annotation thisAnnotation = myClass.getAnnotation(Reference.class); - if(thisAnnotation instanceof Reference) { - final Reference refAnnotation = (Reference) thisAnnotation; - refWindow.put("start", refAnnotation.window().start()); - refWindow.put("stop", refAnnotation.window().stop()); - } - } - return refWindow; - } - - /** - * Utility function that determines the ActiveRegion settings for an instance of class c. 
- * - * @param myClass the class to query for the settings - * @param activeRegion an empty HashMap in which to collect the info - * @return a HashMap of the ActiveRegion parameters, otherwise an empty HashMap - */ - private HashMap getActiveRegion(Class myClass, HashMap activeRegion) { - // - // Retrieve annotation - if (myClass.isAnnotationPresent(ActiveRegionTraversalParameters.class)) { - final Annotation thisAnnotation = myClass.getAnnotation(ActiveRegionTraversalParameters.class); - if(thisAnnotation instanceof ActiveRegionTraversalParameters) { - final ActiveRegionTraversalParameters arAnnotation = (ActiveRegionTraversalParameters) thisAnnotation; - activeRegion.put("ext", arAnnotation.extension()); - activeRegion.put("max", arAnnotation.maxRegion()); - activeRegion.put("min", arAnnotation.minRegion()); - } - } - return activeRegion; - } - - /** - * Utility function that determines the partition type of an instance of class c. - * - * @param myClass the class to query for the annotation - * @return the partition type if applicable, otherwise an empty string - */ - private String getPartitionType(Class myClass) { - // - // Retrieve annotation - if (myClass.isAnnotationPresent(PartitionBy.class)) { - final Annotation thisAnnotation = myClass.getAnnotation(PartitionBy.class); - if(thisAnnotation instanceof PartitionBy) { - final PartitionBy partAnnotation = (PartitionBy) thisAnnotation; - return partAnnotation.value().toString(); - } - } - return ""; - } - - /** - * Utility function that determines the type of walker subclassed by an instance of class c. 
- * - * @param myClass the class to query for the annotation - * @return the type of walker if applicable, otherwise an empty string - */ - private String getWalkerType(Class myClass) { - // - // Look up superclasses recursively until we find either Walker or Object - final Class mySuperClass = myClass.getSuperclass(); - if (mySuperClass.getSimpleName().equals("Walker")) { - return myClass.getSimpleName(); - } else if (mySuperClass.getSimpleName().equals("Object")) { - return ""; - } - return getWalkerType(mySuperClass); - } - - /** - * Utility function that finds the values of ReadFilters annotation applied to an instance of class c. - * - * @param myClass the class to query for the annotation - * @param bucket a container in which we store the annotations collected - * @return a hash set of values, otherwise an empty set - */ - private HashSet> getReadFilters(Class myClass, HashSet> bucket) { - // - // Retrieve annotation - if (myClass.isAnnotationPresent(ReadFilters.class)) { - final Annotation thisAnnotation = myClass.getAnnotation(ReadFilters.class); - if(thisAnnotation instanceof ReadFilters) { - final ReadFilters rfAnnotation = (ReadFilters) thisAnnotation; - for (Class filter : rfAnnotation.value()) { - // make hashmap of simplename and url - final HashMap nugget = new HashMap(); - nugget.put("name", filter.getSimpleName()); - nugget.put("filename", GATKDocUtils.htmlFilenameForClass(filter)); - bucket.add(nugget); - } - } - } - // Look up superclasses recursively - final Class mySuperClass = myClass.getSuperclass(); - if (mySuperClass.getSimpleName().equals("Object")) { - return bucket; - } - return getReadFilters(mySuperClass, bucket); - } - - - /** - * Utility function that finds the value of fieldName in any fields of ArgumentCollection fields in - * instance of class c. 
- * - * @param instance the object to query for the field value - * @param fieldName the name of the field we are looking for in instance - * @return The value assigned to field in the ArgumentCollection, otherwise null - */ - private Object getFieldValue(Object instance, String fieldName) { - // - // subtle note. If you have a field named X that is an ArgumentCollection that - // contains a field X as well, you need only consider fields in the argumentCollection, not - // matching the argument itself. - // - // @ArgumentCollection - // protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); - // - - for (Field field : JVMUtils.getAllFields(instance.getClass())) { - if (field.isAnnotationPresent(ArgumentCollection.class)) { - //System.out.printf("Searching for %s in argument collection field %s%n", fieldName, field); - Object fieldValue = JVMUtils.getFieldValue(field, instance); - Object value = getFieldValue(fieldValue, fieldName); - if (value != null) - return value; - } else if (field.getName().equals(fieldName)) { - return JVMUtils.getFieldValue(field, instance); - } - } - - return null; - } - - /** - * Pretty prints value - *

- * Assumes value != null - * - * @param value - * @return - */ - private Object prettyPrintValueString(Object value) { - if (value.getClass().isArray()) { - Class type = value.getClass().getComponentType(); - if (boolean.class.isAssignableFrom(type)) - return Arrays.toString((boolean[]) value); - if (byte.class.isAssignableFrom(type)) - return Arrays.toString((byte[]) value); - if (char.class.isAssignableFrom(type)) - return Arrays.toString((char[]) value); - if (double.class.isAssignableFrom(type)) - return Arrays.toString((double[]) value); - if (float.class.isAssignableFrom(type)) - return Arrays.toString((float[]) value); - if (int.class.isAssignableFrom(type)) - return Arrays.toString((int[]) value); - if (long.class.isAssignableFrom(type)) - return Arrays.toString((long[]) value); - if (short.class.isAssignableFrom(type)) - return Arrays.toString((short[]) value); - if (Object.class.isAssignableFrom(type)) - return Arrays.toString((Object[]) value); - else - throw new RuntimeException("Unexpected array type in prettyPrintValue. Value was " + value + " type is " + type); - } else if (RodBinding.class.isAssignableFrom(value.getClass())) { - // annoying special case to handle the UnBound() constructor - return "none"; - } else if (value instanceof String) { - return value.equals("") ? "\"\"" : value; - } else { - return value.toString(); - } - } - - /** - * Attempt to instantiate class c, if possible. Returns null if this proves impossible. 
- * - * @param c - * @return - */ - private Object makeInstanceIfPossible(Class c) { - Object instance = null; - try { - // don't try to make something where we will obviously fail - if (!c.isEnum() && !c.isAnnotation() && !c.isAnonymousClass() && - !c.isArray() && !c.isPrimitive() & JVMUtils.isConcrete(c)) { - instance = c.newInstance(); - //System.out.printf("Created object of class %s => %s%n", c, instance); - return instance; - } else - return null; - } catch (IllegalAccessException e) { - } catch (InstantiationException e) { - } catch (ExceptionInInitializerError e) { - } catch (SecurityException e) { - } - // this last one is super dangerous, but some of these methods catch ClassNotFoundExceptions - // and rethrow then as RuntimeExceptions - catch (RuntimeException e) { - } - - return instance; - } - - - /** - * Create an instance of the GATK parsing engine, for argument processing with GATKDoclet - * - * @return - */ - private ParsingEngine createStandardGATKParsingEngine() { - CommandLineProgram clp = new CommandLineGATK(); - try { - CommandLineProgram.start(clp, new String[]{}, true); - return clp.parser; - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - /** - * Gets the javadocs associated with field name in classDoc. Throws a - * runtime exception if this proves impossible. 
- * - * @param classDoc - * @param name - * @return - */ - private FieldDoc getFieldDoc(ClassDoc classDoc, String name) { - return getFieldDoc(classDoc, name, true); - } - - /** - * Recursive helper routine to getFieldDoc() - * - * @param classDoc - * @param name - * @param primary - * @return - */ - private FieldDoc getFieldDoc(ClassDoc classDoc, String name, boolean primary) { - //System.out.printf("Looking for %s in %s%n", name, classDoc.name()); - for (FieldDoc fieldDoc : classDoc.fields(false)) { - //System.out.printf("fieldDoc " + fieldDoc + " name " + fieldDoc.name()); - if (fieldDoc.name().equals(name)) - return fieldDoc; - - Field field = DocletUtils.getFieldForFieldDoc(fieldDoc); - if (field == null) - throw new RuntimeException("Could not find the field corresponding to " + fieldDoc + ", presumably because the field is inaccessible"); - if (field.isAnnotationPresent(ArgumentCollection.class)) { - ClassDoc typeDoc = getRootDoc().classNamed(fieldDoc.type().qualifiedTypeName()); - if (typeDoc == null) - throw new ReviewedStingException("Tried to get javadocs for ArgumentCollection field " + fieldDoc + " but could't find the class in the RootDoc"); - else { - FieldDoc result = getFieldDoc(typeDoc, name, false); - if (result != null) - return result; - // else keep searching - } - } - } - - // if we didn't find it here, wander up to the superclass to find the field - if (classDoc.superclass() != null) { - return getFieldDoc(classDoc.superclass(), name, false); - } - - if (primary) - throw new RuntimeException("No field found for expected field " + name); - else - return null; - } - - /** - * Returns a Pair of (main, synonym) names for argument with fullName s1 and - * shortName s2. The main is selected to be the longest of the two, provided - * it doesn't exceed MAX_DISPLAY_NAME, in which case the shorter is taken. 
- * - * @param s1 the short argument name without -, or null if not provided - * @param s2 the long argument name without --, or null if not provided - * @return A pair of fully qualified names (with - or --) for the argument. The first - * element is the primary display name while the second (potentially null) is a - * synonymous name. - */ - Pair displayNames(String s1, String s2) { - s1 = s1 == null ? null : "-" + s1; - s2 = s2 == null ? null : "--" + s2; - - if (s1 == null) return new Pair(s2, null); - if (s2 == null) return new Pair(s1, null); - - String l = s1.length() > s2.length() ? s1 : s2; - String s = s1.length() > s2.length() ? s2 : s1; - - if (l.length() > MAX_DISPLAY_NAME) - return new Pair(s, l); - else - return new Pair(l, s); - } - - /** - * Returns a human readable string that describes the Type type of a GATK argument. - *

- * This will include parameterized types, so that Set{T} shows up as Set(T) and not - * just Set in the docs. - * - * @param type - * @return - */ - protected String argumentTypeString(Type type) { - if (type instanceof ParameterizedType) { - ParameterizedType parameterizedType = (ParameterizedType) type; - List subs = new ArrayList(); - for (Type actualType : parameterizedType.getActualTypeArguments()) - subs.add(argumentTypeString(actualType)); - return argumentTypeString(((ParameterizedType) type).getRawType()) + "[" + Utils.join(",", subs) + "]"; - } else if (type instanceof GenericArrayType) { - return argumentTypeString(((GenericArrayType) type).getGenericComponentType()) + "[]"; - } else if (type instanceof WildcardType) { - throw new RuntimeException("We don't support wildcards in arguments: " + type); - } else if (type instanceof Class) { - return ((Class) type).getSimpleName(); - } else { - throw new StingException("Unknown type: " + type); - } - } - - /** - * Helper routine that returns the Feature.class required by a RodBinding, - * either T for RodBinding{T} or List{RodBinding{T}}. Returns null if - * the Type doesn't fit either model. - * - * @param type - * @return - */ - protected Class getFeatureTypeIfPossible(Type type) { - if (type instanceof ParameterizedType) { - ParameterizedType paramType = (ParameterizedType) type; - if (RodBinding.class.isAssignableFrom((Class) paramType.getRawType())) { - return (Class) JVMUtils.getParameterizedTypeClass(type); - } else { - for (Type paramtype : paramType.getActualTypeArguments()) { - Class x = getFeatureTypeIfPossible(paramtype); - if (x != null) - return x; - } - } - } - - return null; - } - - /** - * High-level entry point for creating a FreeMarker map describing the GATK argument - * source with definition def, with associated javadoc fieldDoc. 
- * - * @param fieldDoc - * @param source - * @param def - * @return a non-null Map binding argument keys with their values - */ - protected Map docForArgument(FieldDoc fieldDoc, ArgumentSource source, ArgumentDefinition def) { - Map root = new HashMap(); - Pair names = displayNames(def.shortName, def.fullName); - - root.put("name", names.getFirst()); - - if (names.getSecond() != null) - root.put("synonyms", names.getSecond()); - - root.put("required", def.required ? "yes" : "no"); - - // type of the field - root.put("type", argumentTypeString(source.field.getGenericType())); - - Class featureClass = getFeatureTypeIfPossible(source.field.getGenericType()); - if (featureClass != null) { - // deal with the allowable types - FeatureManager manager = new FeatureManager(); - List rodTypes = new ArrayList(); - for (FeatureManager.FeatureDescriptor descriptor : manager.getByFeature(featureClass)) { - rodTypes.add(String.format("%s", - GATKDocUtils.htmlFilenameForClass(descriptor.getCodecClass()), - descriptor.getName())); - } - - root.put("rodTypes", Utils.join(", ", rodTypes)); - } - - // summary and fulltext - root.put("summary", def.doc != null ? def.doc : ""); - root.put("fulltext", fieldDoc.commentText()); - - // What are our enum options? - if (def.validOptions != null) - root.put("options", docForEnumArgument(source.field.getType())); - - // general attributes - List attributes = new ArrayList(); - if (def.required) attributes.add("required"); - if (source.isDeprecated()) attributes.add("deprecated"); - if (attributes.size() > 0) - root.put("attributes", Utils.join(", ", attributes)); - - return root; - } - - /** - * Helper routine that provides a FreeMarker map for an enumClass, grabbing the - * values of the enum and their associated javadoc documentation. 
- * - * @param enumClass - * @return - */ - @Requires("enumClass.isEnum()") - private List> docForEnumArgument(final Class enumClass) { - final ClassDoc doc = this.getDoclet().getClassDocForClass(enumClass); - if ( doc == null ) - throw new RuntimeException("Tried to get docs for enum " + enumClass + " but got null instead"); - - final Set enumConstantFieldNames = enumConstantsNames(enumClass); - - final List> bindings = new ArrayList>(); - for (final FieldDoc fieldDoc : doc.fields(false)) { - if (enumConstantFieldNames.contains(fieldDoc.name()) ) - bindings.add( - new HashMap() {{ - put("name", fieldDoc.name()); - put("summary", fieldDoc.commentText()); - }}); - } - - return bindings; - } - - /** - * Returns the name of the fields that are enum constants according to reflection - * - * @return a non-null set of fields that are enum constants - */ - private Set enumConstantsNames(final Class enumClass) { - final Set enumConstantFieldNames = new HashSet(); - - for ( final Field field : enumClass.getFields() ) { - if ( field.isEnumConstant() ) - enumConstantFieldNames.add(field.getName()); - } - - return enumConstantFieldNames; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java deleted file mode 100644 index 2ed35d848..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java +++ /dev/null @@ -1,64 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright 
notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.help; - -public class HelpConstants { - - public final static String BASE_GATK_URL = "http://www.broadinstitute.org/gatk"; - public final static String GATK_DOCS_URL = BASE_GATK_URL + "/gatkdocs/"; - public final static String GATK_FORUM_URL = "http://gatkforums.broadinstitute.org/"; - public final static String GATK_FORUM_API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; - - /** - * Arguments for parallelism options - */ - public final static String ARG_TREEREDUCIBLE = "-nt"; - public final static String ARG_NANOSCHEDULABLE = "-nct"; - public final static String CMDLINE_GATK_URL = GATK_DOCS_URL + "org_broadinstitute_sting_gatk_CommandLineGATK.html"; - - /** - * Definition of the group names / categories of tools. 
- * The names get parsed to make supercategories in the doc index, - * so be careful when making big changes -- see GATKDoclet.java toMap() - */ - public final static String DOCS_CAT_DATA = "Sequence Data Processing Tools"; - public final static String DOCS_CAT_QC = "Diagnostics and Quality Control Tools"; - public final static String DOCS_CAT_ENGINE = "Engine Parameters (available to all tools)"; - public final static String DOCS_CAT_RF = "Read Filters"; - public final static String DOCS_CAT_REFUTILS = "Reference Utilities"; - public final static String DOCS_CAT_RODCODECS = "ROD Codecs"; - public final static String DOCS_CAT_USRERR = "User Exceptions"; - public final static String DOCS_CAT_VALIDATION = "Validation Utilities"; - public final static String DOCS_CAT_ANNOT = "Variant Annotations"; - public final static String DOCS_CAT_VARDISC = "Variant Discovery Tools"; - public final static String DOCS_CAT_VARMANIP = "Variant Evaluation and Manipulation Tools"; - public final static String DOCS_CAT_TEST = "Testing Tools"; - public final static String DOCS_CAT_HELPUTILS = "Help Utilities"; - - public static String forumPost(String post) { - return GATK_FORUM_URL + post; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java deleted file mode 100644 index d700bff28..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java +++ /dev/null @@ -1,317 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to 
do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.help; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.text.TextFormattingUtils; - -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.*; -/** - * Print out help for Sting command-line applications. - */ - -public class HelpFormatter { - /** our log, which we want to capture anything from org.broadinstitute.sting */ - private static Logger logger = Logger.getLogger(HelpFormatter.class); - - public static final int FIELD_SEPARATION_WIDTH = 3; - - /** - * Prints the help, given a collection of argument definitions. - * @param applicationDetails Application details - * @param argumentDefinitions Argument definitions for which help should be printed. 
- */ - public void printHelp( ApplicationDetails applicationDetails, ArgumentDefinitions argumentDefinitions ) { - List argumentGroups = prepareArgumentGroups( argumentDefinitions ); - - List header = applicationDetails.applicationHeader; - String barrier = createBarrier(header); - - System.out.printf("%s%n",barrier); - for(String headerLine: header) - System.out.printf("%s%n",headerLine); - System.out.printf("%s%n",barrier); - for(String attributionLine: applicationDetails.attribution) - System.out.printf("%s%n",attributionLine); - System.out.printf("%s%n",barrier); - - String synopsis = getSynopsis(applicationDetails.runningInstructions,argumentGroups); - String additionalDetails = applicationDetails.additionalHelp != null ? applicationDetails.additionalHelp : ""; - String detailedDescription = getDetailed(argumentGroups); - - System.out.printf("%s%n%s%n%s%n",synopsis,detailedDescription,additionalDetails ); - } - - /** - * Gets the synopsis: the actual command to run. - * @param runningInstructions Instructions on how to run hte application. - * @param argumentGroups Program arguments sorted in order of definition group displays. - * @return A synopsis line. - */ - private String getSynopsis( String runningInstructions, - List argumentGroups ) { - // Build out the synopsis all as one long line. 
- StringBuilder lineBuilder = new StringBuilder(); - Formatter lineFormatter = new Formatter( lineBuilder ); - - lineFormatter.format("java %s", runningInstructions); - - for( ArgumentDefinitionGroup argumentGroup: argumentGroups ) { - for( ArgumentDefinition argumentDefinition: argumentGroup.argumentDefinitions ) { - if(argumentDefinition.isHidden) - continue; - lineFormatter.format(" "); - if( !argumentDefinition.required ) lineFormatter.format("["); - if( argumentDefinition.shortName != null ) - lineFormatter.format("-%s", argumentDefinition.shortName); - else - lineFormatter.format("--%s", argumentDefinition.fullName); - if( !argumentDefinition.isFlag ) - lineFormatter.format(" <%s>", argumentDefinition.fullName); - if( !argumentDefinition.required ) lineFormatter.format("]"); - } - } - - // Word wrap the synopsis. - List wrappedSynopsis = TextFormattingUtils.wordWrap( lineBuilder.toString(), TextFormattingUtils.DEFAULT_LINE_WIDTH ); - - String header = "usage: "; - int headerLength = header.length(); - - StringBuilder synopsisBuilder = new StringBuilder(); - Formatter synopsisFormatter = new Formatter(synopsisBuilder); - for( String synopsisLine: wrappedSynopsis ) { - synopsisFormatter.format("%" + headerLength + "s%s%n", header, synopsisLine); - header = ""; - } - - return synopsisBuilder.toString(); - } - - /** - * Gets detailed output about each argument type. - * @param argumentGroups Collection of program arguments sorted according to how they should be shown. - * @return Detailed text about all arguments. - */ - private String getDetailed( List argumentGroups ) { - StringBuilder builder = new StringBuilder(); - - for( ArgumentDefinitionGroup argumentGroup: argumentGroups ) - builder.append( getDetailForGroup( argumentGroup ) ); - - return builder.toString(); - } - - /** - * Gets a detailed description for a given argument group. - * @param argumentDefinitionGroup The group of argument definitions to render. 
- * @return A string giving detailed info about the contents of this group. - */ - private String getDetailForGroup( ArgumentDefinitionGroup argumentDefinitionGroup ) { - if(argumentDefinitionGroup.allHidden()) - return ""; - - StringBuilder builder = new StringBuilder(); - Formatter formatter = new Formatter( builder ); - - if( argumentDefinitionGroup.groupName != null && argumentDefinitionGroup.argumentDefinitions.size() != 0 ) - builder.append( String.format("%nArguments for %s:%n", argumentDefinitionGroup.groupName ) ); - - List argumentDefinitions = new ArrayList(); - for(ArgumentDefinition argumentDefinition: argumentDefinitionGroup.argumentDefinitions) { - if(!argumentDefinition.isHidden) - argumentDefinitions.add(argumentDefinition); - } - - // Try to fit the entire argument definition across the screen, but impose an arbitrary cap of 3/4 * - // LINE_WIDTH in case the length of the arguments gets out of control. - int argWidth = Math.min( findLongestArgumentCallingInfo(argumentDefinitions), (TextFormattingUtils.DEFAULT_LINE_WIDTH*3)/4 - FIELD_SEPARATION_WIDTH ); - int docWidth = TextFormattingUtils.DEFAULT_LINE_WIDTH - argWidth - FIELD_SEPARATION_WIDTH; - - for( ArgumentDefinition argumentDefinition: argumentDefinitions ) { - Iterator wordWrappedArgs = TextFormattingUtils.wordWrap( getArgumentCallingInfo(argumentDefinition), argWidth ).iterator(); - Iterator wordWrappedDoc = TextFormattingUtils.wordWrap( getArgumentDoc(argumentDefinition), docWidth ).iterator(); - - while( wordWrappedArgs.hasNext() || wordWrappedDoc.hasNext() ) { - String arg = wordWrappedArgs.hasNext() ? wordWrappedArgs.next() : ""; - String doc = wordWrappedDoc.hasNext() ? wordWrappedDoc.next() : ""; - - String formatString = "%-" + argWidth + "s%" + FIELD_SEPARATION_WIDTH + "s%s%n"; - formatter.format( formatString, arg, "", doc ); - } - } - - return builder.toString(); - } - - /** - * Gets a string indicating how this argument should be passed to the application. 
- * @param argumentDefinition Argument definition for which help should be printed. - * @return Calling information for this argument. - */ - private String getArgumentCallingInfo( ArgumentDefinition argumentDefinition ) { - StringBuilder builder = new StringBuilder(); - Formatter formatter = new Formatter( builder ); - - formatter.format(" "); - if( argumentDefinition.shortName != null ) - formatter.format("-%s,", argumentDefinition.shortName); - formatter.format("--%s", argumentDefinition.fullName); - if( !argumentDefinition.isFlag ) - formatter.format(" <%s>", argumentDefinition.fullName); - - return builder.toString(); - } - - /** - * Gets a string of argument documentation. - * @param argumentDefinition Argument definition for which help should be printed. - * @return Brief description for this argument. - */ - private String getArgumentDoc( ArgumentDefinition argumentDefinition ) { - StringBuilder builder = new StringBuilder(); - builder.append(argumentDefinition.doc); - if( argumentDefinition.validOptions != null ) { - builder.append(" ("); - builder.append(Utils.join("|",argumentDefinition.validOptions)); - builder.append(")"); - } - return builder.toString(); - } - - /** - * Crude implementation which finds the longest argument portion - * given a set of arguments. - * @param argumentDefinitions argument definitions to inspect. - * @return longest argument length. - */ - private int findLongestArgumentCallingInfo( Collection argumentDefinitions ) { - int longest = 0; - for( ArgumentDefinition argumentDefinition: argumentDefinitions ) { - String argumentText = getArgumentCallingInfo( argumentDefinition ); - if( longest < argumentText.length() ) - longest = argumentText.length(); - } - return longest; - } - - /** - * Extract the argument definition groups from the argument definitions and arrange them appropriately. - * For help, we want the arguments sorted as they are declared in the class. 
However, required arguments - * should appear before optional arguments. - * @param argumentDefinitions Argument definitions from which to extract argument groups. - * @return A list of argument groups sorted in display order. - */ - private List prepareArgumentGroups( ArgumentDefinitions argumentDefinitions ) { - // Sort the list of argument definitions according to how they should be shown. - // Put the sorted results into a new cloned data structure. - Comparator definitionComparator = new Comparator() { - public int compare( ArgumentDefinition lhs, ArgumentDefinition rhs ) { - if( lhs.required && rhs.required ) return 0; - if( lhs.required ) return -1; - if( rhs.required ) return 1; - return 0; - } - }; - - List argumentGroups = new ArrayList(); - for( ArgumentDefinitionGroup argumentGroup: argumentDefinitions.getArgumentDefinitionGroups() ) { - List sortedDefinitions = new ArrayList( argumentGroup.argumentDefinitions ); - Collections.sort( sortedDefinitions, definitionComparator ); - argumentGroups.add( new ArgumentDefinitionGroup(argumentGroup.groupName,sortedDefinitions) ); - } - - // Sort the argument groups themselves with main arguments first, followed by plugins sorted in name order. - Comparator groupComparator = new Comparator() { - public int compare( ArgumentDefinitionGroup lhs, ArgumentDefinitionGroup rhs ) { - if( lhs.groupName == null && rhs.groupName == null ) return 0; - if( lhs.groupName == null ) return -1; - if( rhs.groupName == null ) return 1; - return lhs.groupName.compareTo(rhs.groupName); - } - }; - Collections.sort( argumentGroups, groupComparator ); - - - return argumentGroups; - } - - /** - * generateHeaderInformation - *

- *

- * Generate a standard header for the logger - * - * @param applicationDetails details of the application to run. - * @param parsedArgs the arguments passed in - */ - public static void generateHeaderInformation(ApplicationDetails applicationDetails, Map parsedArgs) { - - DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss"); - java.util.Date date = new java.util.Date(); - - String barrier = createBarrier(applicationDetails.applicationHeader); - - logger.info(barrier); - for (String headerLine : applicationDetails.applicationHeader) - logger.info(headerLine); - logger.debug("Current directory: " + System.getProperty("user.dir")); - for (Map.Entry entry: parsedArgs.entrySet()) { - ArgumentMatchSource matchSource = entry.getKey(); - final String sourceName; - switch (matchSource.getType()) { - case CommandLine: sourceName = "Program"; break; - case Provider: sourceName = matchSource.getDescription(); break; - default: throw new RuntimeException("Unexpected argument match source type: " + matchSource.getType()); - } - - String output = sourceName + " Args: " + entry.getValue().getDescription(); - logger.info(output); - } - logger.info("Date/Time: " + dateFormat.format(date)); - logger.info(barrier); - - for(String attribution: applicationDetails.attribution) - logger.info(attribution); - logger.info(barrier); - } - - /** - * Create a barrier to use to distinguish the header from the rest of the output. - * @param text A collection of lines to output as part of a header. - * @return A barrier consisting of the '-' character. 
- */ - private static String createBarrier(List text) { - int barrierWidth = 0; - for(String headerLine: text) - barrierWidth = Math.max(headerLine.length(),barrierWidth); - return String.format("%0" + barrierWidth + "d",0).replace('0','-'); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java deleted file mode 100644 index 86f3500be..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java +++ /dev/null @@ -1,379 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils.locusiterator; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import com.google.java.contract.Requires; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -/** - * Steps a single read along its alignment to the genome - * - * The logical model for generating extended events is as follows: the "record state" - * implements the traversal along the reference; thus stepForwardOnGenome() returns - * on every and only on actual reference bases. This can be a (mis)match or a deletion - * (in the latter case, we still return on every individual reference base the deletion spans). - * - * User: depristo - * Date: 1/5/13 - * Time: 1:08 PM - */ -@Invariant({ - "nCigarElements >= 0", - "cigar != null", - "read != null", - "currentCigarElementOffset >= -1", - "currentCigarElementOffset <= nCigarElements" -}) -public class AlignmentStateMachine { - /** - * Our read - */ - private final GATKSAMRecord read; - private final Cigar cigar; - private final int nCigarElements; - private int currentCigarElementOffset = -1; - - /** - * how far are we offset from the start of the read bases? - */ - private int readOffset; - - /** - * how far are we offset from the alignment start on the genome? - */ - private int genomeOffset; - - /** - * Our cigar element - */ - private CigarElement currentElement; - - /** - * how far are we into our cigarElement? 
- */ - private int offsetIntoCurrentCigarElement; - - @Requires({"read != null", "read.getAlignmentStart() != -1", "read.getCigar() != null"}) - public AlignmentStateMachine(final GATKSAMRecord read) { - this.read = read; - this.cigar = read.getCigar(); - this.nCigarElements = cigar.numCigarElements(); - initializeAsLeftEdge(); - } - - /** - * Initialize the state variables to put this machine one bp before the - * start of the alignment, so that a call to stepForwardOnGenome() will advance - * us to the first proper location - */ - @Ensures("isLeftEdge()") - private void initializeAsLeftEdge() { - readOffset = offsetIntoCurrentCigarElement = genomeOffset = -1; - currentElement = null; - } - - /** - * Get the read we are aligning to the genome - * @return a non-null GATKSAMRecord - */ - @Ensures("result != null") - public GATKSAMRecord getRead() { - return read; - } - - /** - * Get the reference index of the underlying read - * - * @return the reference index of the read - */ - @Ensures("result == getRead().getReferenceIndex()") - public int getReferenceIndex() { - return getRead().getReferenceIndex(); - } - - /** - * Is our read a reduced read? - * - * @return true if the read we encapsulate is a reduced read, otherwise false - */ - public boolean isReducedRead() { - return read.isReducedRead(); - } - - /** - * Is this the left edge state? I.e., one that is before or after the current read? - * @return true if this state is an edge state, false otherwise - */ - public boolean isLeftEdge() { - return readOffset == -1; - } - - /** - * Are we on the right edge? I.e., is the current state off the right of the alignment? - * @return true if off the right edge, false if otherwise - */ - public boolean isRightEdge() { - return readOffset == read.getReadLength(); - } - - /** - * What is our current offset in the read's bases that aligns us with the reference genome? - * - * @return the current read offset position. 
If an edge will be == -1 - */ - @Ensures("result >= -1") - public int getReadOffset() { - return readOffset; - } - - /** - * What is the current offset w.r.t. the alignment state that aligns us to the readOffset? - * - * @return the current offset from the alignment start on the genome. If this state is - * at the left edge the result will be -1; - */ - @Ensures("result >= -1") - public int getGenomeOffset() { - return genomeOffset; - } - - /** - * Get the position (1-based as standard) of the current alignment on the genome w.r.t. the read's alignment start - * @return the position on the genome of the current state in absolute coordinates - */ - @Ensures("result > 0") - public int getGenomePosition() { - return read.getAlignmentStart() + getGenomeOffset(); - } - - /** - * Gets #getGenomePosition but as a 1 bp GenomeLoc - * @param genomeLocParser the parser to use to create the genome loc - * @return a non-null genome location with start position of getGenomePosition - */ - @Requires("genomeLocParser != null") - @Ensures("result != null") - public GenomeLoc getLocation(final GenomeLocParser genomeLocParser) { - // TODO -- may return wonky results if on an edge (could be 0 or could be beyond genome location) - return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition()); - } - - /** - * Get the cigar element we're currently aligning with. - * - * For example, if the cigar string is 2M2D2M and we're in the second step of the - * first 2M, then this function returns the element 2M. After calling stepForwardOnGenome - * this function would return 2D. - * - * @return the cigar element, or null if we're the left edge - */ - @Ensures("result != null || isLeftEdge() || isRightEdge()") - public CigarElement getCurrentCigarElement() { - return currentElement; - } - - /** - * Get the offset of the current cigar element among all cigar elements in the read - * - * Suppose our read's cigar is 1M2D3M, and we're at the first 1M. This would - * return 0. 
Stepping forward puts us in the 2D, so our offset is 1. Another - * step forward would result in a 1 again (we're in the second position of the 2D). - * Finally, one more step forward brings us to 2 (for the 3M element) - * - * @return the offset of the current cigar element in the reads's cigar. Will return -1 for - * when the state is on the left edge, and be == the number of cigar elements in the - * read when we're past the last position on the genome - */ - @Ensures({"result >= -1", "result <= nCigarElements"}) - public int getCurrentCigarElementOffset() { - return currentCigarElementOffset; - } - - /** - * Get the offset of the current state into the current cigar element - * - * That is, suppose we have a read with cigar 2M3D4M, and we're right at - * the second M position. offsetIntoCurrentCigarElement would be 1, as - * it's two elements into the 2M cigar. Now stepping forward we'd be - * in cigar element 3D, and our offsetIntoCurrentCigarElement would be 0. - * - * @return the offset (from 0) of the current state in the current cigar element. - * Will be 0 on the right edge, and -1 on the left. - */ - @Ensures({"result >= 0 || (result == -1 && isLeftEdge())", "!isRightEdge() || result == 0"}) - public int getOffsetIntoCurrentCigarElement() { - return offsetIntoCurrentCigarElement; - } - - /** - * Convenience accessor of the CigarOperator of the current cigar element - * - * Robust to the case where we're on the edge, and currentElement is null, in which - * case this function returns null as well - * - * @return null if this is an edge state - */ - @Ensures("result != null || isLeftEdge() || isRightEdge()") - public CigarOperator getCigarOperator() { - return currentElement == null ? 
null : currentElement.getOperator(); - } - - @Override - public String toString() { - return String.format("%s ro=%d go=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, offsetIntoCurrentCigarElement, currentElement); - } - - // ----------------------------------------------------------------------------------------------- - // - // Code for setting up prev / next states - // - // ----------------------------------------------------------------------------------------------- - - /** - * Step the state machine forward one unit - * - * Takes the current state of this machine, and advances the state until the next on-genome - * cigar element (M, X, =, D) is encountered, at which point this function returns with the - * cigar operator of the current element. - * - * Assumes that the AlignmentStateMachine is in the left edge state at the start, so that - * stepForwardOnGenome() can be called to move the machine to the first alignment position. That - * is, the normal use of this code is: - * - * AlignmentStateMachine machine = new AlignmentStateMachine(read) - * machine.stepForwardOnGenome() - * // now the machine is at the first position on the genome - * - * When stepForwardOnGenome() advances off the right edge of the read, the state machine is - * left in a state such that isRightEdge() returns true and returns null, indicating the - * the machine cannot advance further. The machine may explode, though this is not contracted, - * if stepForwardOnGenome() is called after a previous call returned null. 
- * - * @return the operator of the cigar element that machine stopped at, null if we advanced off the end of the read - */ - @Ensures("result != null || isRightEdge()") - public CigarOperator stepForwardOnGenome() { - // loop until we either find a cigar element step that moves us one base on the genome, or we run - // out of cigar elements - while ( true ) { - // we enter this method with readOffset = index of the last processed base on the read - // (-1 if we did not process a single base yet); this can be last matching base, - // or last base of an insertion - if (currentElement == null || (offsetIntoCurrentCigarElement + 1) >= currentElement.getLength()) { - currentCigarElementOffset++; - if (currentCigarElementOffset < nCigarElements) { - currentElement = cigar.getCigarElement(currentCigarElementOffset); - offsetIntoCurrentCigarElement = -1; - // next line: guards against cigar elements of length 0; when new cigar element is retrieved, - // we reenter in order to re-check offsetIntoCurrentCigarElement against currentElement's length - continue; - } else { - if (currentElement != null && currentElement.getOperator() == CigarOperator.D) - throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); - - // we're done, so set the offset of the cigar to 0 for cleanliness, as well as the current element - offsetIntoCurrentCigarElement = 0; - readOffset = read.getReadLength(); - currentElement = null; - - // Reads that contain indels model the genomeOffset as the following base in the reference. Because - // we fall into this else block only when indels end the read, increment genomeOffset such that the - // current offset of this read is the next ref base after the end of the indel. 
This position will - // model a point on the reference somewhere after the end of the read. - genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here: - - // we do step forward on the ref, and by returning null we also indicate that we are past the read end. - return null; - } - } - - offsetIntoCurrentCigarElement++; - boolean done = false; - switch (currentElement.getOperator()) { - case H: // ignore hard clips - case P: // ignore pads - offsetIntoCurrentCigarElement = currentElement.getLength(); - break; - case I: // insertion w.r.t. the reference - case S: // soft clip - offsetIntoCurrentCigarElement = currentElement.getLength(); - readOffset += currentElement.getLength(); - break; - case D: // deletion w.r.t. the reference - if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string - throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. 
If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); - // should be the same as N case - genomeOffset++; - done = true; - break; - case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) - genomeOffset++; - done = true; - break; - case M: - case EQ: - case X: - readOffset++; - genomeOffset++; - done = true; - break; - default: - throw new IllegalStateException("Case statement didn't deal with cigar op: " + currentElement.getOperator()); - } - - if ( done ) - return currentElement.getOperator(); - } - } - - /** - * Create a new PileupElement based on the current state of this element - * - * Must not be a left or right edge - * - * @return a pileup element - */ - @Ensures("result != null") - public final PileupElement makePileupElement() { - if ( isLeftEdge() || isRightEdge() ) - throw new IllegalStateException("Cannot make a pileup element from an edge alignment state"); - return new PileupElement(read, - getReadOffset(), - getCurrentCigarElement(), - getCurrentCigarElementOffset(), - getOffsetIntoCurrentCigarElement()); - } -} - diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java deleted file mode 100644 index b83a15d6d..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java +++ /dev/null @@ -1,236 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above 
copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.pairhmm; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; - -import java.util.Arrays; - -import static java.lang.Math.log10; - -/** - * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. - * - * User: rpoplin, carneiro - * Date: 3/1/12 - */ -public class Log10PairHMM extends N2MemoryPairHMM { - /** - * Should we use exact log10 calculation (true), or an approximation (false)? 
- */ - private final boolean doExactLog10; - - protected static final int matchToMatch = 0; - protected static final int indelToMatch = 1; - protected static final int matchToInsertion = 2; - protected static final int insertionToInsertion = 3; - protected static final int matchToDeletion = 4; - protected static final int deletionToDeletion = 5; - - // we divide e by 3 because the observed base could have come from any of the non-observed alleles - protected final static double log10_3 = log10(3.0); - - /** - * Create an uninitialized PairHMM - * - * @param doExactLog10 should the log10 calculations be exact (slow) or approximate (faster) - */ - public Log10PairHMM(final boolean doExactLog10) { - this.doExactLog10 = doExactLog10; - } - - /** - * Is this HMM using exact log10 calculations? - * @return true if exact, false if approximate - */ - public boolean isDoingExactLog10Calculations() { - return doExactLog10; - } - - /** - * {@inheritDoc} - */ - @Override - public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { - super.initialize(readMaxLength, haplotypeMaxLength); - - for( int iii=0; iii < paddedMaxReadLength; iii++ ) { - Arrays.fill(matchMatrix[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY); - } - } - - /** - * {@inheritDoc} - */ - @Override - public double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, - final byte[] readBases, - final byte[] readQuals, - final byte[] insertionGOP, - final byte[] deletionGOP, - final byte[] overallGCP, - final int hapStartIndex, - final boolean recacheReadValues, - final int nextHapStartIndex) { - - - if ( ! 
constantsAreInitialized || recacheReadValues ) - initializeProbabilities(insertionGOP, deletionGOP, overallGCP); - initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); - if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) { - // set the initial value (free deletions in the beginning) for the first row in the deletion matrix - initializeMatrixValues(haplotypeBases); - } - - for (int i = 1; i < paddedReadLength; i++) { - // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based - for (int j = hapStartIndex+1; j < paddedHaplotypeLength; j++) { - updateCell(i, j, prior[i][j], transition[i]); - } - } - - // final probability is the log10 sum of the last element in the Match and Insertion state arrays - // this way we ignore all paths that ended in deletions! (huge) - // but we have to sum all the paths ending in the M and I matrices, because they're no longer extended. - double finalSumProbabilities = finalLikelihoodCalculation(); - - return finalSumProbabilities; - } - - protected void initializeMatrixValues(final byte[] haplotypeBases) { - final double initialValue = Math.log10(1.0 / haplotypeBases.length); - for( int j = 0; j < paddedHaplotypeLength; j++ ) { - deletionMatrix[0][j] = initialValue; - } - } - - protected double finalLikelihoodCalculation() { - final int endI = paddedReadLength - 1; - double finalSumProbabilities = myLog10SumLog10(new double[]{matchMatrix[endI][1], insertionMatrix[endI][1]}); - for (int j = 2; j < paddedHaplotypeLength; j++) - finalSumProbabilities = myLog10SumLog10(new double[]{finalSumProbabilities, matchMatrix[endI][j], insertionMatrix[endI][j]}); - return finalSumProbabilities; - } - - - /** - * Initializes the matrix that holds all the constants related to the editing - * distance between the read and the haplotype. 
- * - * @param haplotypeBases the bases of the haplotype - * @param readBases the bases of the read - * @param readQuals the base quality scores of the read - * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) - */ - public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { - - // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases - // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. - - for (int i = 0; i < readBases.length; i++) { - final byte x = readBases[i]; - final byte qual = readQuals[i]; - for (int j = startIndex; j < haplotypeBases.length; j++) { - final byte y = haplotypeBases[j]; - prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? - QualityUtils.qualToProbLog10(qual) : (QualityUtils.qualToErrorProbLog10(qual) - (doNotUseTristateCorrection ? 0.0 : log10_3)) ); - } - } - } - - /** - * Initializes the matrix that holds all the constants related to quality scores. 
- * - * @param insertionGOP insertion quality scores of the read - * @param deletionGOP deletion quality scores of the read - * @param overallGCP overall gap continuation penalty - */ - @Requires({ - "insertionGOP != null", - "deletionGOP != null", - "overallGCP != null" - }) - @Ensures("constantsAreInitialized") - protected void initializeProbabilities(final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { - for (int i = 0; i < insertionGOP.length; i++) { - final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); - transition[i+1][matchToMatch] = QualityUtils.qualToProbLog10((byte) qualIndexGOP); - transition[i+1][indelToMatch] = QualityUtils.qualToProbLog10(overallGCP[i]); - transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]); - transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); - transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]); - transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); - } - - // note that we initialized the constants - constantsAreInitialized = true; - } - - - /** - * Compute the log10SumLog10 of the values - * - * NOTE NOTE NOTE - * - * Log10PairHMM depends critically on this function tolerating values that are all -Infinity - * and the sum returning -Infinity. Note good. Needs to be fixed. - * - * NOTE NOTE NOTE - * - * @param values an array of log10 probabilities that need to be summed - * @return the log10 of the sum of the probabilities - */ - @Requires("values != null") - protected double myLog10SumLog10(final double[] values) { - return doExactLog10 ? 
MathUtils.log10sumLog10(values) : MathUtils.approximateLog10SumLog10(values); - } - - /** - * Updates a cell in the HMM matrix - * - * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the - * initial conditions - - * @param indI row index in the matrices to update - * @param indJ column index in the matrices to update - * @param prior the likelihood editing distance matrix for the read x haplotype - * @param transition an array with the six transition relevant to this location - */ - protected void updateCell( final int indI, final int indJ, final double prior, final double[] transition) { - - matchMatrix[indI][indJ] = prior + - myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ - 1] + transition[matchToMatch], - insertionMatrix[indI - 1][indJ - 1] + transition[indelToMatch], - deletionMatrix[indI - 1][indJ - 1] + transition[indelToMatch]}); - insertionMatrix[indI][indJ] = myLog10SumLog10(new double[] {matchMatrix[indI - 1][indJ] + transition[matchToInsertion], insertionMatrix[indI - 1][indJ] + transition[insertionToInsertion]}); - deletionMatrix[indI][indJ] = myLog10SumLog10(new double[] {matchMatrix[indI][indJ - 1] + transition[matchToDeletion], deletionMatrix[indI][indJ - 1] + transition[deletionToDeletion]}); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java deleted file mode 100644 index 18cb9054b..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java +++ /dev/null @@ -1,97 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or 
sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.pairhmm; - -import com.google.java.contract.Requires; - -/** - * Superclass for PairHMM that want to use a full read x haplotype matrix for their match, insertion, and deletion matrix - * - * User: rpoplin - * Date: 10/16/12 - */ -abstract class N2MemoryPairHMM extends PairHMM { - protected double[][] transition = null; // The transition probabilities cache - protected double[][] prior = null; // The prior probabilities cache - protected double[][] matchMatrix = null; - protected double[][] insertionMatrix = null; - protected double[][] deletionMatrix = null; - - // only used for debugging purposes - protected boolean doNotUseTristateCorrection = false; - - public void doNotUseTristateCorrection() { - doNotUseTristateCorrection = true; - } - - /** - * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths - * - * Note: Do not worry about padding, just provide the true max length of the read and haplotype. The HMM will take care of the padding. 
- * - * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM - * @param readMaxLength the max length of reads we want to use with this PairHMM - */ - public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { - super.initialize(readMaxLength, haplotypeMaxLength); - - matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; - insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; - deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; - - transition = new double[paddedMaxReadLength][6]; - prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; - } - - /** - * Print out the core hmm matrices for debugging - */ - protected void dumpMatrices() { - dumpMatrix("matchMetricArray", matchMatrix); - dumpMatrix("insertionMatrix", insertionMatrix); - dumpMatrix("deletionMatrix", deletionMatrix); - } - - /** - * Print out in a human readable form the matrix for debugging - * @param name the name of this matrix - * @param matrix the matrix of values - */ - @Requires({"name != null", "matrix != null"}) - private void dumpMatrix(final String name, final double[][] matrix) { - System.out.printf("%s%n", name); - for ( int i = 0; i < matrix.length; i++) { - System.out.printf("\t%s[%d]", name, i); - for ( int j = 0; j < matrix[i].length; j++ ) { - if ( Double.isInfinite(matrix[i][j]) ) - System.out.printf(" %15s", String.format("%f", matrix[i][j])); - else - System.out.printf(" % 15.5e", matrix[i][j]); - } - System.out.println(); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java deleted file mode 100644 index ff883c5ae..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java +++ /dev/null @@ -1,271 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* 
obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.pairhmm; - -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.variantcontext.Allele; - -import java.util.Arrays; -import java.util.List; -import java.util.Map; -/** - * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. 
- * - * User: rpoplin - * Date: 10/16/12 - */ -public abstract class PairHMM { - protected final static Logger logger = Logger.getLogger(PairHMM.class); - - protected boolean constantsAreInitialized = false; - - protected byte[] previousHaplotypeBases; - protected int hapStartIndex; - - public enum HMM_IMPLEMENTATION { - /* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */ - EXACT, - /* PairHMM as implemented for the UnifiedGenotyper. Uses log10 sum functions accurate to only 1E-4 */ - ORIGINAL, - /* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */ - LOGLESS_CACHING, - /* Logless caching PairHMM that stores computations in 1D arrays instead of matrices, and which proceeds diagonally over the (read x haplotype) intersection matrix */ - ARRAY_LOGLESS - } - - protected int maxHaplotypeLength, maxReadLength; - protected int paddedMaxReadLength, paddedMaxHaplotypeLength; - protected int paddedReadLength, paddedHaplotypeLength; - protected boolean initialized = false; - - // only used for debugging purposes - protected boolean doNotUseTristateCorrection = false; - protected void doNotUseTristateCorrection() { doNotUseTristateCorrection = true; } - - /** - * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths - * - * Note: Do not worry about padding, just provide the true max length of the read and haplotype. The HMM will take care of the padding. 
- * - * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM - * @param readMaxLength the max length of reads we want to use with this PairHMM - */ - public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { - if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength); - if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength); - - maxHaplotypeLength = haplotypeMaxLength; - maxReadLength = readMaxLength; - - // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - paddedMaxReadLength = readMaxLength + 1; - paddedMaxHaplotypeLength = haplotypeMaxLength + 1; - - previousHaplotypeBases = null; - - constantsAreInitialized = false; - initialized = true; - } - - protected int findMaxReadLength(final List reads) { - int listMaxReadLength = 0; - for(GATKSAMRecord read : reads){ - final int readLength = read.getReadLength(); - if( readLength > listMaxReadLength ) { listMaxReadLength = readLength; } - } - return listMaxReadLength; - } - - protected int findMaxHaplotypeLength(final Map haplotypeMap) { - int listMaxHaplotypeLength = 0; - for( final Allele a: haplotypeMap.keySet() ) { - final Haplotype h = haplotypeMap.get(a); - final int haplotypeLength = h.getBases().length; - if( haplotypeLength > listMaxHaplotypeLength ) { listMaxHaplotypeLength = haplotypeLength; } - } - return listMaxHaplotypeLength; - } - - /** - * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from - * each haplotype given base substitution, insertion, and deletion probabilities. 
- * - * @param reads the list of reads - * @param alleleHaplotypeMap the list of haplotypes - * @param GCPArrayMap Each read is associated with an array containing the gap continuation penalties for use in the model. Length of each GCP-array must match that of its read. - * @return a PerReadAlleleLikelihoodMap containing each read, haplotype-allele, and the log10 probability of - * said read coming from the said haplotype under the provided error model - */ - public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap) { - - // (re)initialize the pairHMM only if necessary - final int readMaxLength = findMaxReadLength(reads); - final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); - if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); } - - final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); - for(GATKSAMRecord read : reads){ - final byte[] readBases = read.getReadBases(); - final byte[] readQuals = read.getBaseQualities(); - final byte[] readInsQuals = read.getBaseInsertionQualities(); - final byte[] readDelQuals = read.getBaseDeletionQualities(); - final byte[] overallGCP = GCPArrayMap.get(read); - - // peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) - byte[] currentHaplotypeBases = null; - boolean isFirstHaplotype = true; - Allele currentAllele = null; - double log10l; - for (final Allele allele : alleleHaplotypeMap.keySet()){ - final Haplotype haplotype = alleleHaplotypeMap.get(allele); - final byte[] nextHaplotypeBases = haplotype.getBases(); - if (currentHaplotypeBases != null) { - log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, - readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases); - 
likelihoodMap.add(read, currentAllele, log10l); - } - // update the current haplotype - currentHaplotypeBases = nextHaplotypeBases; - currentAllele = allele; - } - // process the final haplotype - if (currentHaplotypeBases != null) { - - // there is no next haplotype, so pass null for nextHaplotypeBases. - log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, - readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null); - likelihoodMap.add(read, currentAllele, log10l); - } - } - return likelihoodMap; - } - - /** - * Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion - * probabilities. - * - * Note on using hapStartIndex. This allows you to compute the exact true likelihood of a full haplotypes - * given a read, assuming that the previous calculation read over a full haplotype, recaching the read values, - * starting only at the place where the new haplotype bases and the previous haplotype bases different. This - * index is 0-based, and can be computed with findFirstPositionWhereHaplotypesDiffer given the two haplotypes. - * Note that this assumes that the read and all associated quals values are the same. - * - * @param haplotypeBases the full sequence (in standard SAM encoding) of the haplotype, must be >= than read bases in length - * @param readBases the bases (in standard encoding) of the read, must be <= haplotype bases in length - * @param readQuals the phred-scaled per base substitution quality scores of read. Must be the same length as readBases - * @param insertionGOP the phred-scaled per base insertion quality scores of read. Must be the same length as readBases - * @param deletionGOP the phred-scaled per base deletion quality scores of read. Must be the same length as readBases - * @param overallGCP the phred-scaled gap continuation penalties scores of read. 
Must be the same length as readBases - * @param recacheReadValues if false, we don't recalculate any cached results, assuming that readBases and its associated - * parameters are the same, and only the haplotype bases are changing underneath us - * @return the log10 probability of read coming from the haplotype under the provided error model - */ - protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, - final byte[] readBases, - final byte[] readQuals, - final byte[] insertionGOP, - final byte[] deletionGOP, - final byte[] overallGCP, - final boolean recacheReadValues, - final byte[] nextHaploytpeBases) { - - if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10"); - if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null"); - if ( haplotypeBases.length > maxHaplotypeLength ) throw new IllegalArgumentException("Haplotype bases is too long, got " + haplotypeBases.length + " but max is " + maxHaplotypeLength); - if ( readBases == null ) throw new IllegalArgumentException("readBases cannot be null"); - if ( readBases.length > maxReadLength ) throw new IllegalArgumentException("readBases is too long, got " + readBases.length + " but max is " + maxReadLength); - if ( readQuals.length != readBases.length ) throw new IllegalArgumentException("Read bases and read quals aren't the same size: " + readBases.length + " vs " + readQuals.length); - if ( insertionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read insertion quals aren't the same size: " + readBases.length + " vs " + insertionGOP.length); - if ( deletionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read deletion quals aren't the same size: " + readBases.length + " vs " + deletionGOP.length); - if ( overallGCP.length != readBases.length ) throw new IllegalArgumentException("Read bases 
and overall GCP aren't the same size: " + readBases.length + " vs " + overallGCP.length); - - paddedReadLength = readBases.length + 1; - paddedHaplotypeLength = haplotypeBases.length + 1; - - hapStartIndex = (recacheReadValues) ? 0 : hapStartIndex; - - // Pre-compute the difference between the current haplotype and the next one to be run - // Looking ahead is necessary for the ArrayLoglessPairHMM implementation - final int nextHapStartIndex = (nextHaploytpeBases == null || haplotypeBases.length != nextHaploytpeBases.length) ? 0 : findFirstPositionWhereHaplotypesDiffer(haplotypeBases, nextHaploytpeBases); - - double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues, nextHapStartIndex); - - if ( ! MathUtils.goodLog10Probability(result) ) - throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result)); - - // Warning: Careful if using the PairHMM in parallel! (this update has to be taken care of). - // Warning: This assumes no downstream modification of the haplotype bases (saves us from copying the array). It is okay for the haplotype caller and the Unified Genotyper. - previousHaplotypeBases = haplotypeBases; - - // For the next iteration, the hapStartIndex for the next haploytpe becomes the index for the current haplotype - // The array implementation has to look ahead to the next haplotype to store caching info. It cannot do this if nextHapStart is before hapStart - hapStartIndex = (nextHapStartIndex < hapStartIndex) ? 
0: nextHapStartIndex; - - return result; - } - - /** - * To be overloaded by subclasses to actually do calculation for #computeReadLikelihoodGivenHaplotypeLog10 - */ - @Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length", - "readBases.length == overallGCP.length", "matchMatrix!=null", "insertionMatrix!=null", "deletionMatrix!=null"}) - protected abstract double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, - final byte[] readBases, - final byte[] readQuals, - final byte[] insertionGOP, - final byte[] deletionGOP, - final byte[] overallGCP, - final int hapStartIndex, - final boolean recacheReadValues, - final int nextHapStartIndex); - - /** - * Compute the first position at which two haplotypes differ - * - * If the haplotypes are exact copies of each other, returns the min length of the two haplotypes. - * - * @param haplotype1 the first haplotype1 - * @param haplotype2 the second haplotype1 - * @return the index of the first position in haplotype1 and haplotype2 where the byte isn't the same - */ - public static int findFirstPositionWhereHaplotypesDiffer(final byte[] haplotype1, final byte[] haplotype2) { - if ( haplotype1 == null || haplotype1.length == 0 ) throw new IllegalArgumentException("Haplotype1 is bad " + Arrays.toString(haplotype1)); - if ( haplotype2 == null || haplotype2.length == 0 ) throw new IllegalArgumentException("Haplotype2 is bad " + Arrays.toString(haplotype2)); - - for( int iii = 0; iii < haplotype1.length && iii < haplotype2.length; iii++ ) { - if( haplotype1[iii] != haplotype2[iii] ) { - return iii; - } - } - - return Math.min(haplotype1.length, haplotype2.length); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java deleted file mode 100644 index 8a034dde0..000000000 --- 
a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ /dev/null @@ -1,578 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.pileup; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.Arrays; -import java.util.EnumSet; -import java.util.LinkedList; -import java.util.List; - -/** - * Created by IntelliJ IDEA. 
- * User: depristo - * Date: Apr 14, 2009 - * Time: 8:54:05 AM - */ -public class PileupElement implements Comparable { - private final static LinkedList EMPTY_LINKED_LIST = new LinkedList<>(); - - private final static EnumSet ON_GENOME_OPERATORS = - EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.D); - - public static final byte DELETION_BASE = BaseUtils.Base.D.base; - public static final byte DELETION_QUAL = (byte) 16; - public static final byte A_FOLLOWED_BY_INSERTION_BASE = (byte) 87; - public static final byte C_FOLLOWED_BY_INSERTION_BASE = (byte) 88; - public static final byte T_FOLLOWED_BY_INSERTION_BASE = (byte) 89; - public static final byte G_FOLLOWED_BY_INSERTION_BASE = (byte) 90; - - protected final GATKSAMRecord read; // the read this base belongs to - protected final int offset; // the offset in the bases array for this base - - private final CigarElement currentCigarElement; - private final int currentCigarOffset; - private final int offsetInCurrentCigar; - - /** - * Create a new pileup element - * - * @param read a non-null read to pileup - * @param baseOffset the offset into the read's base / qual vector aligned to this position on the genome. If the - * current cigar element is a deletion, offset should be the offset of the last M/=/X position. - * @param currentElement a non-null CigarElement that indicates the cigar element aligning the read to the genome - * @param currentCigarOffset the offset of currentElement in read.getCigar().getElement(currentCigarOffset) == currentElement) - * @param offsetInCurrentCigar how far into the currentElement are we in our alignment to the genome? 
- */ - public PileupElement(final GATKSAMRecord read, final int baseOffset, - final CigarElement currentElement, final int currentCigarOffset, - final int offsetInCurrentCigar) { - assert currentElement != null; - - this.read = read; - this.offset = baseOffset; - this.currentCigarElement = currentElement; - this.currentCigarOffset = currentCigarOffset; - this.offsetInCurrentCigar = offsetInCurrentCigar; - - // for performance regions these are assertions - assert this.read != null; - assert this.offset >= 0 && this.offset < this.read.getReadLength(); - assert this.currentCigarOffset >= 0; - assert this.currentCigarOffset < read.getCigarLength(); - assert this.offsetInCurrentCigar >= 0; - assert this.offsetInCurrentCigar < currentElement.getLength(); - } - - /** - * Create a new PileupElement that's a copy of toCopy - * @param toCopy the element we want to copy - */ - public PileupElement(final PileupElement toCopy) { - this(toCopy.read, toCopy.offset, toCopy.currentCigarElement, toCopy.currentCigarOffset, toCopy.offsetInCurrentCigar); - } - - /** - * Is this element a deletion w.r.t. the reference genome? - * - * @return true if this is a deletion, false otherwise - */ - public boolean isDeletion() { - return currentCigarElement.getOperator() == CigarOperator.D; - } - - /** - * Is the current element immediately before a deletion, but itself not a deletion? - * - * Suppose we are aligning a read with cigar 3M2D1M. This function is true - * if we are in the last cigar position of the 3M, but not if we are in the 2D itself. - * - * @return true if the next alignment position is a deletion w.r.t. the reference genome - */ - public boolean isBeforeDeletionStart() { - return ! isDeletion() && atEndOfCurrentCigar() && hasOperator(getNextOnGenomeCigarElement(), CigarOperator.D); - } - - /** - * Is the current element immediately after a deletion, but itself not a deletion? - * - * Suppose we are aligning a read with cigar 1M2D3M. 
This function is true - * if we are in the first cigar position of the 3M, but not if we are in the 2D itself or - * in any but the first position of the 3M. - * - * @return true if the previous alignment position is a deletion w.r.t. the reference genome - */ - public boolean isAfterDeletionEnd() { - return ! isDeletion() && atStartOfCurrentCigar() && hasOperator(getPreviousOnGenomeCigarElement(), CigarOperator.D); - } - - /** - * Get the read for this pileup element - * @return a non-null GATKSAMRecord - */ - @Ensures("result != null") - public GATKSAMRecord getRead() { - return read; - } - - /** - * Get the offset of the this element into the read that aligns that read's base to this genomic position. - * - * If the current element is a deletion then offset is the offset of the last base containing offset. - * - * @return a valid offset into the read's bases - */ - @Ensures({"result >= 0", "result <= read.getReadLength()"}) - public int getOffset() { - return offset; - } - - /** - * Get the base aligned to the genome at this location - * - * If the current element is a deletion returns DELETION_BASE - * - * @return a base encoded as a byte - */ - @Ensures("result != DELETION_BASE || (isDeletion() && result == DELETION_BASE)") - public byte getBase() { - return isDeletion() ? DELETION_BASE : read.getReadBases()[offset]; - } - - @Deprecated - public int getBaseIndex() { - return BaseUtils.simpleBaseToBaseIndex(getBase()); - } - - /** - * Get the base quality score of the base at this aligned position on the genome - * @return a phred-scaled quality score as a byte - */ - public byte getQual() { - return isDeletion() ? DELETION_QUAL : read.getBaseQualities()[offset]; - } - - /** - * Get the Base Insertion quality at this pileup position - * @return a phred-scaled quality score as a byte - */ - public byte getBaseInsertionQual() { - return isDeletion() ? 
DELETION_QUAL : read.getBaseInsertionQualities()[offset]; - } - - /** - * Get the Base Deletion quality at this pileup position - * @return a phred-scaled quality score as a byte - */ - public byte getBaseDeletionQual() { - return isDeletion() ? DELETION_QUAL : read.getBaseDeletionQualities()[offset]; - } - - /** - * Get the length of an immediately following insertion or deletion event, or 0 if no such event exists - * - * Only returns a positive value when this pileup element is immediately before an indel. Being - * immediately before a deletion means that this pileup element isn't an deletion, and that the - * next genomic alignment for this read is a deletion. For the insertion case, this means - * that an insertion cigar occurs immediately after this element, between this one and the - * next genomic position. - * - * Note this function may be expensive, so multiple uses should be cached by the caller - * - * @return length of the event (number of inserted or deleted bases), or 0 - */ - @Ensures("result >= 0") - public int getLengthOfImmediatelyFollowingIndel() { - final CigarElement element = getNextIndelCigarElement(); - return element == null ? 0 : element.getLength(); - } - - /** - * Helpful function to get the immediately following cigar element, for an insertion or deletion - * - * if this state precedes a deletion (i.e., next position on genome) or insertion (immediately between - * this and the next position) returns the CigarElement corresponding to this event. Otherwise returns - * null. - * - * @return a CigarElement, or null if the next alignment state ins't an insertion or deletion. 
- */ - private CigarElement getNextIndelCigarElement() { - if ( isBeforeDeletionStart() ) { - final CigarElement element = getNextOnGenomeCigarElement(); - if ( element == null || element.getOperator() != CigarOperator.D ) - throw new IllegalStateException("Immediately before deletion but the next cigar element isn't a deletion " + element); - return element; - } else if ( isBeforeInsertion() ) { - final CigarElement element = getBetweenNextPosition().get(0); - if ( element.getOperator() != CigarOperator.I ) - throw new IllegalStateException("Immediately before insertion but the next cigar element isn't an insertion " + element); - return element; - } else { - return null; - } - } - - /** - * Get the bases for an insertion that immediately follows this alignment state, or null if none exists - * - * @see #getLengthOfImmediatelyFollowingIndel() for details on the meaning of immediately. - * - * If the immediately following state isn't an insertion, returns null - * - * @return actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read. 
- */ - @Ensures("result == null || result.length() == getLengthOfImmediatelyFollowingIndel()") - public String getBasesOfImmediatelyFollowingInsertion() { - final CigarElement element = getNextIndelCigarElement(); - if ( element != null && element.getOperator() == CigarOperator.I ) { - final int getFrom = offset + 1; - final byte[] bases = Arrays.copyOfRange(read.getReadBases(), getFrom, getFrom + element.getLength()); - return new String(bases); - } else - return null; - } - - /** - * Get the mapping quality of the read of this element - * @return the mapping quality of the underlying SAM record - */ - public int getMappingQual() { - return read.getMappingQuality(); - } - - @Ensures("result != null") - public String toString() { - return String.format("%s @ %d = %c Q%d", getRead().getReadName(), getOffset(), (char) getBase(), getQual()); - } - - @Override - public int compareTo(final PileupElement pileupElement) { - if (offset < pileupElement.offset) - return -1; - else if (offset > pileupElement.offset) - return 1; - else if (read.getAlignmentStart() < pileupElement.read.getAlignmentStart()) - return -1; - else if (read.getAlignmentStart() > pileupElement.read.getAlignmentStart()) - return 1; - else - return 0; - } - - // -------------------------------------------------------------------------- - // - // Reduced read accessors - // - // -------------------------------------------------------------------------- - - /** - * Returns the number of elements in the pileup element. - * - * Unless this is a reduced read, the number of elements in a pileup element is one. In the event of - * this being a reduced read and a deletion, we return the average number of elements between the left - * and right elements to the deletion. We assume the deletion to be left aligned. 
- * - * @return the representative count - */ - public int getRepresentativeCount() { - if (read.isReducedRead()) { - if (isDeletion() && (offset + 1 >= read.getReadLength()) ) // deletion in the end of the read - throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString())); - - return isDeletion() - ? MathUtils.fastRound((read.getReducedCount(offset) + read.getReducedCount(offset + 1)) / 2.0) - : read.getReducedCount(offset); - } else { - return 1; - } - } - - /** - * Adjusts the representative count of this pileup element. - * Throws an exception if this element does not represent a reduced read. - * - * See GATKSAMRecord.adjustReducedCount() for warnings on the permanency of this operation. - * - * @param adjustmentFactor how much to adjust the representative count (can be positive or negative) - */ - public void adjustRepresentativeCount(final int adjustmentFactor) { - if ( read.isReducedRead() ) - read.adjustReducedCount(offset, adjustmentFactor); - else - throw new IllegalArgumentException("Trying to adjust the representative count of a read that is not reduced"); - } - - /** - * Get the cigar element aligning this element to the genome - * @return a non-null CigarElement - */ - @Ensures("result != null") - public CigarElement getCurrentCigarElement() { - return currentCigarElement; - } - - /** - * Get the offset of this cigar element in the Cigar of the current read (0-based) - * - * Suppose the cigar is 1M2D3I4D. If we are in the 1M state this function returns - * 0. If we are in 2D, the result is 1. If we are in the 4D, the result is 3. 
- * - * @return an offset into the read.getCigar() that brings us to the current cigar element - */ - public int getCurrentCigarOffset() { - return currentCigarOffset; - } - - /** - * Get the offset into the *current* cigar element for this alignment position - * - * We can be anywhere from offset 0 (first position) to length - 1 of the current - * cigar element aligning us to this genomic position. - * - * @return a valid offset into the current cigar element - */ - @Ensures({"result >= 0", "result < getCurrentCigarElement().getLength()"}) - public int getOffsetInCurrentCigar() { - return offsetInCurrentCigar; - } - - /** - * Get the cigar elements that occur before the current position but after the previous position on the genome - * - * For example, if we are in the 3M state of 1M2I3M state then 2I occurs before this position. - * - * Note that this function does not care where we are in the current cigar element. In the previous - * example this list of elements contains the 2I state regardless of where you are in the 3M. - * - * Note this returns the list of all elements that occur between this and the prev site, so for - * example we might have 5S10I2M and this function would return [5S, 10I]. - * - * @return a non-null list of CigarElements - */ - @Ensures("result != null") - public LinkedList getBetweenPrevPosition() { - return atStartOfCurrentCigar() ? getBetween(Direction.PREV) : EMPTY_LINKED_LIST; - } - - /** - * Get the cigar elements that occur after the current position but before the next position on the genome - * - * @see #getBetweenPrevPosition() for more details - * - * @return a non-null list of CigarElements - */ - @Ensures("result != null") - public LinkedList getBetweenNextPosition() { - return atEndOfCurrentCigar() ? 
getBetween(Direction.NEXT) : EMPTY_LINKED_LIST; - } - - /** for some helper functions */ - private enum Direction { PREV, NEXT } - - /** - * Helper function to get cigar elements between this and either the prev or next genomic position - * - * @param direction PREVIOUS if we want before, NEXT if we want after - * @return a non-null list of cigar elements between this and the neighboring position in direction - */ - @Ensures("result != null") - private LinkedList getBetween(final Direction direction) { - final int increment = direction == Direction.NEXT ? 1 : -1; - LinkedList elements = null; - final int nCigarElements = read.getCigarLength(); - for ( int i = currentCigarOffset + increment; i >= 0 && i < nCigarElements; i += increment) { - final CigarElement elt = read.getCigar().getCigarElement(i); - if ( ON_GENOME_OPERATORS.contains(elt.getOperator()) ) - break; - else { - // optimization: don't allocate list if not necessary - if ( elements == null ) - elements = new LinkedList(); - - if ( increment > 0 ) - // to keep the list in the right order, if we are incrementing positively add to the end - elements.add(elt); - else - // counting down => add to front - elements.addFirst(elt); - } - } - - // optimization: elements is null because nothing got added, just return the empty list - return elements == null ? EMPTY_LINKED_LIST : elements; - } - - /** - * Get the cigar element of the previous genomic aligned position - * - * For example, we might have 1M2I3M, and be sitting at the someone in the 3M. This - * function would return 1M, as the 2I isn't on the genome. Note this function skips - * all of the positions that would occur in the current element. So the result - * is always 1M regardless of whether we're in the first, second, or third position of the 3M - * cigar. 
- * - * @return a CigarElement, or null (indicating that no previous element exists) - */ - @Ensures("result == null || ON_GENOME_OPERATORS.contains(result.getOperator())") - public CigarElement getPreviousOnGenomeCigarElement() { - return getNeighboringOnGenomeCigarElement(Direction.PREV); - } - - /** - * Get the cigar element of the next genomic aligned position - * - * @see #getPreviousOnGenomeCigarElement() for more details - * - * @return a CigarElement, or null (indicating that no next element exists) - */ - @Ensures("result == null || ON_GENOME_OPERATORS.contains(result.getOperator())") - public CigarElement getNextOnGenomeCigarElement() { - return getNeighboringOnGenomeCigarElement(Direction.NEXT); - } - - /** - * Helper function to get the cigar element of the next or previous genomic position - * @param direction the direction to look in - * @return a CigarElement, or null if no such element exists - */ - @Ensures("result == null || ON_GENOME_OPERATORS.contains(result.getOperator())") - private CigarElement getNeighboringOnGenomeCigarElement(final Direction direction) { - final int increment = direction == Direction.NEXT ? 1 : -1; - final int nCigarElements = read.getCigarLength(); - - for ( int i = currentCigarOffset + increment; i >= 0 && i < nCigarElements; i += increment) { - final CigarElement elt = read.getCigar().getCigarElement(i); - if ( ON_GENOME_OPERATORS.contains(elt.getOperator()) ) - return elt; - } - - // getting here means that you didn't find anything - return null; - } - - /** - * Does the cigar element (which may be null) have operation toMatch? 
- * - * @param maybeCigarElement a CigarElement that might be null - * @param toMatch a CigarOperator we want to match against the one in maybeCigarElement - * @return true if maybeCigarElement isn't null and has operator toMatch - */ - @Requires("toMatch != null") - private boolean hasOperator(final CigarElement maybeCigarElement, final CigarOperator toMatch) { - return maybeCigarElement != null && maybeCigarElement.getOperator() == toMatch; - } - - /** - * Does an insertion occur immediately before the current position on the genome? - * - * @return true if yes, false if no - */ - public boolean isAfterInsertion() { return isAfter(getBetweenPrevPosition(), CigarOperator.I); } - - /** - * Does an insertion occur immediately after the current position on the genome? - * - * @return true if yes, false if no - */ - public boolean isBeforeInsertion() { return isBefore(getBetweenNextPosition(), CigarOperator.I); } - - /** - * Does a soft-clipping event occur immediately before the current position on the genome? - * - * @return true if yes, false if no - */ - public boolean isAfterSoftClip() { return isAfter(getBetweenPrevPosition(), CigarOperator.S); } - - /** - * Does a soft-clipping event occur immediately after the current position on the genome? - * - * @return true if yes, false if no - */ - public boolean isBeforeSoftClip() { return isBefore(getBetweenNextPosition(), CigarOperator.S); } - - /** - * Does a soft-clipping event occur immediately before or after the current position on the genome? - * - * @return true if yes, false if no - */ - public boolean isNextToSoftClip() { return isAfterSoftClip() || isBeforeSoftClip(); } - - /** - * Is the current position at the end of the current cigar? - * - * For example, if we are in element 3M, this function returns true if we are at offsetInCurrentCigar - * of 2, but not 0 or 1. 
- * - * @return true if we're at the end of the current cigar - */ - public boolean atEndOfCurrentCigar() { - return offsetInCurrentCigar == currentCigarElement.getLength() - 1; - } - - /** - * Is the current position at the start of the current cigar? - * - * For example, if we are in element 3M, this function returns true if we are at offsetInCurrentCigar - * of 0, but not 1 or 2. - * - * @return true if we're at the start of the current cigar - */ - public boolean atStartOfCurrentCigar() { - return offsetInCurrentCigar == 0; - } - - /** - * Is op the last element in the list of elements? - * - * @param elements the elements to examine - * @param op the op we want the last element's op to equal - * @return true if op == last(elements).op - */ - @Requires({"elements != null", "op != null"}) - private boolean isAfter(final LinkedList elements, final CigarOperator op) { - return ! elements.isEmpty() && elements.peekLast().getOperator() == op; - } - - /** - * Is op the first element in the list of elements? - * - * @param elements the elements to examine - * @param op the op we want the last element's op to equal - * @return true if op == first(elements).op - */ - @Requires({"elements != null", "op != null"}) - private boolean isBefore(final List elements, final CigarOperator op) { - return ! 
elements.isEmpty() && elements.get(0).getOperator() == op; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java deleted file mode 100644 index 455a6aa12..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java +++ /dev/null @@ -1,1043 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils.pileup; - -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.fragments.FragmentCollection; -import org.broadinstitute.sting.utils.fragments.FragmentUtils; -import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.BaseUtils; - -import java.util.*; - -public class ReadBackedPileupImpl implements ReadBackedPileup { - protected final GenomeLoc loc; - protected final PileupElementTracker pileupElementTracker; - - private final static int UNINITIALIZED_CACHED_INT_VALUE = -1; - - /** - * Different then number of elements due to reduced reads - */ - private int depthOfCoverage = UNINITIALIZED_CACHED_INT_VALUE; - private int nDeletions = UNINITIALIZED_CACHED_INT_VALUE; // cached value of the number of deletions - private int nMQ0Reads = UNINITIALIZED_CACHED_INT_VALUE; // cached value of the number of MQ0 reads - - /** - * Create a new version of a read backed pileup at loc, using the reads and their corresponding - * offsets. This pileup will contain a list, in order of the reads, of the piled bases at - * reads[i] for all i in offsets. Does not make a copy of the data, so it's not safe to - * go changing the reads. 
- * - * @param loc The genome loc to associate reads wotj - * @param reads - * @param offsets - */ - public ReadBackedPileupImpl(GenomeLoc loc, List reads, List offsets) { - this.loc = loc; - this.pileupElementTracker = readsOffsets2Pileup(reads, offsets); - } - - - /** - * Create a new version of a read backed pileup at loc without any aligned reads - */ - public ReadBackedPileupImpl(GenomeLoc loc) { - this(loc, new UnifiedPileupElementTracker()); - } - - /** - * Create a new version of a read backed pileup at loc, using the reads and their corresponding - * offsets. This lower level constructure assumes pileup is well-formed and merely keeps a - * pointer to pileup. Don't go changing the data in pileup. - */ - public ReadBackedPileupImpl(GenomeLoc loc, List pileup) { - if (loc == null) throw new ReviewedStingException("Illegal null genomeloc in ReadBackedPileup"); - if (pileup == null) throw new ReviewedStingException("Illegal null pileup in ReadBackedPileup"); - - this.loc = loc; - this.pileupElementTracker = new UnifiedPileupElementTracker(pileup); - } - - /** - * Optimization of above constructor where all of the cached data is provided - * - * @param loc - * @param pileup - */ - @Deprecated - public ReadBackedPileupImpl(GenomeLoc loc, List pileup, int size, int nDeletions, int nMQ0Reads) { - this(loc, pileup); - } - - protected ReadBackedPileupImpl(GenomeLoc loc, PileupElementTracker tracker) { - this.loc = loc; - this.pileupElementTracker = tracker; - } - - public ReadBackedPileupImpl(GenomeLoc loc, Map pileupsBySample) { - this.loc = loc; - PerSamplePileupElementTracker tracker = new PerSamplePileupElementTracker(); - for (Map.Entry pileupEntry : pileupsBySample.entrySet()) { - tracker.addElements(pileupEntry.getKey(), pileupEntry.getValue().pileupElementTracker); - } - this.pileupElementTracker = tracker; - } - - public ReadBackedPileupImpl(GenomeLoc loc, List reads, int offset) { - this.loc = loc; - this.pileupElementTracker = readsOffsets2Pileup(reads, 
offset); - } - - /** - * Helper routine for converting reads and offset lists to a PileupElement list. - * - * @param reads - * @param offsets - * @return - */ - private PileupElementTracker readsOffsets2Pileup(List reads, List offsets) { - if (reads == null) throw new ReviewedStingException("Illegal null read list in UnifiedReadBackedPileup"); - if (offsets == null) throw new ReviewedStingException("Illegal null offsets list in UnifiedReadBackedPileup"); - if (reads.size() != offsets.size()) - throw new ReviewedStingException("Reads and offset lists have different sizes!"); - - UnifiedPileupElementTracker pileup = new UnifiedPileupElementTracker(); - for (int i = 0; i < reads.size(); i++) { - GATKSAMRecord read = reads.get(i); - int offset = offsets.get(i); - pileup.add(createNewPileupElement(read, offset)); // only used to create fake pileups for testing so ancillary information is not important - } - - return pileup; - } - - /** - * Helper routine for converting reads and a single offset to a PileupElement list. 
- * - * @param reads - * @param offset - * @return - */ - private PileupElementTracker readsOffsets2Pileup(List reads, int offset) { - if (reads == null) throw new ReviewedStingException("Illegal null read list in UnifiedReadBackedPileup"); - if (offset < 0) throw new ReviewedStingException("Illegal offset < 0 UnifiedReadBackedPileup"); - - UnifiedPileupElementTracker pileup = new UnifiedPileupElementTracker(); - for (GATKSAMRecord read : reads) { - pileup.add(createNewPileupElement(read, offset)); // only used to create fake pileups for testing so ancillary information is not important - } - - return pileup; - } - - protected ReadBackedPileupImpl createNewPileup(GenomeLoc loc, PileupElementTracker tracker) { - return new ReadBackedPileupImpl(loc, tracker); - } - - protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset) { - return LocusIteratorByState.createPileupForReadAndOffset(read, offset); - } - - // -------------------------------------------------------- - // - // Special 'constructors' - // - // -------------------------------------------------------- - - /** - * Returns a new ReadBackedPileup that is free of deletion spanning reads in this pileup. Note that this - * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy - * of the pileup (just returns this) if there are no deletions in the pileup. 
- * - * @return - */ - @Override - public ReadBackedPileupImpl getPileupWithoutDeletions() { - if (getNumberOfDeletions() > 0) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupWithoutDeletions(); - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - return createNewPileup(loc, filteredTracker); - - } else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - - for (PileupElement p : tracker) { - if (!p.isDeletion()) { - filteredTracker.add(p); - } - } - return createNewPileup(loc, filteredTracker); - } - } else { - return this; - } - } - - /** - * Returns a new ReadBackedPileup where only one read from an overlapping read - * pair is retained. If the two reads in question disagree to their basecall, - * neither read is retained. If they agree on the base, the read with the higher - * base quality observation is retained - * - * @return the newly filtered pileup - */ - @Override - public ReadBackedPileup getOverlappingFragmentFilteredPileup() { - return getOverlappingFragmentFilteredPileup(true, true); - } - - /** - * Returns a new ReadBackedPileup where only one read from an overlapping read - * pair is retained. If discardDiscordant and the two reads in question disagree to their basecall, - * neither read is retained. 
Otherwise, the read with the higher - * quality (base or mapping, depending on baseQualNotMapQual) observation is retained - * - * @return the newly filtered pileup - */ - @Override - public ReadBackedPileupImpl getOverlappingFragmentFilteredPileup(boolean discardDiscordant, boolean baseQualNotMapQual) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(discardDiscordant, baseQualNotMapQual); - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - return createNewPileup(loc, filteredTracker); - } else { - Map filteredPileup = new HashMap(); - - for (PileupElement p : pileupElementTracker) { - String readName = p.getRead().getReadName(); - - // if we've never seen this read before, life is good - if (!filteredPileup.containsKey(readName)) { - filteredPileup.put(readName, p); - } else { - PileupElement existing = filteredPileup.get(readName); - - // if the reads disagree at this position, throw them both out. 
Otherwise - // keep the element with the higher quality score - if (discardDiscordant && existing.getBase() != p.getBase()) { - filteredPileup.remove(readName); - } else { - if (baseQualNotMapQual) { - if (existing.getQual() < p.getQual()) - filteredPileup.put(readName, p); - } - else { - if (existing.getMappingQual() < p.getMappingQual()) - filteredPileup.put(readName, p); - } - } - } - } - - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for (PileupElement filteredElement : filteredPileup.values()) - filteredTracker.add(filteredElement); - - return createNewPileup(loc, filteredTracker); - } - } - - - /** - * Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this - * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy - * of the pileup (just returns this) if there are no MQ0 reads in the pileup. - * - * @return - */ - @Override - public ReadBackedPileupImpl getPileupWithoutMappingQualityZeroReads() { - if (getNumberOfMappingQualityZeroReads() > 0) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupWithoutMappingQualityZeroReads(); - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - return createNewPileup(loc, filteredTracker); - - } else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - - for (PileupElement p : tracker) { - if (p.getRead().getMappingQuality() > 0) { - 
filteredTracker.add(p); - } - } - return createNewPileup(loc, filteredTracker); - } - } else { - return this; - } - } - - public ReadBackedPileupImpl getPositiveStrandPileup() { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPositiveStrandPileup(); - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - return createNewPileup(loc, filteredTracker); - } else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - - for (PileupElement p : tracker) { - if (!p.getRead().getReadNegativeStrandFlag()) { - filteredTracker.add(p); - } - } - return createNewPileup(loc, filteredTracker); - } - } - - /** - * Gets the pileup consisting of only reads on the negative strand. - * - * @return A read-backed pileup consisting only of reads on the negative strand. 
- */ - public ReadBackedPileupImpl getNegativeStrandPileup() { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getNegativeStrandPileup(); - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - return createNewPileup(loc, filteredTracker); - } else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - - for (PileupElement p : tracker) { - if (p.getRead().getReadNegativeStrandFlag()) { - filteredTracker.add(p); - } - } - return createNewPileup(loc, filteredTracker); - } - } - - /** - * Gets a pileup consisting of all those elements passed by a given filter. - * - * @param filter Filter to use when testing for elements. - * @return a pileup without the given filtered elements. 
- */ - public ReadBackedPileupImpl getFilteredPileup(PileupElementFilter filter) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getFilteredPileup(filter); - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - - return createNewPileup(loc, filteredTracker); - } else { - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - - for (PileupElement p : pileupElementTracker) { - if (filter.allow(p)) - filteredTracker.add(p); - } - - return createNewPileup(loc, filteredTracker); - } - } - - /** - * Returns subset of this pileup that contains only bases with quality >= minBaseQ, coming from - * reads with mapping qualities >= minMapQ. This method allocates and returns a new instance of ReadBackedPileup. 
- * - * @param minBaseQ - * @param minMapQ - * @return - */ - @Override - public ReadBackedPileupImpl getBaseAndMappingFilteredPileup(int minBaseQ, int minMapQ) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getBaseAndMappingFilteredPileup(minBaseQ, minMapQ); - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - - return createNewPileup(loc, filteredTracker); - } else { - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - - for (PileupElement p : pileupElementTracker) { - if (p.getRead().getMappingQuality() >= minMapQ && (p.isDeletion() || p.getQual() >= minBaseQ)) { - filteredTracker.add(p); - } - } - - return createNewPileup(loc, filteredTracker); - } - } - - /** - * Returns subset of this pileup that contains only bases with quality >= minBaseQ. - * This method allocates and returns a new instance of ReadBackedPileup. - * - * @param minBaseQ - * @return - */ - @Override - public ReadBackedPileup getBaseFilteredPileup(int minBaseQ) { - return getBaseAndMappingFilteredPileup(minBaseQ, -1); - } - - /** - * Returns subset of this pileup that contains only bases coming from reads with mapping quality >= minMapQ. - * This method allocates and returns a new instance of ReadBackedPileup. - * - * @param minMapQ - * @return - */ - @Override - public ReadBackedPileup getMappingFilteredPileup(int minMapQ) { - return getBaseAndMappingFilteredPileup(-1, minMapQ); - } - - /** - * Gets a list of the read groups represented in this pileup. 
- * - * @return - */ - @Override - public Collection getReadGroups() { - Set readGroups = new HashSet(); - for (PileupElement pileupElement : this) - readGroups.add(pileupElement.getRead().getReadGroup().getReadGroupId()); - return readGroups; - } - - /** - * Gets the pileup for a given read group. Horrendously inefficient at this point. - * - * @param targetReadGroupId Identifier for the read group. - * @return A read-backed pileup containing only the reads in the given read group. - */ - @Override - public ReadBackedPileupImpl getPileupForReadGroup(String targetReadGroupId) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupForReadGroup(targetReadGroupId); - if (pileup != null) - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; - } else { - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for (PileupElement p : pileupElementTracker) { - GATKSAMRecord read = p.getRead(); - if (targetReadGroupId != null) { - if (read.getReadGroup() != null && targetReadGroupId.equals(read.getReadGroup().getReadGroupId())) - filteredTracker.add(p); - } else { - if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) - filteredTracker.add(p); - } - } - return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; - } - } - - /** - * Gets the pileup for a set of read groups. Horrendously inefficient at this point. - * - * @param rgSet List of identifiers for the read groups. 
- * @return A read-backed pileup containing only the reads in the given read groups. - */ - @Override - public ReadBackedPileupImpl getPileupForReadGroups(final HashSet rgSet) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupForReadGroups(rgSet); - if (pileup != null) - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; - } else { - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for (PileupElement p : pileupElementTracker) { - GATKSAMRecord read = p.getRead(); - if (rgSet != null && !rgSet.isEmpty()) { - if (read.getReadGroup() != null && rgSet.contains(read.getReadGroup().getReadGroupId())) - filteredTracker.add(p); - } else { - if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) - filteredTracker.add(p); - } - } - return filteredTracker.size() > 0 ? 
createNewPileup(loc, filteredTracker) : null; - } - } - - @Override - public ReadBackedPileupImpl getPileupForLane(String laneID) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupForLane(laneID); - if (pileup != null) - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; - } else { - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for (PileupElement p : pileupElementTracker) { - GATKSAMRecord read = p.getRead(); - if (laneID != null) { - if (read.getReadGroup() != null && - (read.getReadGroup().getReadGroupId().startsWith(laneID + ".")) || // lane is the same, but sample identifier is different - (read.getReadGroup().getReadGroupId().equals(laneID))) // in case there is no sample identifier, they have to be exactly the same - filteredTracker.add(p); - } else { - if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) - filteredTracker.add(p); - } - } - return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; - } - } - - public Collection getSamples() { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - return new HashSet(tracker.getSamples()); - } else { - Collection sampleNames = new HashSet(); - for (PileupElement p : this) { - GATKSAMRecord read = p.getRead(); - String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; - sampleNames.add(sampleName); - } - return sampleNames; - } - } - - /** - * Returns a pileup randomly downsampled to the desiredCoverage. - * - * TODO: delete this once the experimental downsampler stabilizes - * - * @param desiredCoverage - * @return - */ - @Override - public ReadBackedPileup getDownsampledPileup(int desiredCoverage) { - if (getNumberOfElements() <= desiredCoverage) - return this; - - // randomly choose numbers corresponding to positions in the reads list - TreeSet positions = new TreeSet(); - for (int i = 0; i < desiredCoverage; /* no update */) { - if (positions.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(getNumberOfElements()))) - i++; - } - - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - - int current = 0; - UnifiedPileupElementTracker filteredPileup = new UnifiedPileupElementTracker(); - for (PileupElement p : perSampleElements) { - if (positions.contains(current)) - filteredPileup.add(p); - current++; - - } - filteredTracker.addElements(sample, filteredPileup); - } - - return createNewPileup(loc, filteredTracker); - } else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - - Iterator positionIter = positions.iterator(); - - while (positionIter.hasNext()) { - int nextReadToKeep = (Integer) positionIter.next(); - filteredTracker.add(tracker.get(nextReadToKeep)); - } - - return createNewPileup(getLocation(), filteredTracker); - } - } - - @Override - public ReadBackedPileup getPileupForSamples(Collection sampleNames) { - if 
(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PileupElementTracker filteredElements = tracker.getElements(sampleNames); - return filteredElements != null ? createNewPileup(loc, filteredElements) : null; - } else { - HashSet hashSampleNames = new HashSet(sampleNames); // to speed up the "contains" access in the for loop - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for (PileupElement p : pileupElementTracker) { - GATKSAMRecord read = p.getRead(); - if (sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. - if (read.getReadGroup() != null && hashSampleNames.contains(read.getReadGroup().getSample())) - filteredTracker.add(p); - } else { - if (read.getReadGroup() == null || read.getReadGroup().getSample() == null) - filteredTracker.add(p); - } - } - return filteredTracker.size() > 0 ? 
createNewPileup(loc, filteredTracker) : null; - } - } - - @Override - public Map getPileupsForSamples(Collection sampleNames) { - Map result = new HashMap(); - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - for (String sample : sampleNames) { - PileupElementTracker filteredElements = tracker.getElements(sample); - if (filteredElements != null) - result.put(sample, createNewPileup(loc, filteredElements)); - } - } else { - Map> trackerMap = new HashMap>(); - - for (String sample : sampleNames) { // initialize pileups for each sample - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - trackerMap.put(sample, filteredTracker); - } - for (PileupElement p : pileupElementTracker) { // go through all pileup elements only once and add them to the respective sample's pileup - GATKSAMRecord read = p.getRead(); - if (read.getReadGroup() != null) { - String sample = read.getReadGroup().getSample(); - UnifiedPileupElementTracker tracker = trackerMap.get(sample); - if (tracker != null) // we only add the pileup the requested samples. Completely ignore the rest - tracker.add(p); - } - } - for (Map.Entry> entry : trackerMap.entrySet()) // create the ReadBackedPileup for each sample - result.put(entry.getKey(), createNewPileup(loc, entry.getValue())); - } - return result; - } - - - @Override - public ReadBackedPileup getPileupForSample(String sampleName) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PileupElementTracker filteredElements = tracker.getElements(sampleName); - return filteredElements != null ? 
createNewPileup(loc, filteredElements) : null; - } else { - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for (PileupElement p : pileupElementTracker) { - GATKSAMRecord read = p.getRead(); - if (sampleName != null) { - if (read.getReadGroup() != null && sampleName.equals(read.getReadGroup().getSample())) - filteredTracker.add(p); - } else { - if (read.getReadGroup() == null || read.getReadGroup().getSample() == null) - filteredTracker.add(p); - } - } - return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; - } - } - - // -------------------------------------------------------- - // - // iterators - // - // -------------------------------------------------------- - - /** - * The best way to access PileupElements where you only care about the bases and quals in the pileup. - *

- * for (PileupElement p : this) { doSomething(p); } - *

- * Provides efficient iteration of the data. - * - * @return - */ - @Override - public Iterator iterator() { - return new Iterator() { - private final Iterator wrappedIterator = pileupElementTracker.iterator(); - - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - public PileupElement next() { - return wrappedIterator.next(); - } - - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a pileup element iterator"); - } - }; - } - - /** - * The best way to access PileupElements where you only care not only about bases and quals in the pileup - * but also need access to the index of the pileup element in the pile. - * - * for (ExtendedPileupElement p : this) { doSomething(p); } - * - * Provides efficient iteration of the data. - * - * @return - */ - - /** - * Simple useful routine to count the number of deletion bases in this pileup - * - * @return - */ - @Override - public int getNumberOfDeletions() { - if ( nDeletions == UNINITIALIZED_CACHED_INT_VALUE ) { - nDeletions = 0; - for (PileupElement p : pileupElementTracker.unorderedIterable() ) { - if (p.isDeletion()) { - nDeletions++; - } - } - } - return nDeletions; - } - - @Override - public int getNumberOfMappingQualityZeroReads() { - if ( nMQ0Reads == UNINITIALIZED_CACHED_INT_VALUE ) { - nMQ0Reads = 0; - - for (PileupElement p : pileupElementTracker.unorderedIterable()) { - if (p.getRead().getMappingQuality() == 0) { - nMQ0Reads++; - } - } - } - - return nMQ0Reads; - } - - /** - * @return the number of physical elements in this pileup - */ - @Override - public int getNumberOfElements() { - return pileupElementTracker.size(); - } - - /** - * @return the number of abstract elements in this pileup - */ - @Override - public int depthOfCoverage() { - if (depthOfCoverage == UNINITIALIZED_CACHED_INT_VALUE) { - depthOfCoverage = 0; - for (PileupElement p : pileupElementTracker.unorderedIterable()) { - depthOfCoverage += p.getRepresentativeCount(); - } - } - return 
depthOfCoverage; - } - - /** - * @return true if there are 0 elements in the pileup, false otherwise - */ - @Override - public boolean isEmpty() { - return getNumberOfElements() == 0; - } - - - /** - * @return the location of this pileup - */ - @Override - public GenomeLoc getLocation() { - return loc; - } - - /** - * Get counts of A, C, G, T in order, which returns a int[4] vector with counts according - * to BaseUtils.simpleBaseToBaseIndex for each base. - * - * @return - */ - @Override - public int[] getBaseCounts() { - int[] counts = new int[4]; - - // TODO -- can be optimized with .unorderedIterable() - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - for (final String sample : tracker.getSamples()) { - int[] countsBySample = createNewPileup(loc, tracker.getElements(sample)).getBaseCounts(); - for (int i = 0; i < counts.length; i++) - counts[i] += countsBySample[i]; - } - } else { - for (PileupElement pile : this) { - // skip deletion sites - if (!pile.isDeletion()) { - int index = BaseUtils.simpleBaseToBaseIndex((char) pile.getBase()); - if (index != -1) - counts[index]++; - } - } - } - - return counts; - } - - @Override - public String getPileupString(Character ref) { - // In the pileup format, each line represents a genomic position, consisting of chromosome name, - // coordinate, reference base, read bases, read qualities and alignment mapping qualities. - return String.format("%s %s %c %s %s", - getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate - ref, // reference base - new String(getBases()), - getQualsString()); - } - - // -------------------------------------------------------- - // - // Convenience functions that may be slow - // - // -------------------------------------------------------- - - /** - * Returns a list of the reads in this pileup. 
Note this call costs O(n) and allocates fresh lists each time - * - * @return - */ - @Override - public List getReads() { - List reads = new ArrayList(getNumberOfElements()); - for (PileupElement pile : this) { - reads.add(pile.getRead()); - } - return reads; - } - - @Override - public int getNumberOfDeletionsAfterThisElement() { - int count = 0; - for (PileupElement p : pileupElementTracker.unorderedIterable()) { - if (p.isBeforeDeletionStart()) - count++; - } - return count; - } - - @Override - public int getNumberOfInsertionsAfterThisElement() { - int count = 0; - for (PileupElement p : pileupElementTracker.unorderedIterable()) { - if (p.isBeforeInsertion()) - count++; - } - return count; - - } - /** - * Returns a list of the offsets in this pileup. Note this call costs O(n) and allocates fresh lists each time - * - * @return - */ - @Override - public List getOffsets() { - List offsets = new ArrayList(getNumberOfElements()); - for (PileupElement pile : pileupElementTracker.unorderedIterable()) { - offsets.add(pile.getOffset()); - } - return offsets; - } - - /** - * Returns an array of the bases in this pileup. Note this call costs O(n) and allocates fresh array each time - * - * @return - */ - @Override - public byte[] getBases() { - byte[] v = new byte[getNumberOfElements()]; - int pos = 0; - for (PileupElement pile : pileupElementTracker) { - v[pos++] = pile.getBase(); - } - return v; - } - - /** - * Returns an array of the quals in this pileup. 
Note this call costs O(n) and allocates fresh array each time - * - * @return - */ - @Override - public byte[] getQuals() { - byte[] v = new byte[getNumberOfElements()]; - int pos = 0; - for (PileupElement pile : pileupElementTracker) { - v[pos++] = pile.getQual(); - } - return v; - } - - /** - * Get an array of the mapping qualities - * - * @return - */ - @Override - public int[] getMappingQuals() { - final int[] v = new int[getNumberOfElements()]; - int pos = 0; - for ( final PileupElement pile : pileupElementTracker ) { - v[pos++] = pile.getRead().getMappingQuality(); - } - return v; - } - - static String quals2String(byte[] quals) { - StringBuilder qualStr = new StringBuilder(); - for (int qual : quals) { - qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea - char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63 - qualStr.append(qualChar); - } - - return qualStr.toString(); - } - - private String getQualsString() { - return quals2String(getQuals()); - } - - /** - * Returns a new ReadBackedPileup that is sorted by start coordinate of the reads. - * - * @return - */ - @Override - public ReadBackedPileup getStartSortedPileup() { - - final TreeSet sortedElements = new TreeSet(new Comparator() { - @Override - public int compare(PileupElement element1, PileupElement element2) { - final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart(); - return difference != 0 ? 
difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); - } - }); - - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - for (PileupElement pile : perSampleElements) - sortedElements.add(pile); - } - } - else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; - for (PileupElement pile : tracker) - sortedElements.add(pile); - } - - UnifiedPileupElementTracker sortedTracker = new UnifiedPileupElementTracker(); - for (PileupElement pile : sortedElements) - sortedTracker.add(pile); - - return createNewPileup(loc, sortedTracker); - } - - @Override - public FragmentCollection toFragments() { - return FragmentUtils.create(this); - } - - @Override - public ReadBackedPileup copy() { - return new ReadBackedPileupImpl(loc, pileupElementTracker.copy()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java deleted file mode 100644 index 055f8630b..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ /dev/null @@ -1,509 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in 
all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.sam; - -import net.sf.samtools.*; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; - -import java.io.File; -import java.util.*; - -/** - * @author aaron - * @version 1.0 - */ -public class ArtificialSAMUtils { - public static final int DEFAULT_READ_LENGTH = 50; - - /** - * create an artificial sam file - * - * @param filename the filename to write to - * @param numberOfChromosomes the number of chromosomes - * @param startingChromosome where to start counting - * @param chromosomeSize how large each chromosome is - * @param readsPerChomosome how many reads to make in each chromosome. 
They'll be aligned from position 1 to x (which is the number of reads) - */ - public static void createArtificialBamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) { - SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize); - File outFile = new File(filename); - - SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(header, true, outFile); - - for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { - for (int readNumber = 1; readNumber < readsPerChomosome; readNumber++) { - out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, DEFAULT_READ_LENGTH)); - } - } - - out.close(); - } - - /** - * create an artificial sam file - * - * @param filename the filename to write to - * @param numberOfChromosomes the number of chromosomes - * @param startingChromosome where to start counting - * @param chromosomeSize how large each chromosome is - * @param readsPerChomosome how many reads to make in each chromosome. 
They'll be aligned from position 1 to x (which is the number of reads) - */ - public static void createArtificialSamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) { - SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize); - File outFile = new File(filename); - - SAMFileWriter out = new SAMFileWriterFactory().makeSAMWriter(header, false, outFile); - - for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { - for (int readNumber = 1; readNumber <= readsPerChomosome; readNumber++) { - out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, 100)); - } - } - - out.close(); - } - - /** - * Creates an artificial sam header, matching the parameters, chromosomes which will be labeled chr1, chr2, etc - * - * @param numberOfChromosomes the number of chromosomes to create - * @param startingChromosome the starting number for the chromosome (most likely set to 1) - * @param chromosomeSize the length of each chromosome - * @return - */ - public static SAMFileHeader createArtificialSamHeader(int numberOfChromosomes, int startingChromosome, int chromosomeSize) { - SAMFileHeader header = new SAMFileHeader(); - header.setSortOrder(net.sf.samtools.SAMFileHeader.SortOrder.coordinate); - SAMSequenceDictionary dict = new SAMSequenceDictionary(); - // make up some sequence records - for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { - SAMSequenceRecord rec = new SAMSequenceRecord("chr" + (x), chromosomeSize /* size */); - rec.setSequenceLength(chromosomeSize); - dict.addSequence(rec); - } - header.setSequenceDictionary(dict); - return header; - } - - /** - * Creates an artificial sam header based on the sequence dictionary dict - * - * @return a new sam header - */ - public static SAMFileHeader createArtificialSamHeader(final SAMSequenceDictionary dict) { - 
SAMFileHeader header = new SAMFileHeader(); - header.setSortOrder(net.sf.samtools.SAMFileHeader.SortOrder.coordinate); - header.setSequenceDictionary(dict); - return header; - } - - /** - * Creates an artificial sam header with standard test parameters - * - * @return the sam header - */ - public static SAMFileHeader createArtificialSamHeader() { - return createArtificialSamHeader(1, 1, 1000000); - } - - /** - * setup a default read group for a SAMFileHeader - * - * @param header the header to set - * @param readGroupID the read group ID tag - * @param sampleName the sample name - * @return the adjusted SAMFileHeader - */ - public static SAMFileHeader createDefaultReadGroup(SAMFileHeader header, String readGroupID, String sampleName) { - SAMReadGroupRecord rec = new SAMReadGroupRecord(readGroupID); - rec.setSample(sampleName); - List readGroups = new ArrayList(); - readGroups.add(rec); - header.setReadGroups(readGroups); - return header; - } - - /** - * setup read groups for the specified read groups and sample names - * - * @param header the header to set - * @param readGroupIDs the read group ID tags - * @param sampleNames the sample names - * @return the adjusted SAMFileHeader - */ - public static SAMFileHeader createEnumeratedReadGroups(SAMFileHeader header, List readGroupIDs, List sampleNames) { - if (readGroupIDs.size() != sampleNames.size()) { - throw new ReviewedStingException("read group count and sample name count must be the same"); - } - - List readGroups = new ArrayList(); - - int x = 0; - for (; x < readGroupIDs.size(); x++) { - SAMReadGroupRecord rec = new SAMReadGroupRecord(readGroupIDs.get(x)); - rec.setSample(sampleNames.get(x)); - readGroups.add(rec); - } - header.setReadGroups(readGroups); - return header; - } - - - /** - * Create an artificial read based on the parameters. 
The cigar string will be *M, where * is the length of the read - * - * @param header the SAM header to associate the read with - * @param name the name of the read - * @param refIndex the reference index, i.e. what chromosome to associate it with - * @param alignmentStart where to start the alignment - * @param length the length of the read - * @return the artificial read - */ - public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) { - if ((refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart != SAMRecord.NO_ALIGNMENT_START) || - (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START)) - throw new ReviewedStingException("Invalid alignment start for artificial read, start = " + alignmentStart); - GATKSAMRecord record = new GATKSAMRecord(header); - record.setReadName(name); - record.setReferenceIndex(refIndex); - record.setAlignmentStart(alignmentStart); - List elements = new ArrayList(); - elements.add(new CigarElement(length, CigarOperator.characterToEnum('M'))); - record.setCigar(new Cigar(elements)); - record.setProperPairFlag(false); - - // our reads and quals are all 'A's by default - byte[] c = new byte[length]; - byte[] q = new byte[length]; - for (int x = 0; x < length; x++) - c[x] = q[x] = 'A'; - record.setReadBases(c); - record.setBaseQualities(q); - - if (refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - record.setReadUnmappedFlag(true); - } - - return record; - } - - /** - * Create an artificial read based on the parameters. The cigar string will be *M, where * is the length of the read - * - * @param header the SAM header to associate the read with - * @param name the name of the read - * @param refIndex the reference index, i.e. 
what chromosome to associate it with - * @param alignmentStart where to start the alignment - * @param bases the sequence of the read - * @param qual the qualities of the read - * @return the artificial read - */ - public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual) { - if (bases.length != qual.length) { - throw new ReviewedStingException("Passed in read string is different length then the quality array"); - } - GATKSAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases.length); - rec.setReadBases(bases); - rec.setBaseQualities(qual); - rec.setReadGroup(new GATKSAMReadGroupRecord("x")); - if (refIndex == -1) { - rec.setReadUnmappedFlag(true); - } - - return rec; - } - - /** - * Create an artificial read based on the parameters - * - * @param header the SAM header to associate the read with - * @param name the name of the read - * @param refIndex the reference index, i.e. 
what chromosome to associate it with - * @param alignmentStart where to start the alignment - * @param bases the sequence of the read - * @param qual the qualities of the read - * @param cigar the cigar string of the read - * @return the artificial read - */ - public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar) { - GATKSAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases, qual); - rec.setCigarString(cigar); - return rec; - } - - /** - * Create an artificial read with the following default parameters : - * header: - * numberOfChromosomes = 1 - * startingChromosome = 1 - * chromosomeSize = 1000000 - * read: - * name = "default_read" - * refIndex = 0 - * alignmentStart = 1 - * - * @param bases the sequence of the read - * @param qual the qualities of the read - * @param cigar the cigar string of the read - * @return the artificial read - */ - public static GATKSAMRecord createArtificialRead(byte[] bases, byte[] qual, String cigar) { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(); - return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar); - } - - public static GATKSAMRecord createArtificialRead(Cigar cigar) { - int length = cigar.getReadLength(); - byte [] base = {'A'}; - byte [] qual = {30}; - byte [] bases = Utils.arrayFromArrayWithLength(base, length); - byte [] quals = Utils.arrayFromArrayWithLength(qual, length); - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(); - return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, quals, cigar.toString()); - } - - - public final static List createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { - GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, 
readLen); - GATKSAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen); - - left.setReadPairedFlag(true); - right.setReadPairedFlag(true); - - left.setProperPairFlag(true); - right.setProperPairFlag(true); - - left.setFirstOfPairFlag(leftIsFirst); - right.setFirstOfPairFlag(!leftIsFirst); - - left.setReadNegativeStrandFlag(leftIsNegative); - left.setMateNegativeStrandFlag(!leftIsNegative); - right.setReadNegativeStrandFlag(!leftIsNegative); - right.setMateNegativeStrandFlag(leftIsNegative); - - left.setMateAlignmentStart(right.getAlignmentStart()); - right.setMateAlignmentStart(left.getAlignmentStart()); - - left.setMateReferenceIndex(0); - right.setMateReferenceIndex(0); - - int isize = rightStart + readLen - leftStart; - left.setInferredInsertSize(isize); - right.setInferredInsertSize(-isize); - - return Arrays.asList(left, right); - } - - /** - * Create an artificial reduced read based on the parameters. The cigar string will be *M, where * is the - * length of the read. The base counts specified in the baseCounts array will be stored fully encoded in - * the RR attribute. - * - * @param header the SAM header to associate the read with - * @param name the name of the read - * @param refIndex the reference index, i.e. 
what chromosome to associate it with - * @param alignmentStart where to start the alignment - * @param length the length of the read - * @param baseCounts reduced base counts to encode in the RR attribute; length must match the read length - * @return the artificial reduced read - */ - public static GATKSAMRecord createArtificialReducedRead( final SAMFileHeader header, - final String name, - final int refIndex, - final int alignmentStart, - final int length, - final int[] baseCounts ) { - final GATKSAMRecord read = createArtificialRead(header, name, refIndex, alignmentStart, length); - read.setReducedReadCounts(baseCounts); - read.setReducedReadCountsTag(); - return read; - } - - /** - * Create a collection of identical artificial reads based on the parameters. The cigar string for each - * read will be *M, where * is the length of the read. - * - * Useful for testing things like positional downsampling where you care only about the position and - * number of reads, and not the other attributes. - * - * @param stackSize number of identical reads to create - * @param header the SAM header to associate each read with - * @param name name associated with each read - * @param refIndex the reference index, i.e. 
what chromosome to associate them with - * @param alignmentStart where to start each alignment - * @param length the length of each read - * - * @return a collection of stackSize reads all sharing the above properties - */ - public static Collection createStackOfIdenticalArtificialReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { - Collection stack = new ArrayList(stackSize); - for ( int i = 1; i <= stackSize; i++ ) { - stack.add(createArtificialRead(header, name, refIndex, alignmentStart, length)); - } - return stack; - } - - /** - * create an iterator containing the specified read piles - * - * @param startingChr the chromosome (reference ID) to start from - * @param endingChr the id to end with - * @param readCount the number of reads per chromosome - * @return StingSAMIterator representing the specified amount of fake data - */ - public static StingSAMIterator mappedReadIterator(int startingChr, int endingChr, int readCount) { - SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); - - return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header); - } - - /** - * create an iterator containing the specified read piles - * - * @param startingChr the chromosome (reference ID) to start from - * @param endingChr the id to end with - * @param readCount the number of reads per chromosome - * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file - * @return StingSAMIterator representing the specified amount of fake data - */ - public static StingSAMIterator mappedAndUnmappedReadIterator(int startingChr, int endingChr, int readCount, int unmappedReadCount) { - SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); - - return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 
unmappedReadCount, header); - } - - /** - * create an ArtificialSAMQueryIterator containing the specified read piles - * - * @param startingChr the chromosome (reference ID) to start from - * @param endingChr the id to end with - * @param readCount the number of reads per chromosome - * @return StingSAMIterator representing the specified amount of fake data - */ - public static ArtificialSAMQueryIterator queryReadIterator(int startingChr, int endingChr, int readCount) { - SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); - - return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header); - } - - /** - * create an ArtificialSAMQueryIterator containing the specified read piles - * - * @param startingChr the chromosome (reference ID) to start from - * @param endingChr the id to end with - * @param readCount the number of reads per chromosome - * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file - * @return StingSAMIterator representing the specified amount of fake data - */ - public static StingSAMIterator queryReadIterator(int startingChr, int endingChr, int readCount, int unmappedReadCount) { - SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); - - return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header); - } - - /** - * Create an iterator containing the specified reads - * - * @param reads the reads - * @return iterator for the reads - */ - public static StingSAMIterator createReadIterator(SAMRecord... 
reads) { - return createReadIterator(Arrays.asList(reads)); - } - - /** - * Create an iterator containing the specified reads - * - * @param reads the reads - * @return iterator for the reads - */ - public static StingSAMIterator createReadIterator(List reads) { - final Iterator iter = reads.iterator(); - return new StingSAMIterator() { - @Override public void close() {} - @Override public Iterator iterator() { return iter; } - @Override public boolean hasNext() { return iter.hasNext(); } - @Override public SAMRecord next() { return iter.next(); } - @Override public void remove() { iter.remove(); } - }; - } - - private final static int ranIntInclusive(Random ran, int start, int stop) { - final int range = stop - start; - return ran.nextInt(range) + start; - } - - /** - * Creates a read backed pileup containing up to pileupSize reads at refID 0 from header at loc with - * reads created that have readLen bases. Pairs are sampled from a gaussian distribution with mean insert - * size of insertSize and variation of insertSize / 10. The first read will be in the pileup, and the second - * may be, depending on where this sampled insertSize puts it. 
- * - * @param header - * @param loc - * @param readLen - * @param insertSize - * @param pileupSize - * @return - */ - public static ReadBackedPileup createReadBackedPileup(final SAMFileHeader header, final GenomeLoc loc, final int readLen, final int insertSize, final int pileupSize) { - final Random ran = new Random(); - final boolean leftIsFirst = true; - final boolean leftIsNegative = false; - final int insertSizeVariation = insertSize / 10; - final int pos = loc.getStart(); - - final List pileupElements = new ArrayList(); - for (int i = 0; i < pileupSize / 2; i++) { - final String readName = "read" + i; - final int leftStart = ranIntInclusive(ran, 1, pos); - final int fragmentSize = (int) (ran.nextGaussian() * insertSizeVariation + insertSize); - final int rightStart = leftStart + fragmentSize - readLen; - - if (rightStart <= 0) continue; - - List pair = createPair(header, readName, readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); - final GATKSAMRecord left = pair.get(0); - final GATKSAMRecord right = pair.get(1); - - pileupElements.add(LocusIteratorByState.createPileupForReadAndOffset(left, pos - leftStart)); - - if (pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) { - pileupElements.add(LocusIteratorByState.createPileupForReadAndOffset(right, pos - rightStart)); - } - } - - Collections.sort(pileupElements); - return new ReadBackedPileupImpl(loc, pileupElements); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java deleted file mode 100644 index 93718b04d..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ /dev/null @@ -1,827 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, 
including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.sam; - -import com.google.java.contract.Ensures; -import net.sf.samtools.*; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.NGSPlatform; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.recalibration.EventType; - -import java.util.*; - -/** - * @author ebanks, depristo - * GATKSAMRecord - * - * this class extends the samtools BAMRecord class (and SAMRecord) and caches important - * (and oft-accessed) data that's not already cached by the SAMRecord class - * - * IMPORTANT NOTE: Because ReadGroups are not set through the SAMRecord, - * if they are ever modified externally then one must also invoke the - * setReadGroup() method here to ensure that the cache is kept up-to-date. - * - * WARNING -- GATKSAMRecords cache several values (that are expensive to compute) - * that depending on the inferred insert size and alignment starts and stops of this read and its mate. - * Changing these values in any way will invalidate the cached value. 
However, we do not monitor those setter - * functions, so modifying a GATKSAMRecord in any way may result in stale cached values. - */ -public class GATKSAMRecord extends BAMRecord { - // ReduceReads specific attribute tags - public static final String REDUCED_READ_CONSENSUS_TAG = "RR"; // marks a synthetic read produced by the ReduceReads tool - public static final String REDUCED_READ_STRANDED_TAG = "RS"; // marks a stranded synthetic read produced by the ReduceReads tool - public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT = "OP"; // reads that are clipped may use this attribute to keep track of their original alignment start - public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT = "OE"; // reads that are clipped may use this attribute to keep track of their original alignment end - - // Base Quality Score Recalibrator specific attribute tags - public static final String BQSR_BASE_INSERTION_QUALITIES = "BI"; // base qualities for insertions - public static final String BQSR_BASE_DELETION_QUALITIES = "BD"; // base qualities for deletions - - /** - * The default quality score for an insertion or deletion, if - * none are provided for this read. 
- */ - public static final byte DEFAULT_INSERTION_DELETION_QUAL = (byte)45; - - // the SAMRecord data we're caching - private String mReadString = null; - private GATKSAMReadGroupRecord mReadGroup = null; - private int[] reducedReadCounts = null; - private final static int UNINITIALIZED = -1; - private int softStart = UNINITIALIZED; - private int softEnd = UNINITIALIZED; - private Integer adapterBoundary = null; - - private Boolean isStrandlessRead = null; - - // because some values can be null, we don't want to duplicate effort - private boolean retrievedReadGroup = false; - private boolean retrievedReduceReadCounts = false; - - // These temporary attributes were added here to make life easier for - // certain algorithms by providing a way to label or attach arbitrary data to - // individual GATKSAMRecords. - // These attributes exist in memory only, and are never written to disk. - private Map temporaryAttributes; - - /** - * HACK TO CREATE GATKSAMRECORD WITH ONLY A HEADER FOR TESTING PURPOSES ONLY - * @param header - */ - public GATKSAMRecord(final SAMFileHeader header) { - this(new SAMRecord(header)); - } - - /** - * HACK TO CREATE GATKSAMRECORD BASED ONLY A SAMRECORD FOR TESTING PURPOSES ONLY - * @param read - */ - public GATKSAMRecord(final SAMRecord read) { - super(read.getHeader(), read.getMateReferenceIndex(), - read.getAlignmentStart(), - read.getReadName() != null ? 
(short)read.getReadNameLength() : 0, - (short)read.getMappingQuality(), - 0, - read.getCigarLength(), - read.getFlags(), - read.getReadLength(), - read.getMateReferenceIndex(), - read.getMateAlignmentStart(), - read.getInferredInsertSize(), - null); - SAMReadGroupRecord samRG = read.getReadGroup(); - clearAttributes(); - if (samRG != null) { - GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG); - setReadGroup(rg); - } - } - - public GATKSAMRecord(final SAMFileHeader header, - final int referenceSequenceIndex, - final int alignmentStart, - final short readNameLength, - final short mappingQuality, - final int indexingBin, - final int cigarLen, - final int flags, - final int readLen, - final int mateReferenceSequenceIndex, - final int mateAlignmentStart, - final int insertSize, - final byte[] variableLengthBlock) { - super(header, referenceSequenceIndex, alignmentStart, readNameLength, mappingQuality, indexingBin, cigarLen, - flags, readLen, mateReferenceSequenceIndex, mateAlignmentStart, insertSize, variableLengthBlock); - } - - public static GATKSAMRecord createRandomRead(int length) { - List cigarElements = new LinkedList<>(); - cigarElements.add(new CigarElement(length, CigarOperator.M)); - Cigar cigar = new Cigar(cigarElements); - return ArtificialSAMUtils.createArtificialRead(cigar); - } - - /////////////////////////////////////////////////////////////////////////////// - // *** support for reads without meaningful strand information ***// - /////////////////////////////////////////////////////////////////////////////// - - /** - * Does this read have a meaningful strandedness value? - * - * Some advanced types of reads, such as reads coming from merged fragments, - * don't have meaningful strandedness values, as they are composites of multiple - * other reads. Strandless reads need to be handled specially by code that cares about - * stranded information, such as FS. 
- * - * @return true if this read doesn't have meaningful strand information - */ - public boolean isStrandless() { - if ( isStrandlessRead == null ) { - isStrandlessRead = isReducedRead() && getCharacterAttribute(REDUCED_READ_STRANDED_TAG) == null; - } - return isStrandlessRead; - } - - /** - * Set the strandless state of this read to isStrandless - * @param isStrandless true if this read doesn't have a meaningful strandedness value - */ - public void setIsStrandless(final boolean isStrandless) { - this.isStrandlessRead = isStrandless; - } - - @Override - public boolean getReadNegativeStrandFlag() { - return ! isStrandless() && super.getReadNegativeStrandFlag(); - } - - @Override - public void setReadNegativeStrandFlag(final boolean flag) { - if ( isStrandless() ) - throw new IllegalStateException("Cannot set the strand of a strandless read"); - super.setReadNegativeStrandFlag(flag); - } - - - /////////////////////////////////////////////////////////////////////////////// - // *** The following methods are overloaded to cache the appropriate data ***// - /////////////////////////////////////////////////////////////////////////////// - - @Override - public String getReadString() { - if ( mReadString == null ) - mReadString = super.getReadString(); - return mReadString; - } - - @Override - public void setReadString(String s) { - super.setReadString(s); - mReadString = s; - } - - /** - * Get the GATKSAMReadGroupRecord of this read - * @return a non-null GATKSAMReadGroupRecord - */ - @Override - public GATKSAMReadGroupRecord getReadGroup() { - if ( ! 
retrievedReadGroup ) { - final SAMReadGroupRecord rg = super.getReadGroup(); - - // three cases: rg may be null (no rg, rg may already be a GATKSAMReadGroupRecord, or it may be - // a regular SAMReadGroupRecord in which case we have to make it a GATKSAMReadGroupRecord - if ( rg == null ) - mReadGroup = null; - else if ( rg instanceof GATKSAMReadGroupRecord ) - mReadGroup = (GATKSAMReadGroupRecord)rg; - else - mReadGroup = new GATKSAMReadGroupRecord(rg); - - retrievedReadGroup = true; - } - return mReadGroup; - } - - public void setReadGroup( final GATKSAMReadGroupRecord readGroup ) { - mReadGroup = readGroup; - retrievedReadGroup = true; - setAttribute("RG", mReadGroup.getId()); // todo -- this should be standardized, but we don't have access to SAMTagUtils! - } - - - @Override - public int hashCode() { - return super.hashCode(); - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - - if (!(o instanceof GATKSAMRecord)) return false; - - // note that we do not consider the GATKSAMRecord internal state at all - return super.equals(o); - } - - /** - * Setters and Accessors for base insertion and base deletion quality scores - */ - public void setBaseQualities( final byte[] quals, final EventType errorModel ) { - switch( errorModel ) { - case BASE_SUBSTITUTION: - setBaseQualities(quals); - break; - case BASE_INSERTION: - setAttribute( GATKSAMRecord.BQSR_BASE_INSERTION_QUALITIES, quals == null ? null : SAMUtils.phredToFastq(quals) ); - break; - case BASE_DELETION: - setAttribute( GATKSAMRecord.BQSR_BASE_DELETION_QUALITIES, quals == null ? 
null : SAMUtils.phredToFastq(quals) ); - break; - default: - throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel ); - } - } - - public byte[] getBaseQualities( final EventType errorModel ) { - switch( errorModel ) { - case BASE_SUBSTITUTION: - return getBaseQualities(); - case BASE_INSERTION: - return getBaseInsertionQualities(); - case BASE_DELETION: - return getBaseDeletionQualities(); - default: - throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel ); - } - } - - /** - * @return whether or not this read has base insertion or deletion qualities (one of the two is sufficient to return true) - */ - public boolean hasBaseIndelQualities() { - return getAttribute( BQSR_BASE_INSERTION_QUALITIES ) != null || getAttribute( BQSR_BASE_DELETION_QUALITIES ) != null; - } - - /** - * @return the base deletion quality or null if read doesn't have one - */ - public byte[] getExistingBaseInsertionQualities() { - return SAMUtils.fastqToPhred( getStringAttribute(BQSR_BASE_INSERTION_QUALITIES)); - } - - /** - * @return the base deletion quality or null if read doesn't have one - */ - public byte[] getExistingBaseDeletionQualities() { - return SAMUtils.fastqToPhred( getStringAttribute(BQSR_BASE_DELETION_QUALITIES)); - } - - /** - * Default utility to query the base insertion quality of a read. If the read doesn't have one, it creates an array of default qualities (currently Q45) - * and assigns it to the read. 
- * - * @return the base insertion quality array - */ - public byte[] getBaseInsertionQualities() { - byte [] quals = getExistingBaseInsertionQualities(); - if( quals == null ) { - quals = new byte[getBaseQualities().length]; - Arrays.fill(quals, DEFAULT_INSERTION_DELETION_QUAL); // Some day in the future when base insertion and base deletion quals exist the samtools API will - // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 - } - return quals; - } - - /** - * Default utility to query the base deletion quality of a read. If the read doesn't have one, it creates an array of default qualities (currently Q45) - * and assigns it to the read. - * - * @return the base deletion quality array - */ - public byte[] getBaseDeletionQualities() { - byte[] quals = getExistingBaseDeletionQualities(); - if( quals == null ) { - quals = new byte[getBaseQualities().length]; - Arrays.fill(quals, DEFAULT_INSERTION_DELETION_QUAL); // Some day in the future when base insertion and base deletion quals exist the samtools API will - // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 - } - return quals; - } - - /** - * Efficient caching accessor that returns the GATK NGSPlatform of this read - * @return - */ - public NGSPlatform getNGSPlatform() { - return getReadGroup().getNGSPlatform(); - } - - /////////////////////////////////////////////////////////////////////////////// - // *** ReduceReads functions ***// - /////////////////////////////////////////////////////////////////////////////// - - /** - * Get the counts of the bases in this reduced read - * - * NOTE that this is not the value of the REDUCED_READ_CONSENSUS_TAG, which - * is encoded in a special way. This is the actual positive counts of the - * depth at each bases. So for a RR with a tag of: - * - * [10, 5, -1, -5] - * - * this function returns - * - * [10, 15, 9, 5] - * - * as one might expect. 
- * - * @return a int[] holding the depth of the bases in this reduced read, or null if this isn't a reduced read - */ - public int[] getReducedReadCounts() { - if ( ! retrievedReduceReadCounts ) { - final byte[] tag = getByteArrayAttribute(REDUCED_READ_CONSENSUS_TAG); - if ( tag != null ) reducedReadCounts = decodeReduceReadCounts(tag); - retrievedReduceReadCounts = true; - } - - return reducedReadCounts; - } - - /** - * The number of bases corresponding the i'th base of the reduced read. - * - * @param i the read based coordinate inside the read - * @return the number of bases corresponding to the i'th base of the reduced read - */ - public final int getReducedCount(final int i) { - if ( !isReducedRead() ) - throw new IllegalArgumentException("error trying to retrieve the reduced count from a read that is not reduced"); - if ( i < 0 || i >= getReadBases().length ) - throw new IllegalArgumentException("illegal offset used when retrieving reduced counts: " + i); - - final int[] reducedCounts = getReducedReadCounts(); - return reducedCounts[i]; - } - - /** - * Is this read a reduced read? - * @return true if yes - */ - public boolean isReducedRead() { - return getReducedReadCounts() != null; - } - - /** - * Set the reduced read counts tag for this record. - * Note that this method is slightly expensive as it converts to the correct reduced counts representation and sets the - * appropriate binary tag. If you want to modify the reduced count in place without triggering the permanent conversion - * internally, use the #setReducedCount() method. - * - * @param counts the count array - */ - public void setReducedReadCountsTag(final int[] counts) { - setAttribute(REDUCED_READ_CONSENSUS_TAG, encodeReduceReadCounts(counts)); - retrievedReduceReadCounts = false; // need to force new decode in case we had to handle precision problems with the counts - } - - /** - * @see #setReducedReadCountsTag() and uses the currently stored values of the internal array. 
- * Useful if you've been using #setReducedCount() to modify the reduced count and now want to trigger the expensive conversion. - */ - public void setReducedReadCountsTag() { - if ( !retrievedReduceReadCounts ) - throw new IllegalStateException("Trying to write the reduced reads counts using an uninitialized internal array of counts"); - setReducedReadCountsTag(reducedReadCounts); - } - - /** - * Sets the reduced read count corresponding the i'th base of the reduced read. - * - * WARNING: does not actually write this value permanently to the binary tags for this read. To trigger the conversion - * and push that value into the read's binary tags, use #setReducedReadCountsTag(). - * - * @param i the read based coordinate inside the read - * @param count the new count - */ - public final void setReducedCount(final int i, final int count) { - if ( count < 0 ) - throw new IllegalArgumentException("the reduced count cannot be set to a negative value"); - if ( !isReducedRead() ) - throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced"); - if ( i < 0 || i >= getReadBases().length ) - throw new IllegalArgumentException("illegal offset used when setting the reduced count: " + i); - - // force the initialization of the counts array if it hasn't happened yet - getReducedReadCounts()[i] = count; - } - - /** - * Set the reduced read counts tag for this record to counts - * - * WARNING: does not actually write this value permanently to the binary tags for this read. To trigger the conversion - * and push that value into the read's binary tags, use #setReducedReadCountsTag(). 
- * - * @param counts the count array - */ - public void setReducedReadCounts(final int[] counts) { - if ( counts.length != getReadBases().length ) - throw new IllegalArgumentException("Reduced counts length " + counts.length + " != bases length " + getReadBases().length); - retrievedReduceReadCounts = true; - reducedReadCounts = counts; - } - - /** - * Sets the number of bases corresponding the i'th base of the reduced read. - * - * WARNING: does not actually write this value permanently to the binary tags for this read. To trigger the conversion - * and push that value into the read's binary tags, use #setReducedReadCountsTag(). - * - * @param i the read based coordinate inside the read - * @param adjustmentFactor how much to add/subtract to the current count - */ - public final void adjustReducedCount(final int i, final int adjustmentFactor) { - if ( !isReducedRead() ) - throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced"); - if ( i < 0 || i >= getReadBases().length ) - throw new IllegalArgumentException("illegal offset used when setting the reduced count: " + i); - - setReducedCount(i, getReducedReadCounts()[i] + adjustmentFactor); - } - - /** - * Actually decode the consensus tag of a reduce read, returning a newly allocated - * set of values countsFromTag to be the real depth of cover at each base of the reduced read. - * - * for example, if the tag contains [10, 5, -1, -5], after running this function the - * byte[] will contain the true counts [10, 15, 9, 5]. - * - * as one might expect. 
- * - * @param countsFromTag a non-null byte[] containing the tag encoded reduce reads counts - * @return a non-null int[] containing the true depth values for the vector - */ - protected static int[] decodeReduceReadCounts(final byte[] countsFromTag) { - final int n = countsFromTag.length; - final int[] result = new int[n]; - final int firstCount = countsFromTag[0] & 0xff; // unsigned byte - result[0] = firstCount; - for ( int i = 1; i < n; i++ ) { - final int offsetCount = countsFromTag[i] & 0xff; // unsigned byte - result[i] = (firstCount + offsetCount) % 256; - } - - return result; - } - - /** - * Converts int array from straight counts to the appropriate reduce reads representation in BAM (offset from first value) - * - * @param counts the counts array - * @return non-null converted byte array - */ - protected static byte[] encodeReduceReadCounts(final int[] counts) { - if ( counts.length == 0 ) - throw new IllegalArgumentException("Trying to write a reduced read with a counts array of length 0"); - - final byte[] compressedCountsArray = new byte[counts.length]; - final int firstCount = (int) MathUtils.bound(counts[0], 0, 255); // we want an unsigned byte capped at max byte representation - compressedCountsArray[0] = (byte)firstCount; - for ( int i = 1; i < counts.length; i++ ) { - final int count = (int) MathUtils.bound(counts[i], 0, 255); - final byte offset = (byte)(count - firstCount + (count >= firstCount ? 0 : 256)); // unsigned byte - compressedCountsArray[i] = offset; - } - - return compressedCountsArray; - } - - /////////////////////////////////////////////////////////////////////////////// - // *** GATKSAMRecord specific methods ***// - /////////////////////////////////////////////////////////////////////////////// - - /** - * Checks whether an attribute has been set for the given key. - * - * Temporary attributes provide a way to label or attach arbitrary data to - * individual GATKSAMRecords. 
These attributes exist in memory only, - * and are never written to disk. - * - * @param key key - * @return True if an attribute has been set for this key. - */ - public boolean containsTemporaryAttribute(Object key) { - return temporaryAttributes != null && temporaryAttributes.containsKey(key); - } - - /** - * Sets the key to the given value, replacing any previous value. The previous - * value is returned. - * - * Temporary attributes provide a way to label or attach arbitrary data to - * individual GATKSAMRecords. These attributes exist in memory only, - * and are never written to disk. - * - * @param key key - * @param value value - * @return attribute - */ - public Object setTemporaryAttribute(Object key, Object value) { - if(temporaryAttributes == null) { - temporaryAttributes = new HashMap<>(); - } - return temporaryAttributes.put(key, value); - } - - /** - * Looks up the value associated with the given key. - * - * Temporary attributes provide a way to label or attach arbitrary data to - * individual GATKSAMRecords. These attributes exist in memory only, - * and are never written to disk. - * - * @param key key - * @return The value, or null. - */ - public Object getTemporaryAttribute(Object key) { - if(temporaryAttributes != null) { - return temporaryAttributes.get(key); - } - return null; - } - - /** - * Checks whether if the read has any bases. - * - * Empty reads can be dangerous as it may have no cigar strings, no read names and - * other missing attributes. - * - * @return true if the read has no bases - */ - public boolean isEmpty() { - return super.getReadBases() == null || super.getReadLength() == 0; - } - - /** - * Clears all attributes except ReadGroup of the read. - */ - public GATKSAMRecord simplify () { - GATKSAMReadGroupRecord rg = getReadGroup(); // save the read group information - byte[] insQuals = (this.getAttribute(BQSR_BASE_INSERTION_QUALITIES) == null) ? 
null : getBaseInsertionQualities(); - byte[] delQuals = (this.getAttribute(BQSR_BASE_DELETION_QUALITIES) == null) ? null : getBaseDeletionQualities(); - this.clearAttributes(); // clear all attributes from the read - this.setReadGroup(rg); // restore read group - if (insQuals != null) - this.setBaseQualities(insQuals, EventType.BASE_INSERTION); // restore base insertion if we had any - if (delQuals != null) - this.setBaseQualities(delQuals, EventType.BASE_DELETION); // restore base deletion if we had any - return this; - } - - /** - * Calculates the reference coordinate for the beginning of the read taking into account soft clips but not hard clips. - * - * Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips. - * - * @return the unclipped start of the read taking soft clips (but not hard clips) into account - */ - public int getSoftStart() { - if ( softStart == UNINITIALIZED ) { - softStart = getAlignmentStart(); - for (final CigarElement cig : getCigar().getCigarElements()) { - final CigarOperator op = cig.getOperator(); - - if (op == CigarOperator.SOFT_CLIP) - softStart -= cig.getLength(); - else if (op != CigarOperator.HARD_CLIP) - break; - } - } - return softStart; - } - - /** - * Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips. - * - * Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips. 
- * - * @return the unclipped end of the read taking soft clips (but not hard clips) into account - */ - public int getSoftEnd() { - if ( softEnd == UNINITIALIZED ) { - boolean foundAlignedBase = false; - softEnd = getAlignmentEnd(); - final List cigs = getCigar().getCigarElements(); - for (int i = cigs.size() - 1; i >= 0; --i) { - final CigarElement cig = cigs.get(i); - final CigarOperator op = cig.getOperator(); - - if (op == CigarOperator.SOFT_CLIP) // assumes the soft clip that we found is at the end of the aligned read - softEnd += cig.getLength(); - else if (op != CigarOperator.HARD_CLIP) { - foundAlignedBase = true; - break; - } - } - if( !foundAlignedBase ) { // for example 64H14S, the soft end is actually the same as the alignment end - softEnd = getAlignmentEnd(); - } - } - - return softEnd; - } - - /** - * If the read is hard clipped, the soft start and end will change. You can set manually or just reset the cache - * so that the next call to getSoftStart/End will recalculate it lazily. - */ - public void resetSoftStartAndEnd() { - softStart = -1; - softEnd = -1; - } - - /** - * If the read is hard clipped, the soft start and end will change. You can set manually or just reset the cache - * so that the next call to getSoftStart/End will recalculate it lazily. - */ - public void resetSoftStartAndEnd(int softStart, int softEnd) { - this.softStart = softStart; - this.softEnd = softEnd; - } - - /** - * Determines the original alignment start of a previously clipped read. 
- * - * This is useful for reads that have been trimmed to a variant region and lost the information of it's original alignment end - * - * @return the alignment start of a read before it was clipped - */ - public int getOriginalAlignmentStart() { - int originalAlignmentStart = getUnclippedStart(); - Integer alignmentShift = (Integer) getAttribute(REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT); - if (alignmentShift != null) - originalAlignmentStart += alignmentShift; - return originalAlignmentStart; - } - - /** - * Determines the original alignment end of a previously clipped read. - * - * This is useful for reads that have been trimmed to a variant region and lost the information of it's original alignment end - * - * @return the alignment end of a read before it was clipped - */ - public int getOriginalAlignmentEnd() { - int originalAlignmentEnd = getUnclippedEnd(); - Integer alignmentShift = (Integer) getAttribute(REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT); - if (alignmentShift != null) - originalAlignmentEnd -= alignmentShift; - return originalAlignmentEnd; - } - - /** - * Creates an empty GATKSAMRecord with the read's header, read group and mate - * information, but empty (not-null) fields: - * - Cigar String - * - Read Bases - * - Base Qualities - * - * Use this method if you want to create a new empty GATKSAMRecord based on - * another GATKSAMRecord - * - * @param read a read to copy the header from - * @return a read with no bases but safe for the GATK - */ - public static GATKSAMRecord emptyRead(GATKSAMRecord read) { - GATKSAMRecord emptyRead = new GATKSAMRecord(read.getHeader(), - read.getReferenceIndex(), - 0, - (short) 0, - (short) 0, - 0, - 0, - read.getFlags(), - 0, - read.getMateReferenceIndex(), - read.getMateAlignmentStart(), - read.getInferredInsertSize(), - null); - - emptyRead.setCigarString(""); - emptyRead.setReadBases(new byte[0]); - emptyRead.setBaseQualities(new byte[0]); - if ( read.isReducedRead() ) emptyRead.setReducedReadCounts(new 
int[0]); - - SAMReadGroupRecord samRG = read.getReadGroup(); - emptyRead.clearAttributes(); - if (samRG != null) { - GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG); - emptyRead.setReadGroup(rg); - } - - return emptyRead; - } - - /** - * Creates a new GATKSAMRecord with the source read's header, read group and mate - * information, but with the following fields set to user-supplied values: - * - Read Bases - * - Base Qualities - * - Base Insertion Qualities - * - Base Deletion Qualities - * - * Cigar string is empty (not-null) - * - * Use this method if you want to create a new GATKSAMRecord based on - * another GATKSAMRecord, but with modified bases and qualities - * - * @param read a read to copy the header from - * @param readBases an array containing the new bases you wish use in place of the originals - * @param baseQualities an array containing the new base qualities you wish use in place of the originals - * @param baseInsertionQualities an array containing the new base insertion qaulities - * @param baseDeletionQualities an array containing the new base deletion qualities - * @return a read with modified bases and qualities, safe for the GATK - */ - public static GATKSAMRecord createQualityModifiedRead(final GATKSAMRecord read, - final byte[] readBases, - final byte[] baseQualities, - final byte[] baseInsertionQualities, - final byte[] baseDeletionQualities) { - if ( baseQualities.length != readBases.length || baseInsertionQualities.length != readBases.length || baseDeletionQualities.length != readBases.length ) - throw new IllegalArgumentException("Read bases and read quality arrays aren't the same size: Bases:" + readBases.length - + " vs Base Q's:" + baseQualities.length - + " vs Insert Q's:" + baseInsertionQualities.length - + " vs Delete Q's:" + baseDeletionQualities.length); - - final GATKSAMRecord processedRead = GATKSAMRecord.emptyRead(read); - processedRead.setReadBases(readBases); - processedRead.setBaseQualities(baseQualities, 
EventType.BASE_SUBSTITUTION); - processedRead.setBaseQualities(baseInsertionQualities, EventType.BASE_INSERTION); - processedRead.setBaseQualities(baseDeletionQualities, EventType.BASE_DELETION); - - return processedRead; - } - - /** - * Shallow copy of everything, except for the attribute list and the temporary attributes. - * A new list of the attributes is created for both, but the attributes themselves are copied by reference. - * This should be safe because callers should never modify a mutable value returned by any of the get() methods anyway. - * - * @return a shallow copy of the GATKSAMRecord - * @throws CloneNotSupportedException - */ - @Override - public Object clone() throws CloneNotSupportedException { - final GATKSAMRecord clone = (GATKSAMRecord) super.clone(); - if (temporaryAttributes != null) { - clone.temporaryAttributes = new HashMap<>(); - for (Object attribute : temporaryAttributes.keySet()) - clone.setTemporaryAttribute(attribute, temporaryAttributes.get(attribute)); - } - return clone; - } - - /** - * A caching version of ReadUtils.getAdaptorBoundary() - * - * see #ReadUtils.getAdaptorBoundary(SAMRecord) for more information about the meaning of this function - * - * WARNING -- this function caches a value depending on the inferred insert size and alignment starts - * and stops of this read and its mate. Changing these values in any way will invalidate the cached value. - * However, we do not monitor those setter functions, so modifying a GATKSAMRecord in any way may - * result in stale cached values. 
- * - * @return the result of calling ReadUtils.getAdaptorBoundary on this read - */ - @Ensures("result == ReadUtils.getAdaptorBoundary(this)") - public int getAdaptorBoundary() { - if ( adapterBoundary == null ) - adapterBoundary = ReadUtils.getAdaptorBoundary(this); - return adapterBoundary; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java deleted file mode 100644 index 39f227840..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ /dev/null @@ -1,968 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils.sam; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.*; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.File; -import java.util.*; - -/** - * A miscellaneous collection of utilities for working with SAM files, headers, etc. - * Static methods only, please. - * - * @author mhanna - * @version 0.1 - */ -public class ReadUtils { - private final static Logger logger = Logger.getLogger(ReadUtils.class); - - private static final String OFFSET_OUT_OF_BOUNDS_EXCEPTION = "Offset cannot be greater than read length %d : %d"; - private static final String OFFSET_NOT_ZERO_EXCEPTION = "We ran past the end of the read and never found the offset, something went wrong!"; - - private ReadUtils() { - } - - private static final int DEFAULT_ADAPTOR_SIZE = 100; - public static final int CLIPPING_GOAL_NOT_REACHED = -1; - - public static int getMeanRepresentativeReadCount(GATKSAMRecord read) { - if (!read.isReducedRead()) - return 1; - - // compute mean representative read counts - final int[] counts = read.getReducedReadCounts(); - return (int)Math.round((double)MathUtils.sum(counts)/counts.length); - } - - /** - * A marker to tell which end of the read has been clipped - */ - public enum ClippingTail { - LEFT_TAIL, - RIGHT_TAIL - } - - /** - * A HashMap of the SAM spec read flag names - * - * Note: This is not being used right now, but can be useful in the future - */ - private static final Map readFlagNames = new HashMap(); - - static { - readFlagNames.put(0x1, "Paired"); - readFlagNames.put(0x2, "Proper"); - readFlagNames.put(0x4, "Unmapped"); - readFlagNames.put(0x8, 
"MateUnmapped"); - readFlagNames.put(0x10, "Forward"); - //readFlagNames.put(0x20, "MateForward"); - readFlagNames.put(0x40, "FirstOfPair"); - readFlagNames.put(0x80, "SecondOfPair"); - readFlagNames.put(0x100, "NotPrimary"); - readFlagNames.put(0x200, "NON-PF"); - readFlagNames.put(0x400, "Duplicate"); - } - - /** - * This enum represents all the different ways in which a read can overlap an interval. - * - * NO_OVERLAP_CONTIG: - * read and interval are in different contigs. - * - * NO_OVERLAP_LEFT: - * the read does not overlap the interval. - * - * |----------------| (interval) - * <----------------> (read) - * - * NO_OVERLAP_RIGHT: - * the read does not overlap the interval. - * - * |----------------| (interval) - * <----------------> (read) - * - * OVERLAP_LEFT: - * the read starts before the beginning of the interval but ends inside of it - * - * |----------------| (interval) - * <----------------> (read) - * - * OVERLAP_RIGHT: - * the read starts inside the interval but ends outside of it - * - * |----------------| (interval) - * <----------------> (read) - * - * OVERLAP_LEFT_AND_RIGHT: - * the read starts before the interval and ends after the interval - * - * |-----------| (interval) - * <-------------------> (read) - * - * OVERLAP_CONTAINED: - * the read starts and ends inside the interval - * - * |----------------| (interval) - * <--------> (read) - */ - public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED} - - /** - * Creates a SAMFileWriter with the given compression level if you request a bam file. Creates a regular - * SAMFileWriter without compression otherwise. - * - * @param header - * @param presorted - * @param file - * @param compression - * @return a SAMFileWriter with the compression level if it is a bam. 
- */ - public static SAMFileWriter createSAMFileWriterWithCompression(SAMFileHeader header, boolean presorted, String file, int compression) { - validateCompressionLevel(compression); - if (file.endsWith(".bam")) - return new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(header, presorted, new File(file), compression); - return new SAMFileWriterFactory().setCreateIndex(true).makeSAMOrBAMWriter(header, presorted, new File(file)); - } - - public static int validateCompressionLevel(final int requestedCompressionLevel) { - if ( requestedCompressionLevel < 0 || requestedCompressionLevel > 9 ) - throw new UserException.BadArgumentValue("compress", "Compression level must be 0-9 but got " + requestedCompressionLevel); - return requestedCompressionLevel; - } - - /** - * is this base inside the adaptor of the read? - * - * There are two cases to treat here: - * - * 1) Read is in the negative strand => Adaptor boundary is on the left tail - * 2) Read is in the positive strand => Adaptor boundary is on the right tail - * - * Note: We return false to all reads that are UNMAPPED or have an weird big insert size (probably due to mismapping or bigger event) - * - * @param read the read to test - * @param basePos base position in REFERENCE coordinates (not read coordinates) - * @return whether or not the base is in the adaptor - */ - public static boolean isBaseInsideAdaptor(final GATKSAMRecord read, long basePos) { - final int adaptorBoundary = read.getAdaptorBoundary(); - if (adaptorBoundary == CANNOT_COMPUTE_ADAPTOR_BOUNDARY || read.getInferredInsertSize() > DEFAULT_ADAPTOR_SIZE) - return false; - - return read.getReadNegativeStrandFlag() ? basePos <= adaptorBoundary : basePos >= adaptorBoundary; - } - - /** - * Finds the adaptor boundary around the read and returns the first base inside the adaptor that is closest to - * the read boundary. 
If the read is in the positive strand, this is the first base after the end of the - * fragment (Picard calls it 'insert'), if the read is in the negative strand, this is the first base before the - * beginning of the fragment. - * - * There are two cases we need to treat here: - * - * 1) Our read is in the reverse strand : - * - * <----------------------| * - * |---------------------> - * - * in these cases, the adaptor boundary is at the mate start (minus one) - * - * 2) Our read is in the forward strand : - * - * |----------------------> * - * <----------------------| - * - * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) - * - * @param read the read being tested for the adaptor boundary - * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. - * CANNOT_COMPUTE_ADAPTOR_BOUNDARY if the read is unmapped or the mate is mapped to another contig. - */ - public static int getAdaptorBoundary(final SAMRecord read) { - if ( ! hasWellDefinedFragmentSize(read) ) { - return CANNOT_COMPUTE_ADAPTOR_BOUNDARY; - } else if ( read.getReadNegativeStrandFlag() ) { - return read.getMateAlignmentStart() - 1; // case 1 (see header) - } else { - final int insertSize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value) - return read.getAlignmentStart() + insertSize + 1; // case 2 (see header) - } - } - - public static int CANNOT_COMPUTE_ADAPTOR_BOUNDARY = Integer.MIN_VALUE; - - /** - * Can the adaptor sequence of read be reliably removed from the read based on the alignment of - * read and its mate? 
- * - * @param read the read to check - * @return true if it can, false otherwise - */ - public static boolean hasWellDefinedFragmentSize(final SAMRecord read) { - if ( read.getInferredInsertSize() == 0 ) - // no adaptors in reads with mates in another chromosome or unmapped pairs - return false; - if ( ! read.getReadPairedFlag() ) - // only reads that are paired can be adaptor trimmed - return false; - if ( read.getReadUnmappedFlag() || read.getMateUnmappedFlag() ) - // only reads when both reads are mapped can be trimmed - return false; -// if ( ! read.getProperPairFlag() ) -// // note this flag isn't always set properly in BAMs, can will stop us from eliminating some proper pairs -// // reads that aren't part of a proper pair (i.e., have strange alignments) can't be trimmed -// return false; - if ( read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag() ) - // sanity check on getProperPairFlag to ensure that read1 and read2 aren't on the same strand - return false; - - if ( read.getReadNegativeStrandFlag() ) { - // we're on the negative strand, so our read runs right to left - return read.getAlignmentEnd() > read.getMateAlignmentStart(); - } else { - // we're on the positive strand, so our mate should be to our right (his start + insert size should be past our start) - return read.getAlignmentStart() <= read.getMateAlignmentStart() + read.getInferredInsertSize(); - } - } - - /** - * is the read a 454 read? - * - * @param read the read to test - * @return checks the read group tag PL for the default 454 tag - */ - public static boolean is454Read(GATKSAMRecord read) { - return NGSPlatform.fromRead(read) == NGSPlatform.LS454; - } - - /** - * is the read an IonTorrent read? - * - * @param read the read to test - * @return checks the read group tag PL for the default ion tag - */ - public static boolean isIonRead(GATKSAMRecord read) { - return NGSPlatform.fromRead(read) == NGSPlatform.ION_TORRENT; - } - - /** - * is the read a SOLiD read? 
- * - * @param read the read to test - * @return checks the read group tag PL for the default SOLiD tag - */ - public static boolean isSOLiDRead(GATKSAMRecord read) { - return NGSPlatform.fromRead(read) == NGSPlatform.SOLID; - } - - /** - * is the read a SLX read? - * - * @param read the read to test - * @return checks the read group tag PL for the default SLX tag - */ - public static boolean isIlluminaRead(GATKSAMRecord read) { - return NGSPlatform.fromRead(read) == NGSPlatform.ILLUMINA; - } - - /** - * checks if the read has a platform tag in the readgroup equal to 'name'. - * Assumes that 'name' is upper-cased. - * - * @param read the read to test - * @param name the upper-cased platform name to test - * @return whether or not name == PL tag in the read group of read - */ - public static boolean isPlatformRead(GATKSAMRecord read, String name) { - - SAMReadGroupRecord readGroup = read.getReadGroup(); - if (readGroup != null) { - Object readPlatformAttr = readGroup.getAttribute("PL"); - if (readPlatformAttr != null) - return readPlatformAttr.toString().toUpperCase().contains(name); - } - return false; - } - - - /** - * Returns the collections of reads sorted in coordinate order, according to the order defined - * in the reads themselves - * - * @param reads - * @return - */ - public final static List sortReadsByCoordinate(List reads) { - final SAMRecordComparator comparer = new SAMRecordCoordinateComparator(); - Collections.sort(reads, comparer); - return reads; - } - - /** - * If a read starts in INSERTION, returns the first element length. - * - * Warning: If the read has Hard or Soft clips before the insertion this function will return 0. - * - * @param read - * @return the length of the first insertion, or 0 if there is none (see warning). 
- */ - public final static int getFirstInsertionOffset(SAMRecord read) { - CigarElement e = read.getCigar().getCigarElement(0); - if ( e.getOperator() == CigarOperator.I ) - return e.getLength(); - else - return 0; - } - - /** - * If a read ends in INSERTION, returns the last element length. - * - * Warning: If the read has Hard or Soft clips after the insertion this function will return 0. - * - * @param read - * @return the length of the last insertion, or 0 if there is none (see warning). - */ - public final static int getLastInsertionOffset(SAMRecord read) { - CigarElement e = read.getCigar().getCigarElement(read.getCigarLength() - 1); - if ( e.getOperator() == CigarOperator.I ) - return e.getLength(); - else - return 0; - } - - /** - * Determines what is the position of the read in relation to the interval. - * Note: This function uses the UNCLIPPED ENDS of the reads for the comparison. - * @param read the read - * @param interval the interval - * @return the overlap type as described by ReadAndIntervalOverlap enum (see above) - */ - public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(GATKSAMRecord read, GenomeLoc interval) { - - int sStart = read.getSoftStart(); - int sStop = read.getSoftEnd(); - int uStart = read.getUnclippedStart(); - int uStop = read.getUnclippedEnd(); - - if ( !read.getReferenceName().equals(interval.getContig()) ) - return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG; - - else if ( uStop < interval.getStart() ) - return ReadAndIntervalOverlap.NO_OVERLAP_LEFT; - - else if ( uStart > interval.getStop() ) - return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT; - - else if ( sStop < interval.getStart() ) - return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT; - - else if ( sStart > interval.getStop() ) - return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT; - - else if ( (sStart >= interval.getStart()) && - (sStop <= interval.getStop()) ) - return ReadAndIntervalOverlap.OVERLAP_CONTAINED; - - else if ( (sStart < 
interval.getStart()) && - (sStop > interval.getStop()) ) - return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT; - - else if ( (sStart < interval.getStart()) ) - return ReadAndIntervalOverlap.OVERLAP_LEFT; - - else - return ReadAndIntervalOverlap.OVERLAP_RIGHT; - } - - /** - * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) to take care of - * two corner cases: - * - * 1. If clipping the right tail (end of the read) getReadCoordinateForReferenceCoordinate and fall inside - * a deletion return the base after the deletion. If clipping the left tail (beginning of the read) it - * doesn't matter because it already returns the previous base by default. - * - * 2. If clipping the left tail (beginning of the read) getReadCoordinateForReferenceCoordinate and the - * read starts with an insertion, and you're requesting the first read based coordinate, it will skip - * the leading insertion (because it has the same reference coordinate as the following base). - * - * @param read - * @param refCoord - * @param tail - * @return the read coordinate corresponding to the requested reference coordinate for clipping. 
- */ - @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd() || (read.getUnclippedEnd() < read.getUnclippedStart())"}) - @Ensures({"result >= 0", "result < read.getReadLength()"}) - public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord, ClippingTail tail) { - return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, tail, false); - } - - public static int getReadCoordinateForReferenceCoordinateUpToEndOfRead(GATKSAMRecord read, int refCoord, ClippingTail tail) { - final int leftmostSafeVariantPosition = Math.max(read.getSoftStart(), refCoord); - return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), leftmostSafeVariantPosition, tail, false); - } - - public static int getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final ClippingTail tail, final boolean allowGoalNotReached) { - Pair result = getReadCoordinateForReferenceCoordinate(alignmentStart, cigar, refCoord, allowGoalNotReached); - int readCoord = result.getFirst(); - - // Corner case one: clipping the right tail and falls on deletion, move to the next - // read coordinate. It is not a problem for the left tail because the default answer - // from getReadCoordinateForReferenceCoordinate is to give the previous read coordinate. - if (result.getSecond() && tail == ClippingTail.RIGHT_TAIL) - readCoord++; - - // clipping the left tail and first base is insertion, go to the next read coordinate - // with the same reference coordinate. Advance to the next cigar element, or to the - // end of the read if there is no next element. 
- final CigarElement firstElementIsInsertion = readStartsWithInsertion(cigar); - if (readCoord == 0 && tail == ClippingTail.LEFT_TAIL && firstElementIsInsertion != null) - readCoord = Math.min(firstElementIsInsertion.getLength(), cigar.getReadLength() - 1); - - return readCoord; - } - - /** - * Returns the read coordinate corresponding to the requested reference coordinate. - * - * WARNING: if the requested reference coordinate happens to fall inside a deletion in the read, this function - * will return the last read base before the deletion. This function returns a - * Pair(int readCoord, boolean fallsInsideDeletion) so you can choose which readCoordinate to use when faced with - * a deletion. - * - * SUGGESTION: Use getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int, ClippingTail) instead to get a - * pre-processed result according to normal clipping needs. Or you can use this function and tailor the - * behavior to your needs. - * - * @param read - * @param refCoord - * @return the read coordinate corresponding to the requested reference coordinate. (see warning!) 
- */ - @Requires({"refCoord >= read.getSoftStart()", "refCoord <= read.getSoftEnd()"}) - @Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"}) - public static Pair getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord) { - return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, false); - } - - public static Pair getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final boolean allowGoalNotReached) { - int readBases = 0; - int refBases = 0; - boolean fallsInsideDeletion = false; - - int goal = refCoord - alignmentStart; // The goal is to move this many reference bases - if (goal < 0) { - if (allowGoalNotReached) { - return new Pair(CLIPPING_GOAL_NOT_REACHED, false); - } else { - throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); - } - } - boolean goalReached = refBases == goal; - - Iterator cigarElementIterator = cigar.getCigarElements().iterator(); - while (!goalReached && cigarElementIterator.hasNext()) { - CigarElement cigarElement = cigarElementIterator.next(); - int shift = 0; - - if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { - if (refBases + cigarElement.getLength() < goal) - shift = cigarElement.getLength(); - else - shift = goal - refBases; - - refBases += shift; - } - goalReached = refBases == goal; - - if (!goalReached && cigarElement.getOperator().consumesReadBases()) - readBases += cigarElement.getLength(); - - if (goalReached) { - // Is this base's reference position within this cigar element? Or did we use it all? - boolean endsWithinCigar = shift < cigarElement.getLength(); - - // If it isn't, we need to check the next one. There should *ALWAYS* be a next one - // since we checked if the goal coordinate is within the read length, so this is just a sanity check. 
- if (!endsWithinCigar && !cigarElementIterator.hasNext()) { - if (allowGoalNotReached) { - return new Pair(CLIPPING_GOAL_NOT_REACHED, false); - } else { - throw new ReviewedStingException(String.format("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- check read with alignment start: %s and cigar: %s", alignmentStart, cigar)); - } - } - - CigarElement nextCigarElement; - - // if we end inside the current cigar element, we just have to check if it is a deletion - if (endsWithinCigar) - fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION; - - // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. - else { - nextCigarElement = cigarElementIterator.next(); - - // if it's an insertion, we need to clip the whole insertion before looking at the next element - if (nextCigarElement.getOperator() == CigarOperator.INSERTION) { - readBases += nextCigarElement.getLength(); - if (!cigarElementIterator.hasNext()) { - if (allowGoalNotReached) { - return new Pair(CLIPPING_GOAL_NOT_REACHED, false); - } else { - throw new ReviewedStingException(String.format("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- check read with alignment start: %s and cigar: %s", alignmentStart, cigar)); - } - } - - nextCigarElement = cigarElementIterator.next(); - } - - // if it's a deletion, we will pass the information on to be handled downstream. 
- fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION; - } - - // If we reached our goal outside a deletion, add the shift - if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases()) - readBases += shift; - - // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need - // to add the shift of the current cigar element but go back to it's last element to return the last - // base before the deletion (see warning in function contracts) - else if (fallsInsideDeletion && !endsWithinCigar && cigarElement.getOperator().consumesReadBases()) - readBases += shift - 1; - - // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion - else if (fallsInsideDeletion && endsWithinCigar) - readBases--; - } - } - - if (!goalReached) { - if (allowGoalNotReached) { - return new Pair(CLIPPING_GOAL_NOT_REACHED, false); - } else { - throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Alignment " + alignmentStart + " | " + cigar); - } - } - - return new Pair(readBases, fallsInsideDeletion); - } - - /** - * Compares two SAMRecords only the basis on alignment start. Note that - * comparisons are performed ONLY on the basis of alignment start; any - * two SAM records with the same alignment start will be considered equal. - * - * Unmapped alignments will all be considered equal. - */ - - @Requires({"read1 != null", "read2 != null"}) - public static int compareSAMRecords(GATKSAMRecord read1, GATKSAMRecord read2) { - AlignmentStartComparator comp = new AlignmentStartComparator(); - return comp.compare(read1, read2); - } - - /** - * Is a base inside a read? - * - * @param read the read to evaluate - * @param referenceCoordinate the reference coordinate of the base to test - * @return true if it is inside the read, false otherwise. 
- */ - public static boolean isInsideRead(final GATKSAMRecord read, final int referenceCoordinate) { - return referenceCoordinate >= read.getAlignmentStart() && referenceCoordinate <= read.getAlignmentEnd(); - } - - /** - * Is this read all insertion? - * - * @param read - * @return whether or not the only element in the cigar string is an Insertion - */ - public static boolean readIsEntirelyInsertion(GATKSAMRecord read) { - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - if (cigarElement.getOperator() != CigarOperator.INSERTION) - return false; - } - return true; - } - - /** - * @see #readStartsWithInsertion(net.sf.samtools.Cigar, boolean) with ignoreClipOps set to true - */ - public static CigarElement readStartsWithInsertion(final Cigar cigarForRead) { - return readStartsWithInsertion(cigarForRead, true); - } - - /** - * Checks if a read starts with an insertion. - * - * @param cigarForRead the CIGAR to evaluate - * @param ignoreSoftClipOps should we ignore S operators when evaluating whether an I operator is at the beginning? Note that H operators are always ignored. - * @return the element if it's a leading insertion or null otherwise - */ - public static CigarElement readStartsWithInsertion(final Cigar cigarForRead, final boolean ignoreSoftClipOps) { - for ( final CigarElement cigarElement : cigarForRead.getCigarElements() ) { - if ( cigarElement.getOperator() == CigarOperator.INSERTION ) - return cigarElement; - - else if ( cigarElement.getOperator() != CigarOperator.HARD_CLIP && ( !ignoreSoftClipOps || cigarElement.getOperator() != CigarOperator.SOFT_CLIP) ) - break; - } - return null; - } - - /** - * Returns the coverage distribution of a list of reads within the desired region. - * - * See getCoverageDistributionOfRead for information on how the coverage is calculated. 
- * - * @param list the list of reads covering the region - * @param startLocation the first reference coordinate of the region (inclusive) - * @param stopLocation the last reference coordinate of the region (inclusive) - * @return an array with the coverage of each position from startLocation to stopLocation - */ - public static int [] getCoverageDistributionOfReads(List list, int startLocation, int stopLocation) { - int [] totalCoverage = new int[stopLocation - startLocation + 1]; - - for (GATKSAMRecord read : list) { - int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); - totalCoverage = MathUtils.addArrays(totalCoverage, readCoverage); - } - - return totalCoverage; - } - - /** - * Returns the coverage distribution of a single read within the desired region. - * - * Note: This function counts DELETIONS as coverage (since the main purpose is to downsample - * reads for variant regions, and deletions count as variants) - * - * @param read the read to get the coverage distribution of - * @param startLocation the first reference coordinate of the region (inclusive) - * @param stopLocation the last reference coordinate of the region (inclusive) - * @return an array with the coverage of each position from startLocation to stopLocation - */ - public static int [] getCoverageDistributionOfRead(GATKSAMRecord read, int startLocation, int stopLocation) { - int [] coverage = new int[stopLocation - startLocation + 1]; - int refLocation = read.getSoftStart(); - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - switch (cigarElement.getOperator()) { - case S: - case M: - case EQ: - case N: - case X: - case D: - for (int i = 0; i < cigarElement.getLength(); i++) { - if (refLocation >= startLocation && refLocation <= stopLocation) { - int baseCount = read.isReducedRead() ? 
read.getReducedCount(refLocation - read.getSoftStart()) : 1; - coverage[refLocation - startLocation] += baseCount; // this may be a reduced read, so add the proper number of bases - } - refLocation++; - } - break; - - case P: - case I: - case H: - break; - } - - if (refLocation > stopLocation) - break; - } - return coverage; - } - - /** - * Makes association maps for the reads and loci coverage as described below : - * - * - First: locusToReadMap -- a HashMap that describes for each locus, which reads contribute to its coverage. - * Note: Locus is in reference coordinates. - * Example: Locus => {read1, read2, ..., readN} - * - * - Second: readToLocusMap -- a HashMap that describes for each read what loci it contributes to the coverage. - * Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with value==true meaning it contributes to the coverage. - * Example: Read => {true, true, false, ... false} - * - * @param readList the list of reads to generate the association mappings - * @param startLocation the first reference coordinate of the region (inclusive) - * @param stopLocation the last reference coordinate of the region (inclusive) - * @return the two hashmaps described above - */ - public static Pair> , HashMap> getBothReadToLociMappings (List readList, int startLocation, int stopLocation) { - int arraySize = stopLocation - startLocation + 1; - - HashMap> locusToReadMap = new HashMap>(2*(stopLocation - startLocation + 1), 0.5f); - HashMap readToLocusMap = new HashMap(2*readList.size(), 0.5f); - - for (int i = startLocation; i <= stopLocation; i++) - locusToReadMap.put(i, new HashSet()); // Initialize the locusToRead map with empty lists - - for (GATKSAMRecord read : readList) { - readToLocusMap.put(read, new Boolean[arraySize]); // Initialize the readToLocus map with empty arrays - - int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); - - for (int i = 0; i < readCoverage.length; i++) { - int 
refLocation = i + startLocation; - if (readCoverage[i] > 0) { - // Update the hash for this locus - HashSet readSet = locusToReadMap.get(refLocation); - readSet.add(read); - - // Add this locus to the read hash - readToLocusMap.get(read)[refLocation - startLocation] = true; - } - else - // Update the boolean array with a 'no coverage' from this read to this locus - readToLocusMap.get(read)[refLocation-startLocation] = false; - } - } - return new Pair>, HashMap>(locusToReadMap, readToLocusMap); - } - - /** - * Create random read qualities - * - * @param length the length of the read - * @return an array with randomized base qualities between 0 and 50 - */ - public static byte[] createRandomReadQuals(int length) { - Random random = GenomeAnalysisEngine.getRandomGenerator(); - byte[] quals = new byte[length]; - for (int i = 0; i < length; i++) - quals[i] = (byte) random.nextInt(50); - return quals; - } - - /** - * Create random read qualities - * - * @param length the length of the read - * @param allowNs whether or not to allow N's in the read - * @return an array with randomized bases (A-N) with equal probability - */ - public static byte[] createRandomReadBases(int length, boolean allowNs) { - Random random = GenomeAnalysisEngine.getRandomGenerator(); - int numberOfBases = allowNs ? 
5 : 4; - byte[] bases = new byte[length]; - for (int i = 0; i < length; i++) { - switch (random.nextInt(numberOfBases)) { - case 0: - bases[i] = 'A'; - break; - case 1: - bases[i] = 'C'; - break; - case 2: - bases[i] = 'G'; - break; - case 3: - bases[i] = 'T'; - break; - case 4: - bases[i] = 'N'; - break; - default: - throw new ReviewedStingException("Something went wrong, this is just impossible"); - } - } - return bases; - } - - public static GATKSAMRecord createRandomRead(int length) { - return createRandomRead(length, true); - } - - public static GATKSAMRecord createRandomRead(int length, boolean allowNs) { - byte[] quals = ReadUtils.createRandomReadQuals(length); - byte[] bbases = ReadUtils.createRandomReadBases(length, allowNs); - return ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M"); - } - - - public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) { - String[] sequenceRecordNames = new String[sequenceDictionary.size()]; - int sequenceRecordIndex = 0; - for (SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences()) - sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName(); - return Arrays.deepToString(sequenceRecordNames); - } - - /** - * Calculates the reference coordinate for a read coordinate - * - * @param read the read - * @param offset the base in the read (coordinate in the read) - * @return the reference coordinate correspondent to this base - */ - public static long getReferenceCoordinateForReadCoordinate(GATKSAMRecord read, int offset) { - if (offset > read.getReadLength()) - throw new ReviewedStingException(String.format(OFFSET_OUT_OF_BOUNDS_EXCEPTION, offset, read.getReadLength())); - - long location = read.getAlignmentStart(); - Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); - while (offset > 0 && cigarElementIterator.hasNext()) { - CigarElement cigarElement = cigarElementIterator.next(); - long move = 0; - if 
(cigarElement.getOperator().consumesReferenceBases()) - move = (long) Math.min(cigarElement.getLength(), offset); - location += move; - offset -= move; - } - if (offset > 0 && !cigarElementIterator.hasNext()) - throw new ReviewedStingException(OFFSET_NOT_ZERO_EXCEPTION); - - return location; - } - - /** - * Creates a map with each event in the read (cigar operator) and the read coordinate where it happened. - * - * Example: - * D -> 2, 34, 75 - * I -> 55 - * S -> 0, 101 - * H -> 101 - * - * @param read the read - * @return a map with the properties described above. See example - */ - public static Map> getCigarOperatorForAllBases (GATKSAMRecord read) { - Map> events = new HashMap>(); - - int position = 0; - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - CigarOperator op = cigarElement.getOperator(); - if (op.consumesReadBases()) { - ArrayList list = events.get(op); - if (list == null) { - list = new ArrayList(); - events.put(op, list); - } - for (int i = position; i < cigarElement.getLength(); i++) - list.add(position++); - } - else { - ArrayList list = events.get(op); - if (list == null) { - list = new ArrayList(); - events.put(op, list); - } - list.add(position); - } - } - return events; - } - - /** - * Given a read, outputs the read bases in a string format - * - * @param read the read - * @return a string representation of the read bases - */ - public static String convertReadBasesToString(GATKSAMRecord read) { - String bases = ""; - for (byte b : read.getReadBases()) { - bases += (char) b; - } - return bases.toUpperCase(); - } - - /** - * Given a read, outputs the base qualities in a string format - * - * @param quals the read qualities - * @return a string representation of the base qualities - */ - public static String convertReadQualToString(byte[] quals) { - String result = ""; - for (byte b : quals) { - result += (char) (33 + b); - } - return result; - } - - /** - * Given a read, outputs the base qualities in a string format - * 
- * @param read the read - * @return a string representation of the base qualities - */ - public static String convertReadQualToString(GATKSAMRecord read) { - return convertReadQualToString(read.getBaseQualities()); - } - - /** - * Returns the reverse complement of the read bases - * - * @param bases the read bases - * @return the reverse complement of the read bases - */ - public static String getBasesReverseComplement(byte[] bases) { - String reverse = ""; - for (int i = bases.length-1; i >=0; i--) { - reverse += (char) BaseUtils.getComplement(bases[i]); - } - return reverse; - } - - /** - * Returns the reverse complement of the read bases - * - * @param read the read - * @return the reverse complement of the read bases - */ - public static String getBasesReverseComplement(GATKSAMRecord read) { - return getBasesReverseComplement(read.getReadBases()); - } - - /** - * Calculate the maximum read length from the given list of reads. - * @param reads list of reads - * @return non-negative integer - */ - @Ensures({"result >= 0"}) - public static int getMaxReadLength( final List reads ) { - if( reads == null ) { throw new IllegalArgumentException("Attempting to check a null list of reads."); } - - int maxReadLength = 0; - for( final GATKSAMRecord read : reads ) { - maxReadLength = Math.max(maxReadLength, read.getReadLength()); - } - return maxReadLength; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java deleted file mode 100644 index 1ae34e268..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ /dev/null @@ -1,1835 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including 
without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.variant; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.commons.lang.ArrayUtils; -import org.apache.log4j.Logger; -import org.broad.tribble.TribbleException; -import org.broad.tribble.util.popgen.HardyWeinbergCalculation; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFConstants; - -import java.io.Serializable; -import java.util.*; - -public class GATKVariantContextUtils { - - private static Logger logger = Logger.getLogger(GATKVariantContextUtils.class); - - public static final int DEFAULT_PLOIDY = 2; - public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. 
- - public final static List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); - public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF"; - public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site - - public final static String MERGE_FILTER_PREFIX = "filterIn"; - public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; - public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; - public final static String MERGE_INTERSECTION = "Intersection"; - - public enum GenotypeMergeType { - /** - * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD. - */ - UNIQUIFY, - /** - * Take genotypes in priority order (see the priority argument). - */ - PRIORITIZE, - /** - * Take the genotypes in any order. - */ - UNSORTED, - /** - * Require that all samples/genotypes be unique between all inputs. - */ - REQUIRE_UNIQUE - } - - public enum FilteredRecordMergeType { - /** - * Union - leaves the record if any record is unfiltered. - */ - KEEP_IF_ANY_UNFILTERED, - /** - * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. - */ - KEEP_IF_ALL_UNFILTERED, - /** - * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset. - */ - KEEP_UNCONDITIONAL - } - - public enum MultipleAllelesMergeType { - /** - * Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record. - */ - BY_TYPE, - /** - * Merge all allele types at the same start position into the same VCF record. 
- */ - MIX_TYPES - } - - /** - * Refactored out of the AverageAltAlleleLength annotation class - * @param vc the variant context - * @return the average length of the alt allele (a double) - */ - public static double getMeanAltAlleleLength(VariantContext vc) { - double averageLength = 1.0; - if ( ! vc.isSNP() && ! vc.isSymbolic() ) { - // adjust for the event length - int averageLengthNum = 0; - int averageLengthDenom = 0; - int refLength = vc.getReference().length(); - for ( final Allele a : vc.getAlternateAlleles() ) { - int numAllele = vc.getCalledChrCount(a); - int alleleSize; - if ( a.length() == refLength ) { - // SNP or MNP - byte[] a_bases = a.getBases(); - byte[] ref_bases = vc.getReference().getBases(); - int n_mismatch = 0; - for ( int idx = 0; idx < a_bases.length; idx++ ) { - if ( a_bases[idx] != ref_bases[idx] ) - n_mismatch++; - } - alleleSize = n_mismatch; - } - else if ( a.isSymbolic() ) { - alleleSize = 1; - } else { - alleleSize = Math.abs(refLength-a.length()); - } - averageLengthNum += alleleSize*numAllele; - averageLengthDenom += numAllele; - } - averageLength = ( (double) averageLengthNum )/averageLengthDenom; - } - - return averageLength; - } - - /** - * create a genome location, given a variant context - * @param genomeLocParser parser - * @param vc the variant context - * @return the genomeLoc - */ - public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) { - return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true); - } - - public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(VariantContext context) { - if (!context.isSNP() || !context.isBiallelic()) - throw new IllegalStateException("Requested SNP substitution type for bialleic non-SNP " + context); - return BaseUtils.SNPSubstitutionType(context.getReference().getBases()[0], context.getAlternateAllele(0).getBases()[0]); - } - - /** - * If this is a BiAllelic SNP, is it a transition? 
- */ - public static boolean isTransition(VariantContext context) { - return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION; - } - - /** - * If this is a BiAllelic SNP, is it a transversion? - */ - public static boolean isTransversion(VariantContext context) { - return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; - } - - public static boolean isTransition(Allele ref, Allele alt) { - return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSITION; - } - - public static boolean isTransversion(Allele ref, Allele alt) { - return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSVERSION; - } - - /** - * Returns a context identical to this with the REF and ALT alleles reverse complemented. - * - * @param vc variant context - * @return new vc - */ - public static VariantContext reverseComplement(VariantContext vc) { - // create a mapping from original allele to reverse complemented allele - HashMap alleleMap = new HashMap<>(vc.getAlleles().size()); - for ( final Allele originalAllele : vc.getAlleles() ) { - Allele newAllele; - if ( originalAllele.isNoCall() ) - newAllele = originalAllele; - else - newAllele = Allele.create(BaseUtils.simpleReverseComplement(originalAllele.getBases()), originalAllele.isReference()); - alleleMap.put(originalAllele, newAllele); - } - - // create new Genotype objects - GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); - for ( final Genotype genotype : vc.getGenotypes() ) { - List newAlleles = new ArrayList<>(); - for ( final Allele allele : genotype.getAlleles() ) { - Allele newAllele = alleleMap.get(allele); - if ( newAllele == null ) - newAllele = Allele.NO_CALL; - newAlleles.add(newAllele); - } - newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make()); - } - - return new 
VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); - } - - /** - * Returns true iff VC is an non-complex indel where every allele represents an expansion or - * contraction of a series of identical bases in the reference. - * - * For example, suppose the ref bases are CTCTCTGA, which includes a 3x repeat of CTCTCT - * - * If VC = -/CT, then this function returns true because the CT insertion matches exactly the - * upcoming reference. - * If VC = -/CTA then this function returns false because the CTA isn't a perfect match - * - * Now consider deletions: - * - * If VC = CT/- then again the same logic applies and this returns true - * The case of CTA/- makes no sense because it doesn't actually match the reference bases. - * - * The logic of this function is pretty simple. Take all of the non-null alleles in VC. For - * each insertion allele of n bases, check if that allele matches the next n reference bases. - * For each deletion allele of n bases, check if this matches the reference bases at n - 2 n, - * as it must necessarily match the first n bases. If this test returns true for all - * alleles you are a tandem repeat, otherwise you are not. - * - * @param vc - * @param refBasesStartingAtVCWithPad not this is assumed to include the PADDED reference - * @return - */ - @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) - public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { - final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); - if ( ! vc.isIndel() ) // only indels are tandem repeats - return false; - - final Allele ref = vc.getReference(); - - for ( final Allele allele : vc.getAlternateAlleles() ) { - if ( ! 
isRepeatAllele(ref, allele, refBasesStartingAtVCWithoutPad) ) - return false; - } - - // we've passed all of the tests, so we are a repeat - return true; - } - - /** - * - * @param vc - * @param refBasesStartingAtVCWithPad - * @return - */ - @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) - public static Pair,byte[]> getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { - final boolean VERBOSE = false; - final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); - if ( ! vc.isIndel() ) // only indels are tandem repeats - return null; - - final Allele refAllele = vc.getReference(); - final byte[] refAlleleBases = Arrays.copyOfRange(refAllele.getBases(), 1, refAllele.length()); - - byte[] repeatUnit = null; - final ArrayList lengths = new ArrayList<>(); - - for ( final Allele allele : vc.getAlternateAlleles() ) { - Pair result = getNumTandemRepeatUnits(refAlleleBases, Arrays.copyOfRange(allele.getBases(), 1, allele.length()), refBasesStartingAtVCWithoutPad.getBytes()); - - final int[] repetitionCount = result.first; - // repetition count = 0 means allele is not a tandem expansion of context - if (repetitionCount[0] == 0 || repetitionCount[1] == 0) - return null; - - if (lengths.size() == 0) { - lengths.add(repetitionCount[0]); // add ref allele length only once - } - lengths.add(repetitionCount[1]); // add this alt allele's length - - repeatUnit = result.second; - if (VERBOSE) { - System.out.println("RefContext:"+refBasesStartingAtVCWithoutPad); - System.out.println("Ref:"+refAllele.toString()+" Count:" + String.valueOf(repetitionCount[0])); - System.out.println("Allele:"+allele.toString()+" Count:" + String.valueOf(repetitionCount[1])); - System.out.println("RU:"+new String(repeatUnit)); - } - } - - return new Pair, byte[]>(lengths,repeatUnit); - } - - public static Pair getNumTandemRepeatUnits(final byte[] refBases, final byte[] 
altBases, final byte[] remainingRefContext) { - /* we can't exactly apply same logic as in basesAreRepeated() to compute tandem unit and number of repeated units. - Consider case where ref =ATATAT and we have an insertion of ATAT. Natural description is (AT)3 -> (AT)2. - */ - - byte[] longB; - // find first repeat unit based on either ref or alt, whichever is longer - if (altBases.length > refBases.length) - longB = altBases; - else - longB = refBases; - - // see if non-null allele (either ref or alt, whichever is longer) can be decomposed into several identical tandem units - // for example, -*,CACA needs to first be decomposed into (CA)2 - final int repeatUnitLength = findRepeatedSubstring(longB); - final byte[] repeatUnit = Arrays.copyOf(longB, repeatUnitLength); - - final int[] repetitionCount = new int[2]; - // look for repetitions forward on the ref bases (i.e. starting at beginning of ref bases) - int repetitionsInRef = findNumberofRepetitions(repeatUnit,refBases, true); - repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef; - repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef; - - return new Pair<>(repetitionCount, repeatUnit); - - } - - /** - * Find out if a string can be represented as a tandem number of substrings. - * For example ACTACT is a 2-tandem of ACT, - * but ACTACA is not. 
- * - * @param bases String to be tested - * @return Length of repeat unit, if string can be represented as tandem of substring (if it can't - * be represented as one, it will be just the length of the input string) - */ - public static int findRepeatedSubstring(byte[] bases) { - - int repLength; - for (repLength=1; repLength <=bases.length; repLength++) { - final byte[] candidateRepeatUnit = Arrays.copyOf(bases,repLength); - boolean allBasesMatch = true; - for (int start = repLength; start < bases.length; start += repLength ) { - // check that remaining of string is exactly equal to repeat unit - final byte[] basePiece = Arrays.copyOfRange(bases,start,start+candidateRepeatUnit.length); - if (!Arrays.equals(candidateRepeatUnit, basePiece)) { - allBasesMatch = false; - break; - } - } - if (allBasesMatch) - return repLength; - } - - return repLength; - } - - /** - * Helper routine that finds number of repetitions a string consists of. - * For example, for string ATAT and repeat unit AT, number of repetitions = 2 - * @param repeatUnit Substring - * @param testString String to test - * @oaram lookForward Look for repetitions forward (at beginning of string) or backward (at end of string) - * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's - */ - public static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString, boolean lookForward) { - int numRepeats = 0; - if (lookForward) { - // look forward on the test string - for (int start = 0; start < testString.length; start += repeatUnit.length) { - int end = start + repeatUnit.length; - byte[] unit = Arrays.copyOfRange(testString,start, end); - if(Arrays.equals(unit,repeatUnit)) - numRepeats++; - else - break; - } - return numRepeats; - } - - // look backward. 
For example, if repeatUnit = AT and testString = GATAT, number of repeat units is still 2 - // look forward on the test string - for (int start = testString.length - repeatUnit.length; start >= 0; start -= repeatUnit.length) { - int end = start + repeatUnit.length; - byte[] unit = Arrays.copyOfRange(testString,start, end); - if(Arrays.equals(unit,repeatUnit)) - numRepeats++; - else - break; - } - return numRepeats; - } - - /** - * Helper function for isTandemRepeat that checks that allele matches somewhere on the reference - * @param ref - * @param alt - * @param refBasesStartingAtVCWithoutPad - * @return - */ - protected static boolean isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad) { - if ( ! Allele.oneIsPrefixOfOther(ref, alt) ) - return false; // we require one allele be a prefix of another - - if ( ref.length() > alt.length() ) { // we are a deletion - return basesAreRepeated(ref.getBaseString(), alt.getBaseString(), refBasesStartingAtVCWithoutPad, 2); - } else { // we are an insertion - return basesAreRepeated(alt.getBaseString(), ref.getBaseString(), refBasesStartingAtVCWithoutPad, 1); - } - } - - protected static boolean basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches) { - final String potentialRepeat = l.substring(s.length()); // skip s bases - - for ( int i = 0; i < minNumberOfMatches; i++) { - final int start = i * potentialRepeat.length(); - final int end = (i+1) * potentialRepeat.length(); - if ( ref.length() < end ) - return false; // we ran out of bases to test - final String refSub = ref.substring(start, end); - if ( ! 
refSub.equals(potentialRepeat) ) - return false; // repeat didn't match, fail - } - - return true; // we passed all tests, we matched - } - - public enum GenotypeAssignmentMethod { - /** - * set all of the genotype GT values to NO_CALL - */ - SET_TO_NO_CALL, - - /** - * Use the subsetted PLs to greedily assigned genotypes - */ - USE_PLS_TO_ASSIGN, - - /** - * Try to match the original GT calls, if at all possible - * - * Suppose I have 3 alleles: A/B/C and the following samples: - * - * original_GT best_match to A/B best_match to A/C - * S1 => A/A A/A A/A - * S2 => A/B A/B A/A - * S3 => B/B B/B A/A - * S4 => B/C A/B A/C - * S5 => C/C A/A C/C - * - * Basically, all alleles not in the subset map to ref. It means that het-alt genotypes - * when split into 2 bi-allelic variants will be het in each, which is good in some cases, - * rather than the undetermined behavior when using the PLs to assign, which could result - * in hom-var or hom-ref for each, depending on the exact PL values. - */ - BEST_MATCH_TO_ORIGINAL, - - /** - * do not even bother changing the GTs - */ - DO_NOT_ASSIGN_GENOTYPES - } - - /** - * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) - * - * @param vc variant context with genotype likelihoods - * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** - * @param assignGenotypes assignment strategy for the (subsetted) PLs - * @return a new non-null GenotypesContext - */ - public static GenotypesContext subsetDiploidAlleles(final VariantContext vc, - final List allelesToUse, - final GenotypeAssignmentMethod assignGenotypes) { - if ( allelesToUse.get(0).isNonReference() ) throw new IllegalArgumentException("First allele must be the reference allele"); - if ( allelesToUse.size() == 1 ) throw new IllegalArgumentException("Cannot subset to only 1 alt allele"); - - // optimization: if no input genotypes, just exit - if 
(vc.getGenotypes().isEmpty()) return GenotypesContext.create(); - - // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final List likelihoodIndexesToUse = determineLikelihoodIndexesToUse(vc, allelesToUse); - - // create the new genotypes - return createGenotypesWithSubsettedLikelihoods(vc.getGenotypes(), vc, allelesToUse, likelihoodIndexesToUse, assignGenotypes); - } - - /** - * Figure out which likelihood indexes to use for a selected down set of alleles - * - * @param originalVC the original VariantContext - * @param allelesToUse the subset of alleles to use - * @return a list of PL indexes to use or null if none - */ - private static List determineLikelihoodIndexesToUse(final VariantContext originalVC, final List allelesToUse) { - - // the bitset representing the allele indexes we want to keep - final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); - - // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, - // then we can keep the PLs as is; otherwise, we determine which ones to keep - if ( MathUtils.countOccurrences(true, alleleIndexesToUse) == alleleIndexesToUse.length ) - return null; - - return getLikelihoodIndexes(originalVC, alleleIndexesToUse); - } - - /** - * Get the actual likelihoods indexes to use given the corresponding allele indexes - * - * @param originalVC the original VariantContext - * @param alleleIndexesToUse the bitset representing the alleles to use (@see #getAlleleIndexBitset) - * @return a non-null List - */ - private static List getLikelihoodIndexes(final VariantContext originalVC, final boolean[] alleleIndexesToUse) { - - final List result = new ArrayList<>(30); - - // numLikelihoods takes total # of alleles. 
Use default # of chromosomes (ploidy) = 2 - final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(originalVC.getNAlleles(), DEFAULT_PLOIDY); - - for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { - final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - // consider this entry only if both of the alleles are good - if ( alleleIndexesToUse[alleles.alleleIndex1] && alleleIndexesToUse[alleles.alleleIndex2] ) - result.add(PLindex); - } - - return result; - } - - /** - * Given an original VariantContext and a list of alleles from that VC to keep, - * returns a bitset representing which allele indexes should be kept - * - * @param originalVC the original VC - * @param allelesToKeep the list of alleles to keep - * @return non-null bitset - */ - private static boolean[] getAlleleIndexBitset(final VariantContext originalVC, final List allelesToKeep) { - final int numOriginalAltAlleles = originalVC.getNAlleles() - 1; - final boolean[] alleleIndexesToKeep = new boolean[numOriginalAltAlleles + 1]; - - // the reference Allele is definitely still used - alleleIndexesToKeep[0] = true; - for ( int i = 0; i < numOriginalAltAlleles; i++ ) { - if ( allelesToKeep.contains(originalVC.getAlternateAllele(i)) ) - alleleIndexesToKeep[i+1] = true; - } - - return alleleIndexesToKeep; - } - - /** - * Create the new GenotypesContext with the subsetted PLs - * - * @param originalGs the original GenotypesContext - * @param vc the original VariantContext - * @param allelesToUse the actual alleles to use with the new Genotypes - * @param likelihoodIndexesToUse the indexes in the PL to use given the allelesToUse (@see #determineLikelihoodIndexesToUse()) - * @param assignGenotypes assignment strategy for the (subsetted) PLs - * @return a new non-null GenotypesContext - */ - private static GenotypesContext createGenotypesWithSubsettedLikelihoods(final GenotypesContext originalGs, - final VariantContext vc, - final List 
allelesToUse, - final List likelihoodIndexesToUse, - final GenotypeAssignmentMethod assignGenotypes) { - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); - - // make sure we are seeing the expected number of likelihoods per sample - final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), 2); - - // the samples - final List sampleIndices = originalGs.getSampleNamesOrderedByName(); - - // create the new genotypes - for ( int k = 0; k < originalGs.size(); k++ ) { - final Genotype g = originalGs.get(sampleIndices.get(k)); - final GenotypeBuilder gb = new GenotypeBuilder(g); - - // create the new likelihoods array from the alleles we are allowed to use - double[] newLikelihoods; - if ( !g.hasLikelihoods() ) { - // we don't have any likelihoods, so we null out PLs and make G ./. - newLikelihoods = null; - gb.noPL(); - } else { - final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); - if ( likelihoodIndexesToUse == null ) { - newLikelihoods = originalLikelihoods; - } else if ( originalLikelihoods.length != expectedNumLikelihoods ) { - logger.warn("Wrong number of likelihoods in sample " + g.getSampleName() + " at " + vc + " got " + g.getLikelihoodsString() + " but expected " + expectedNumLikelihoods); - newLikelihoods = null; - } else { - newLikelihoods = new double[likelihoodIndexesToUse.size()]; - int newIndex = 0; - for ( final int oldIndex : likelihoodIndexesToUse ) - newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; - - // might need to re-normalize - newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); - } - - if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) - gb.noPL(); - else - gb.PL(newLikelihoods); - } - - updateGenotypeAfterSubsetting(g.getAlleles(), gb, assignGenotypes, newLikelihoods, allelesToUse); - newGTs.add(gb.make()); - } - - return newGTs; - } - - private static boolean 
likelihoodsAreUninformative(final double[] likelihoods) { - return MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL; - } - - /** - * Add the genotype call (GT) field to GenotypeBuilder using the requested algorithm assignmentMethod - * - * @param originalGT the original genotype calls, cannot be null - * @param gb the builder where we should put our newly called alleles, cannot be null - * @param assignmentMethod the method to use to do the assignment, cannot be null - * @param newLikelihoods a vector of likelihoods to use if the method requires PLs, should be log10 likelihoods, cannot be null - * @param allelesToUse the alleles we are using for our subsetting - */ - public static void updateGenotypeAfterSubsetting(final List originalGT, - final GenotypeBuilder gb, - final GenotypeAssignmentMethod assignmentMethod, - final double[] newLikelihoods, - final List allelesToUse) { - switch ( assignmentMethod ) { - case DO_NOT_ASSIGN_GENOTYPES: - break; - case SET_TO_NO_CALL: - gb.alleles(NO_CALL_ALLELES); - gb.noAD(); - gb.noGQ(); - break; - case USE_PLS_TO_ASSIGN: - gb.noAD(); - if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) { - // if there is no mass on the (new) likelihoods, then just no-call the sample - gb.alleles(NO_CALL_ALLELES); - gb.noGQ(); - } else { - // find the genotype with maximum likelihoods - final int PLindex = MathUtils.maxElementIndex(newLikelihoods); - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - gb.alleles(Arrays.asList(allelesToUse.get(alleles.alleleIndex1), allelesToUse.get(alleles.alleleIndex2))); - gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); - } - break; - case BEST_MATCH_TO_ORIGINAL: - final List best = new LinkedList<>(); - final Allele ref = allelesToUse.get(0); // WARNING -- should be checked in input argument - for ( final Allele originalAllele : originalGT ) { - 
best.add(allelesToUse.contains(originalAllele) ? originalAllele : ref); - } - gb.noGQ(); - gb.noPL(); - gb.noAD(); - gb.alleles(best); - break; - } - } - - /** - * Subset the samples in VC to reference only information with ref call alleles - * - * Preserves DP if present - * - * @param vc the variant context to subset down to - * @param ploidy ploidy to use if a genotype doesn't have any alleles - * @return a GenotypesContext - */ - public static GenotypesContext subsetToRefOnly(final VariantContext vc, final int ploidy) { - if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); - if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be >= 1 but got " + ploidy); - - // the genotypes with PLs - final GenotypesContext oldGTs = vc.getGenotypes(); - - // optimization: if no input genotypes, just exit - if (oldGTs.isEmpty()) return oldGTs; - - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(oldGTs.size()); - - final Allele ref = vc.getReference(); - final List diploidRefAlleles = Arrays.asList(ref, ref); - - // create the new genotypes - for ( final Genotype g : vc.getGenotypes() ) { - final int gPloidy = g.getPloidy() == 0 ? ploidy : g.getPloidy(); - final List refAlleles = gPloidy == 2 ? 
diploidRefAlleles : Collections.nCopies(gPloidy, ref); - final GenotypeBuilder gb = new GenotypeBuilder(g.getSampleName(), refAlleles); - if ( g.hasDP() ) gb.DP(g.getDP()); - if ( g.hasGQ() ) gb.GQ(g.getGQ()); - newGTs.add(gb.make()); - } - - return newGTs; - } - - /** - * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs - * - * @param vc variant context with genotype likelihoods - * @return genotypes context - */ - public static GenotypesContext assignDiploidGenotypes(final VariantContext vc) { - return subsetDiploidAlleles(vc, vc.getAlleles(), GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); - } - - /** - * Split variant context into its biallelic components if there are more than 2 alleles - * - * For VC has A/B/C alleles, returns A/B and A/C contexts. - * Genotypes are all no-calls now (it's not possible to fix them easily) - * Alleles are right trimmed to satisfy VCF conventions - * - * If vc is biallelic or non-variant it is just returned - * - * Chromosome counts are updated (but they are by definition 0) - * - * @param vc a potentially multi-allelic variant context - * @return a list of bi-allelic (or monomorphic) variant context - */ - public static List splitVariantContextToBiallelics(final VariantContext vc) { - return splitVariantContextToBiallelics(vc, false, GenotypeAssignmentMethod.SET_TO_NO_CALL); - } - - /** - * Split variant context into its biallelic components if there are more than 2 alleles - * - * For VC has A/B/C alleles, returns A/B and A/C contexts. 
- * Genotypes are all no-calls now (it's not possible to fix them easily) - * Alleles are right trimmed to satisfy VCF conventions - * - * If vc is biallelic or non-variant it is just returned - * - * Chromosome counts are updated (but they are by definition 0) - * - * @param vc a potentially multi-allelic variant context - * @param trimLeft if true, we will also left trim alleles, potentially moving the resulting vcs forward on the genome - * @return a list of bi-allelic (or monomorphic) variant context - */ - public static List splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod) { - if ( ! vc.isVariant() || vc.isBiallelic() ) - // non variant or biallelics already satisfy the contract - return Collections.singletonList(vc); - else { - final List biallelics = new LinkedList<>(); - - for ( final Allele alt : vc.getAlternateAlleles() ) { - VariantContextBuilder builder = new VariantContextBuilder(vc); - final List alleles = Arrays.asList(vc.getReference(), alt); - builder.alleles(alleles); - builder.genotypes(subsetDiploidAlleles(vc, alleles, genotypeAssignmentMethod)); - VariantContextUtils.calculateChromosomeCounts(builder, true); - final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true); - biallelics.add(trimmed); - } - - return biallelics; - } - } - - public static Genotype removePLsAndAD(final Genotype g) { - return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g; - } - - /** - * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. 
- * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with - * the sample name - * - * @param unsortedVCs collection of unsorted VCs - * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs - * @param filteredRecordMergeType merge type for filtered records - * @param genotypeMergeOptions merge option for genotypes - * @param annotateOrigin should we annotate the set it came from? - * @param printMessages should we print messages? - * @param setKey the key name of the set - * @param filteredAreUncalled are filtered records uncalled? - * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? - * @param combineAnnotations should we merge info field annotations by assuming the incoming VCs are i.i.d. - * @return new VariantContext representing the merge of unsortedVCs - */ - public static VariantContext simpleMerge(final Collection unsortedVCs, - final List priorityListOfVCs, - final FilteredRecordMergeType filteredRecordMergeType, - final GenotypeMergeType genotypeMergeOptions, - final boolean annotateOrigin, - final boolean printMessages, - final String setKey, - final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC, - final boolean combineAnnotations ) { - int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); - return simpleMerge(unsortedVCs, Collections.emptyList(), priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC, combineAnnotations); - } - - /** - * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. - * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with - * the sample name. 
- * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use - * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge. - * - * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/ - * - * @param unsortedVCs collection of unsorted VCs - * @param potentialRefVCs collection of unsorted VCs that overlap this locus which should only be searched for potential reference records - * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs - * @param filteredRecordMergeType merge type for filtered records - * @param genotypeMergeOptions merge option for genotypes - * @param annotateOrigin should we annotate the set it came from? - * @param printMessages should we print messages? - * @param setKey the key name of the set - * @param filteredAreUncalled are filtered records uncalled? - * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? - * @param combineAnnotations should we merge info field annotations by assuming the incoming VCs are i.i.d. 
- * @return new VariantContext representing the merge of unsortedVCs - */ - public static VariantContext simpleMerge(final Collection unsortedVCs, - final Collection potentialRefVCs, - final List priorityListOfVCs, - final int originalNumOfVCs, - final FilteredRecordMergeType filteredRecordMergeType, - final GenotypeMergeType genotypeMergeOptions, - final boolean annotateOrigin, - final boolean printMessages, - final String setKey, - final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC, - final boolean combineAnnotations ) { - - if ( unsortedVCs == null || unsortedVCs.size() == 0 ) - return null; - - if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size()) - throw new IllegalArgumentException("the number of the original VariantContexts must be the same as the number of VariantContexts in the priority list"); - - if ( annotateOrigin && priorityListOfVCs == null && originalNumOfVCs == 0) - throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts or the number of original VariantContexts"); - - final List preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); - // Make sure all variant contexts are padded with reference base in case of indels if necessary - List VCs = new ArrayList<>(); - - for (final VariantContext vc : preFilteredVCs) { - if ( ! 
filteredAreUncalled || vc.isNotFiltered() ) - VCs.add(vc); - } - - // cycle through and fill in NON_REF_SYMBOLIC_ALLELEs with the actual alternate allele if possible - VCs = fillInNonRefSymbolicAlleles(VCs, potentialRefVCs); - - if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled - return null; - - // establish the baseline info from the first VC - final VariantContext first = VCs.get(0); - final String name = first.getSource(); - final Allele refAllele = determineReferenceAllele(VCs); - - final Set alleles = new LinkedHashSet<>(); - final Set filters = new HashSet<>(); - final Map attributes = new LinkedHashMap<>(); - final Set inconsistentAttributes = new HashSet<>(); - final Set variantSources = new HashSet<>(); // contains the set of sources we found in our set of VCs that are variant - final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id - - VariantContext longestVC = first; - int depth = 0; - int maxAC = -1; - final Map attributesWithMaxAC = new LinkedHashMap<>(); - final Map> annotationMap = new LinkedHashMap<>(); - double log10PError = CommonInfo.NO_LOG10_PERROR; - boolean anyVCHadFiltersApplied = false; - VariantContext vcWithMaxAC = null; - GenotypesContext genotypes = GenotypesContext.create(); - - // counting the number of filtered and variant VCs - int nFiltered = 0; - - boolean remapped = false; - - // cycle through and add info from the other VCs, making sure the loc/reference matches - for ( final VariantContext vc : VCs ) { - if ( longestVC.getStart() != vc.getStart() ) - throw new IllegalStateException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); - - if ( VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC) ) - longestVC = vc; // get the longest location - - nFiltered += vc.isFiltered() ? 
1 : 0; - if ( vc.isVariant() ) variantSources.add(vc.getSource()); - - AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); - remapped = remapped || alleleMapping.needsRemapping(); - - alleles.addAll(alleleMapping.values()); - - mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); - - // We always take the QUAL of the first VC with a non-MISSING qual for the combined value - if ( log10PError == CommonInfo.NO_LOG10_PERROR ) - log10PError = vc.getLog10PError(); - - filters.addAll(vc.getFilters()); - anyVCHadFiltersApplied |= vc.filtersWereApplied(); - - // - // add attributes - // - // special case DP (add it up) and ID (just preserve it) - // - if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) - depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); - if ( vc.hasID() ) rsIDs.add(vc.getID()); - if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { - String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); - // lets see if the string contains a "," separator - if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { - final List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); - for (final String alleleCount : alleleCountArray) { - final int ac = Integer.valueOf(alleleCount.trim()); - if (ac > maxAC) { - maxAC = ac; - vcWithMaxAC = vc; - } - } - } else { - final int ac = Integer.valueOf(rawAlleleCounts); - if (ac > maxAC) { - maxAC = ac; - vcWithMaxAC = vc; - } - } - } - - for (final Map.Entry p : vc.getAttributes().entrySet()) { - final String key = p.getKey(); - final Object value = p.getValue(); - boolean badAnnotation = false; - if ( combineAnnotations ) { // add the annotation values to a list for combining later - List values = annotationMap.get(key); - if( values == null ) { - values = new ArrayList<>(); - annotationMap.put(key, 
values); - } - try { - final String stringValue = value.toString(); - // Branch to avoid unintentional, implicit type conversions that occur with the ? operator. - if (stringValue.contains(".")) - values.add(Double.parseDouble(stringValue)); - else - values.add(Integer.parseInt(stringValue)); - } catch (NumberFormatException e) { - badAnnotation = true; - } - } - if ( ! combineAnnotations || badAnnotation ) { // only output annotations that have the same value in every input VC - // if we don't like the key already, don't go anywhere - if ( ! inconsistentAttributes.contains(key) ) { - final boolean alreadyFound = attributes.containsKey(key); - final Object boundValue = attributes.get(key); - final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); - - if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { - // we found the value but we're inconsistent, put it in the exclude list - inconsistentAttributes.add(key); - attributes.remove(key); - } else if ( ! alreadyFound || boundIsMissingValue ) { // no value - attributes.put(key, value); - } - } - } - } - } - - // if we have more alternate alleles in the merged VC than in one or more of the - // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD - for ( final VariantContext vc : VCs ) { - if (vc.getAlleles().size() == 1) - continue; - if ( hasPLIncompatibleAlleles(alleles, vc.getAlleles())) { - if ( ! genotypes.isEmpty() ) { - logger.debug(String.format("Stripping PLs at %s:%d-%d due to incompatible alleles merged=%s vs. 
single=%s", - vc.getChr(), vc.getStart(), vc.getEnd(), alleles, vc.getAlleles())); - } - genotypes = stripPLsAndAD(genotypes); - // this will remove stale AC,AF attributed from vc - VariantContextUtils.calculateChromosomeCounts(vc, attributes, true); - break; - } - } - - // take the VC with the maxAC and pull the attributes into a modifiable map - if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { - attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); - } else if ( combineAnnotations ) { // when combining annotations use the median value from all input VCs which had annotations provided - for ( final Map.Entry> p : annotationMap.entrySet() ) { - if ( ! p.getValue().isEmpty() ) { - attributes.put(p.getKey(), combineAnnotationValues(p.getValue())); - } - } - } - - // if at least one record was unfiltered and we want a union, clear all of the filters - if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL ) - filters.clear(); - - - if ( annotateOrigin ) { // we care about where the call came from - String setValue; - if ( nFiltered == 0 && variantSources.size() == originalNumOfVCs ) // nothing was unfiltered - setValue = MERGE_INTERSECTION; - else if ( nFiltered == VCs.size() ) // everything was filtered out - setValue = MERGE_FILTER_IN_ALL; - else if ( variantSources.isEmpty() ) // everyone was reference - setValue = MERGE_REF_IN_ALL; - else { - final LinkedHashSet s = new LinkedHashSet<>(); - for ( final VariantContext vc : VCs ) - if ( vc.isVariant() ) - s.add( vc.isFiltered() ? 
MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); - setValue = Utils.join("-", s); - } - - if ( setKey != null ) { - attributes.put(setKey, setValue); - if( mergeInfoWithMaxAC && vcWithMaxAC != null ) { - attributesWithMaxAC.put(setKey, setValue); - } - } - } - - if ( depth > 0 ) - attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); - - final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); - - final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); - builder.loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd()); - builder.alleles(alleles); - builder.genotypes(genotypes); - builder.log10PError(log10PError); - if ( anyVCHadFiltersApplied ) { - builder.filters(filters.isEmpty() ? filters : new TreeSet<>(filters)); - } - builder.attributes(new TreeMap<>(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); - if( combineAnnotations ) { - // unfortunately some attributes are just too dangerous to try to combine together - builder.rmAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY); - builder.rmAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY); - } - - // Trim the padded bases of all alleles if necessary - final VariantContext merged = builder.make(); - if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); - return merged; - } - - private static final Comparable combineAnnotationValues( final List array ) { - return MathUtils.median(array); // right now we take the median but other options could be explored - } - - /** - * cycle through and fill in NON_REF_SYMBOLIC_ALLELEs with the actual alternate allele if possible - * @param VCs the list of VCs in which to fill in symbolic alleles - * @param potentialRefVCs the list of VCs which are overlapping the current locus-- need to look for reference blocks and fill in with alternate alleles - * @return the list of VCs to merge in which all the NON_REF_SYMBOLIC_ALLELEs have been replaced with the correct 
alternate allele - */ - protected static final List fillInNonRefSymbolicAlleles( final List VCs, final Collection potentialRefVCs ) { - if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null"); } - if( potentialRefVCs == null ) { throw new IllegalArgumentException("potentialRefVCs cannot be null"); } - - final List VCsToReturn = new ArrayList<>(VCs.size()); - boolean containsNonRefSymbolicAllele = false; - VariantContext nonRefVC = null; - for( final VariantContext vc : VCs ) { - if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { - containsNonRefSymbolicAllele = true; - } else if ( nonRefVC == null ) { - nonRefVC = vc; - } - if( nonRefVC != null && containsNonRefSymbolicAllele == true ) { - break; // break out so that we don't run over the whole list unnecessarily - } - } - for( final VariantContext vc : potentialRefVCs ) { - if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { - containsNonRefSymbolicAllele = true; - VCs.add(vc); // add the overlapping non-ref symbolic records to the VCs list in order to be filled in below - } - } - - if( !containsNonRefSymbolicAllele ) { - return VCs; - } - - for( final VariantContext vc : VCs ) { - if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { // create a new record based on the current record but instead has the symbolic allele replaced by the alternate allele for this site - if( nonRefVC != null ) { - final GenotypesContext genotypes = GenotypesContext.create(vc.getSampleNames().size()); - int depth = 0; - for( final String sample : vc.getSampleNames() ) { - final Genotype gt = vc.getGenotype(sample); - final ArrayList refAlleles = new ArrayList<>(2); - refAlleles.add(nonRefVC.getReference()); - refAlleles.add(nonRefVC.getReference()); - final int[] pl = ( nonRefVC.isBiallelic() ? gt.getPL() : null ); // PLs only works for biallelic sites for now - depth += ( gt.hasDP() ? 
gt.getDP() : Integer.parseInt((String)gt.getAnyAttribute("MIN_DP")) ); // DP is special-cased in CombineVariants so fill it in here - genotypes.add(new GenotypeBuilder(gt).alleles(refAlleles).PL(pl).make()); - } - VCsToReturn.add(new VariantContextBuilder(nonRefVC).attributes(null).attribute("DP", depth).genotypes(genotypes).make()); - } - } else { - VCsToReturn.add(vc); - } - } - - return VCsToReturn; - } - - private static final boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { - final Iterator it1 = alleleSet1.iterator(); - final Iterator it2 = alleleSet2.iterator(); - - while ( it1.hasNext() && it2.hasNext() ) { - final Allele a1 = it1.next(); - final Allele a2 = it2.next(); - if ( ! a1.equals(a2) ) - return true; - } - - // by this point, at least one of the iterators is empty. All of the elements - // we've compared are equal up until this point. But it's possible that the - // sets aren't the same size, which is indicated by the test below. If they - // are of the same size, though, the sets are compatible - return it1.hasNext() || it2.hasNext(); - } - - public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) { - final GenotypesContext newGs = GenotypesContext.create(genotypes.size()); - - for ( final Genotype g : genotypes ) { - newGs.add(removePLsAndAD(g)); - } - - return newGs; - } - - /** - * Updates the PLs and AD of the Genotypes in the newly selected VariantContext to reflect the fact that some alleles - * from the original VariantContext are no longer present. 
- * - * @param selectedVC the selected (new) VariantContext - * @param originalVC the original VariantContext - * @return a new non-null GenotypesContext - */ - public static GenotypesContext updatePLsAndAD(final VariantContext selectedVC, final VariantContext originalVC) { - final int numNewAlleles = selectedVC.getAlleles().size(); - final int numOriginalAlleles = originalVC.getAlleles().size(); - - // if we have more alternate alleles in the selected VC than in the original VC, then something is wrong - if ( numNewAlleles > numOriginalAlleles ) - throw new IllegalArgumentException("Attempting to fix PLs and AD from what appears to be a *combined* VCF and not a selected one"); - - final GenotypesContext oldGs = selectedVC.getGenotypes(); - - // if we have the same number of alternate alleles in the selected VC as in the original VC, then we don't need to fix anything - if ( numNewAlleles == numOriginalAlleles ) - return oldGs; - - final GenotypesContext newGs = fixPLsFromSubsettedAlleles(oldGs, originalVC, selectedVC.getAlleles()); - - return fixADFromSubsettedAlleles(newGs, originalVC, selectedVC.getAlleles()); - } - - /** - * Fix the PLs for the GenotypesContext of a VariantContext that has been subset - * - * @param originalGs the original GenotypesContext - * @param originalVC the original VariantContext - * @param allelesToUse the new (sub)set of alleles to use - * @return a new non-null GenotypesContext - */ - static private GenotypesContext fixPLsFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { - - // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final List likelihoodIndexesToUse = determineLikelihoodIndexesToUse(originalVC, allelesToUse); - - // create the new genotypes - return createGenotypesWithSubsettedLikelihoods(originalGs, originalVC, allelesToUse, likelihoodIndexesToUse, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES); - 
} - - /** - * Fix the AD for the GenotypesContext of a VariantContext that has been subset - * - * @param originalGs the original GenotypesContext - * @param originalVC the original VariantContext - * @param allelesToUse the new (sub)set of alleles to use - * @return a new non-null GenotypesContext - */ - static private GenotypesContext fixADFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { - - // the bitset representing the allele indexes we want to keep - final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); - - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); - - // the samples - final List sampleIndices = originalGs.getSampleNamesOrderedByName(); - - // create the new genotypes - for ( int k = 0; k < originalGs.size(); k++ ) { - final Genotype g = originalGs.get(sampleIndices.get(k)); - newGTs.add(fixAD(g, alleleIndexesToUse, allelesToUse.size())); - } - - return newGTs; - } - - /** - * Fix the AD for the given Genotype - * - * @param genotype the original Genotype - * @param alleleIndexesToUse a bitset describing whether or not to keep a given index - * @param nAllelesToUse how many alleles we are keeping - * @return a non-null Genotype - */ - private static Genotype fixAD(final Genotype genotype, final boolean[] alleleIndexesToUse, final int nAllelesToUse) { - // if it ain't broke don't fix it - if ( !genotype.hasAD() ) - return genotype; - - final GenotypeBuilder builder = new GenotypeBuilder(genotype); - - final int[] oldAD = genotype.getAD(); - if ( oldAD.length != alleleIndexesToUse.length ) { - builder.noAD(); - } else { - final int[] newAD = new int[nAllelesToUse]; - int currentIndex = 0; - for ( int i = 0; i < oldAD.length; i++ ) { - if ( alleleIndexesToUse[i] ) - newAD[currentIndex++] = oldAD[i]; - } - builder.AD(newAD); - } - return builder.make(); - } - - static private Allele 
determineReferenceAllele(List VCs) { - Allele ref = null; - - for ( final VariantContext vc : VCs ) { - final Allele myRef = vc.getReference(); - if ( ref == null || ref.length() < myRef.length() ) - ref = myRef; - else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) - throw new TribbleException(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef)); - } - - return ref; - } - - static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set allAlleles) { - if ( refAllele.equals(vc.getReference()) ) - return new AlleleMapper(vc); - else { - // we really need to do some work. The refAllele is the longest reference allele seen at this - // start site. So imagine it is: - // - // refAllele: ACGTGA - // myRef: ACGT - // myAlt: A - // - // We need to remap all of the alleles in vc to include the extra GA so that - // myRef => refAllele and myAlt => AGA - // - - Allele myRef = vc.getReference(); - if ( refAllele.length() <= myRef.length() ) throw new IllegalStateException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); - byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); - -// System.out.printf("Remapping allele at %s%n", vc); -// System.out.printf("ref %s%n", refAllele); -// System.out.printf("myref %s%n", myRef ); -// System.out.printf("extrabases %s%n", new String(extraBases)); - - Map map = new HashMap<>(); - for ( final Allele a : vc.getAlleles() ) { - if ( a.isReference() ) - map.put(a, refAllele); - else { - Allele extended = Allele.extend(a, extraBases); - for ( final Allele b : allAlleles ) - if ( extended.equals(b) ) - extended = b; -// System.out.printf(" Extending %s => %s%n", a, extended); - map.put(a, extended); - } - } - - // debugging -// System.out.printf("mapping %s%n", map); - - return new AlleleMapper(map); - } - } - - public static List 
sortVariantContextsByPriority(Collection unsortedVCs, List priorityListOfVCs, GenotypeMergeType mergeOption ) { - if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null ) - throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); - - if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED ) - return new ArrayList<>(unsortedVCs); - else { - ArrayList sorted = new ArrayList<>(unsortedVCs); - Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); - return sorted; - } - } - - private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniquifySamples) { - //TODO: should we add a check for cases when the genotypeMergeOption is REQUIRE_UNIQUE - for ( final Genotype g : oneVC.getGenotypes() ) { - final String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniquifySamples); - if ( ! mergedGenotypes.containsSample(name) ) { - // only add if the name is new - Genotype newG = g; - - if ( uniquifySamples || alleleMapping.needsRemapping() ) { - final List alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles(); - newG = new GenotypeBuilder(g).name(name).alleles(alleles).make(); - } - - mergedGenotypes.add(newG); - } - } - } - - public static String mergedSampleName(String trackName, String sampleName, boolean uniquify ) { - return uniquify ? sampleName + "." 
+ trackName : sampleName; - } - - /** - * Trim the alleles in inputVC from the reverse direction - * - * @param inputVC a non-null input VC whose alleles might need a haircut - * @return a non-null VariantContext (may be == to inputVC) with alleles trimmed up - */ - public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { - return trimAlleles(inputVC, false, true); - } - - /** - * Trim the alleles in inputVC from the forward direction - * - * @param inputVC a non-null input VC whose alleles might need a haircut - * @return a non-null VariantContext (may be == to inputVC) with alleles trimmed up - */ - public static VariantContext forwardTrimAlleles( final VariantContext inputVC ) { - return trimAlleles(inputVC, true, false); - } - - /** - * Trim the alleles in inputVC forward and reverse, as requested - * - * @param inputVC a non-null input VC whose alleles might need a haircut - * @param trimForward should we trim up the alleles from the forward direction? - * @param trimReverse should we trim up the alleles from the reverse direction? - * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles - */ - @Ensures("result != null") - public static VariantContext trimAlleles(final VariantContext inputVC, final boolean trimForward, final boolean trimReverse) { - if ( inputVC == null ) throw new IllegalArgumentException("inputVC cannot be null"); - - if ( inputVC.getNAlleles() <= 1 || inputVC.isSNP() ) - return inputVC; - - // see whether we need to trim common reference base from all alleles - final int revTrim = trimReverse ? computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes()) : 0; - final VariantContext revTrimVC = trimAlleles(inputVC, -1, revTrim); - final int fwdTrim = trimForward ? 
computeForwardClipping(revTrimVC.getAlleles()) : -1; - final VariantContext vc= trimAlleles(revTrimVC, fwdTrim, 0); - return vc; - } - - /** - * Trim up alleles in inputVC, cutting out all bases up to fwdTrimEnd inclusive and - * the last revTrim bases from the end - * - * @param inputVC a non-null input VC - * @param fwdTrimEnd bases up to this index (can be -1) will be removed from the start of all alleles - * @param revTrim the last revTrim bases of each allele will be clipped off as well - * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles - */ - @Requires({"inputVC != null"}) - @Ensures("result != null") - protected static VariantContext trimAlleles(final VariantContext inputVC, - final int fwdTrimEnd, - final int revTrim) { - if( fwdTrimEnd == -1 && revTrim == 0 ) // nothing to do, so just return inputVC unmodified - return inputVC; - - final List alleles = new LinkedList<>(); - final Map originalToTrimmedAlleleMap = new HashMap<>(); - - for (final Allele a : inputVC.getAlleles()) { - if (a.isSymbolic()) { - alleles.add(a); - originalToTrimmedAlleleMap.put(a, a); - } else { - // get bases for current allele and create a new one with trimmed bases - final byte[] newBases = Arrays.copyOfRange(a.getBases(), fwdTrimEnd+1, a.length()-revTrim); - final Allele trimmedAllele = Allele.create(newBases, a.isReference()); - alleles.add(trimmedAllele); - originalToTrimmedAlleleMap.put(a, trimmedAllele); - } - } - - // now we can recreate new genotypes with trimmed alleles - final AlleleMapper alleleMapper = new AlleleMapper(originalToTrimmedAlleleMap); - final GenotypesContext genotypes = updateGenotypesWithMappedAlleles(inputVC.getGenotypes(), alleleMapper); - - final int start = inputVC.getStart() + (fwdTrimEnd + 1); - final VariantContextBuilder builder = new VariantContextBuilder(inputVC); - builder.start(start); - builder.stop(start + alleles.get(0).length() - 1); - builder.alleles(alleles); - builder.genotypes(genotypes); - return 
builder.make(); - } - - @Requires("originalGenotypes != null && alleleMapper != null") - protected static GenotypesContext updateGenotypesWithMappedAlleles(final GenotypesContext originalGenotypes, final AlleleMapper alleleMapper) { - final GenotypesContext updatedGenotypes = GenotypesContext.create(originalGenotypes.size()); - - for ( final Genotype genotype : originalGenotypes ) { - final List updatedAlleles = alleleMapper.remap(genotype.getAlleles()); - updatedGenotypes.add(new GenotypeBuilder(genotype).alleles(updatedAlleles).make()); - } - - return updatedGenotypes; - } - - public static int computeReverseClipping(final List unclippedAlleles, final byte[] ref) { - int clipping = 0; - boolean stillClipping = true; - - while ( stillClipping ) { - for ( final Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) - continue; - - // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong - // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). - if ( a.length() - clipping == 0 ) - return clipping - 1; - - if ( a.length() - clipping <= 0 || a.length() == 0 ) { - stillClipping = false; - } - else if ( ref.length == clipping ) { - return -1; - } - else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) { - stillClipping = false; - } - } - if ( stillClipping ) - clipping++; - } - - return clipping; - } - - /** - * Clip out any unnecessary bases off the front of the alleles - * - * The VCF spec represents alleles as block substitutions, replacing AC with A for a - * 1 bp deletion of the C. However, it's possible that we'd end up with alleles that - * contain extra bases on the left, such as GAC/GA to represent the same 1 bp deletion. - * This routine finds an offset among all alleles that can be safely trimmed - * off the left of each allele and still represent the same block substitution. 
- * - * A/C => A/C - * AC/A => AC/A - * ACC/AC => CC/C - * AGT/CAT => AGT/CAT - * /C => /C - * - * @param unclippedAlleles a non-null list of alleles that we want to clip - * @return the offset into the alleles where we can safely clip, inclusive, or - * -1 if no clipping is tolerated. So, if the result is 0, then we can remove - * the first base of every allele. If the result is 1, we can remove the - * second base. - */ - public static int computeForwardClipping(final List unclippedAlleles) { - // cannot clip unless there's at least 1 alt allele - if ( unclippedAlleles.size() <= 1 ) - return -1; - - // we cannot forward clip any set of alleles containing a symbolic allele - int minAlleleLength = Integer.MAX_VALUE; - for ( final Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) - return -1; - minAlleleLength = Math.min(minAlleleLength, a.length()); - } - - final byte[] firstAlleleBases = unclippedAlleles.get(0).getBases(); - int indexOflastSharedBase = -1; - - // the -1 to the stop is that we can never clip off the right most base - for ( int i = 0; i < minAlleleLength - 1; i++) { - final byte base = firstAlleleBases[i]; - - for ( final Allele allele : unclippedAlleles ) { - if ( allele.getBases()[i] != base ) - return indexOflastSharedBase; - } - - indexOflastSharedBase = i; - } - - return indexOflastSharedBase; - } - - public static double computeHardyWeinbergPvalue(VariantContext vc) { - if ( vc.getCalledChrCount() == 0 ) - return 0.0; - return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount()); - } - - public static boolean requiresPaddingBase(final List alleles) { - - // see whether one of the alleles would be null if trimmed through - - for ( final String allele : alleles ) { - if ( allele.isEmpty() ) - return true; - } - - int clipping = 0; - Character currentBase = null; - - while ( true ) { - for ( final String allele : alleles ) { - if ( allele.length() - clipping == 0 ) - return true; - - char myBase = 
allele.charAt(clipping); - if ( currentBase == null ) - currentBase = myBase; - else if ( currentBase != myBase ) - return false; - } - - clipping++; - currentBase = null; - } - } - - private final static Map subsetAttributes(final CommonInfo igc, final Collection keysToPreserve) { - Map attributes = new HashMap<>(keysToPreserve.size()); - for ( final String key : keysToPreserve ) { - if ( igc.hasAttribute(key) ) - attributes.put(key, igc.getAttribute(key)); - } - return attributes; - } - - /** - * @deprecated use variant context builder version instead - * @param vc the variant context - * @param keysToPreserve the keys to preserve - * @return a pruned version of the original variant context - */ - @Deprecated - public static VariantContext pruneVariantContext(final VariantContext vc, Collection keysToPreserve ) { - return pruneVariantContext(new VariantContextBuilder(vc), keysToPreserve).make(); - } - - public static VariantContextBuilder pruneVariantContext(final VariantContextBuilder builder, Collection keysToPreserve ) { - final VariantContext vc = builder.make(); - if ( keysToPreserve == null ) keysToPreserve = Collections.emptyList(); - - // VC info - final Map attributes = subsetAttributes(vc.getCommonInfo(), keysToPreserve); - - // Genotypes - final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); - for ( final Genotype g : vc.getGenotypes() ) { - final GenotypeBuilder gb = new GenotypeBuilder(g); - // remove AD, DP, PL, and all extended attributes, keeping just GT and GQ - gb.noAD().noDP().noPL().noAttributes(); - genotypes.add(gb.make()); - } - - return builder.genotypes(genotypes).attributes(attributes); - } - - public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { - // if all alleles of vc1 are a contained in alleles of vc2, return true - if (!vc1.getReference().equals(vc2.getReference())) - return false; - - for (final Allele a :vc1.getAlternateAlleles()) { - if (!vc2.getAlternateAlleles().contains(a)) 
- return false; - } - - return true; - } - - public static Map> separateVariantContextsByType( final Collection VCs ) { - if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null."); } - - final HashMap> mappedVCs = new HashMap<>(); - for ( final VariantContext vc : VCs ) { - VariantContext.Type vcType = vc.getType(); - if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { - if( vc.getAlternateAlleles().size() > 1 ) { throw new IllegalStateException("Reference records should not have more than one alternate allele"); } - vcType = VariantContext.Type.NO_VARIATION; - } - - // look at previous variant contexts of different type. If: - // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list - // b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) - // c) neither: do nothing, just add vc to its own list - boolean addtoOwnList = true; - for (final VariantContext.Type type : VariantContext.Type.values()) { - if (type.equals(vcType)) - continue; - - if (!mappedVCs.containsKey(type)) - continue; - - List vcList = mappedVCs.get(type); - for (int k=0; k < vcList.size(); k++) { - VariantContext otherVC = vcList.get(k); - if (allelesAreSubset(otherVC,vc)) { - // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list - vcList.remove(k); - // avoid having empty lists - if (vcList.size() == 0) - mappedVCs.remove(type); - if ( !mappedVCs.containsKey(vcType) ) - mappedVCs.put(vcType, new ArrayList()); - mappedVCs.get(vcType).add(otherVC); - break; - } - else if (allelesAreSubset(vc,otherVC)) { - // vc has a type different than otherVC and its alleles are a subset of VC: add vc to otherVC's type list and don't add to its own - mappedVCs.get(type).add(vc); - addtoOwnList = false; - break; - } - } - } - if (addtoOwnList) { - if ( 
!mappedVCs.containsKey(vcType) ) - mappedVCs.put(vcType, new ArrayList()); - mappedVCs.get(vcType).add(vc); - } - } - - return mappedVCs; - } - - public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set allowedAttributes) { - if ( allowedAttributes == null ) - return vc; - - final GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); - for ( final Genotype genotype : vc.getGenotypes() ) { - final Map attrs = new HashMap<>(); - for ( final Map.Entry attr : genotype.getExtendedAttributes().entrySet() ) { - if ( allowedAttributes.contains(attr.getKey()) ) - attrs.put(attr.getKey(), attr.getValue()); - } - newGenotypes.add(new GenotypeBuilder(genotype).attributes(attrs).make()); - } - - return new VariantContextBuilder(vc).genotypes(newGenotypes).make(); - } - - - protected static class AlleleMapper { - private VariantContext vc = null; - private Map map = null; - public AlleleMapper(VariantContext vc) { this.vc = vc; } - public AlleleMapper(Map map) { this.map = map; } - public boolean needsRemapping() { return this.map != null; } - public Collection values() { return map != null ? map.values() : vc.getAlleles(); } - public Allele remap(Allele a) { return map != null && map.containsKey(a) ? 
map.get(a) : a; } - - public List remap(List as) { - List newAs = new ArrayList<>(); - for ( final Allele a : as ) { - //System.out.printf(" Remapping %s => %s%n", a, remap(a)); - newAs.add(remap(a)); - } - return newAs; - } - } - - private static class CompareByPriority implements Comparator, Serializable { - List priorityListOfVCs; - public CompareByPriority(List priorityListOfVCs) { - this.priorityListOfVCs = priorityListOfVCs; - } - - private int getIndex(VariantContext vc) { - int i = priorityListOfVCs.indexOf(vc.getSource()); - if ( i == -1 ) throw new IllegalArgumentException("Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getSource()); - return i; - } - - public int compare(VariantContext vc1, VariantContext vc2) { - return Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2)); - } - } - - /** - * For testing purposes only. Create a site-only VariantContext at contig:start containing alleles - * - * @param name the name of the VC - * @param contig the contig for the VC - * @param start the start of the VC - * @param alleleStrings a non-null, non-empty list of strings for the alleles. The first will be the ref allele, and others the - * alt. 
Will compute the stop of the VC from the length of the reference allele - * @return a non-null VariantContext - */ - public static VariantContext makeFromAlleles(final String name, final String contig, final int start, final List alleleStrings) { - if ( alleleStrings == null || alleleStrings.isEmpty() ) - throw new IllegalArgumentException("alleleStrings must be non-empty, non-null list"); - - final List alleles = new LinkedList<>(); - final int length = alleleStrings.get(0).length(); - - boolean first = true; - for ( final String alleleString : alleleStrings ) { - alleles.add(Allele.create(alleleString, first)); - first = false; - } - return new VariantContextBuilder(name, contig, start, start+length-1, alleles).make(); - } - - /** - * Splits the alleles for the provided variant context into its primitive parts. - * Requires that the input VC be bi-allelic, so calling methods should first call splitVariantContextToBiallelics() if needed. - * Currently works only for MNPs. - * - * @param vc the non-null VC to split - * @return a non-empty list of VCs split into primitive parts or the original VC otherwise - */ - public static List splitIntoPrimitiveAlleles(final VariantContext vc) { - if ( vc == null ) - throw new IllegalArgumentException("Trying to break a null Variant Context into primitive parts"); - - if ( !vc.isBiallelic() ) - throw new IllegalArgumentException("Trying to break a multi-allelic Variant Context into primitive parts"); - - // currently only works for MNPs - if ( !vc.isMNP() ) - return Arrays.asList(vc); - - final byte[] ref = vc.getReference().getBases(); - final byte[] alt = vc.getAlternateAllele(0).getBases(); - - if ( ref.length != alt.length ) - throw new IllegalStateException("ref and alt alleles for MNP have different lengths"); - - final List result = new ArrayList<>(ref.length); - - for ( int i = 0; i < ref.length; i++ ) { - - // if the ref and alt bases are different at a given position, create a new SNP record (otherwise do nothing) - 
if ( ref[i] != alt[i] ) { - - // create the ref and alt SNP alleles - final Allele newRefAllele = Allele.create(ref[i], true); - final Allele newAltAllele = Allele.create(alt[i], false); - - // create a new VariantContext with the new SNP alleles - final VariantContextBuilder newVC = new VariantContextBuilder(vc).start(vc.getStart() + i).stop(vc.getStart() + i).alleles(Arrays.asList(newRefAllele, newAltAllele)); - - // create new genotypes with updated alleles - final Map alleleMap = new HashMap<>(); - alleleMap.put(vc.getReference(), newRefAllele); - alleleMap.put(vc.getAlternateAllele(0), newAltAllele); - final GenotypesContext newGenotypes = updateGenotypesWithMappedAlleles(vc.getGenotypes(), new AlleleMapper(alleleMap)); - - result.add(newVC.genotypes(newGenotypes).make()); - } - } - - if ( result.isEmpty() ) - result.add(vc); - - return result; - } - - /** - * Are vc1 and 2 equal including their position and alleles? - * @param vc1 non-null VariantContext - * @param vc2 non-null VariantContext - * @return true if vc1 and vc2 are equal, false otherwise - */ - public static boolean equalSites(final VariantContext vc1, final VariantContext vc2) { - if ( vc1 == null ) throw new IllegalArgumentException("vc1 cannot be null"); - if ( vc2 == null ) throw new IllegalArgumentException("vc2 cannot be null"); - - if ( vc1.getStart() != vc2.getStart() ) return false; - if ( vc1.getEnd() != vc2.getEnd() ) return false; - if ( ! vc1.getChr().equals(vc2.getChr())) return false; - if ( ! 
vc1.getAlleles().equals(vc2.getAlleles()) ) return false; - return true; - } -} diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java deleted file mode 100644 index c1e11e2ce..000000000 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ /dev/null @@ -1,513 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting; - -import org.apache.log4j.AppenderSkeleton; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.apache.log4j.PatternLayout; -import org.apache.log4j.spi.LoggingEvent; -import org.broad.tribble.readers.LineIterator; -import org.broad.tribble.readers.PositionalBufferedStream; -import org.broadinstitute.sting.commandline.CommandLineUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.crypt.CryptUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.io.IOUtils; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.variant.bcf2.BCF2Codec; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFCodec; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.variant.vcf.VCFHeaderLine; -import org.testng.Assert; -import org.testng.Reporter; -import org.testng.SkipException; - -import java.io.File; -import java.io.IOException; -import java.util.*; - -/** - * - * User: aaron - * Date: Apr 14, 2009 - * Time: 10:24:30 AM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date Apr 14, 2009 - *

- * Class BaseTest - *

- * This is the base test class for all of our test cases. All test cases should extend from this - * class; it sets up the logger, and resolves the location of directories that we rely on. - */ -@SuppressWarnings("unchecked") -public abstract class BaseTest { - /** our log, which we want to capture anything from org.broadinstitute.sting */ - public static final Logger logger = CommandLineUtils.getStingLogger(); - - public static final String hg18Reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"; - public static final String hg19Reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"; - public static final String b36KGReference = "/humgen/1kg/reference/human_b36_both.fasta"; - //public static final String b37KGReference = "/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta"; - public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta"; - public static final String b37KGReferenceWithDecoy = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37_decoy.fasta"; - public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/"; - public static final String validationDataLocation = GATKDataLocation + "Validation_Data/"; - public static final String evaluationDataLocation = GATKDataLocation + "Evaluation_Data/"; - public static final String comparisonDataLocation = GATKDataLocation + "Comparisons/"; - public static final String annotationDataLocation = GATKDataLocation + "Annotations/"; - - public static final String b37GoodBAM = validationDataLocation + "/CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; - public static final String b37GoodNA12878BAM = validationDataLocation + "/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; - public static final String b37_NA12878_OMNI = validationDataLocation + "/NA12878.omni.vcf"; - - public static final String dbsnpDataLocation = GATKDataLocation; - public static final String b36dbSNP129 = dbsnpDataLocation + 
"dbsnp_129_b36.vcf"; - public static final String b37dbSNP129 = dbsnpDataLocation + "dbsnp_129_b37.vcf"; - public static final String b37dbSNP132 = dbsnpDataLocation + "dbsnp_132_b37.vcf"; - public static final String hg18dbSNP132 = dbsnpDataLocation + "dbsnp_132.hg18.vcf"; - - public static final String hapmapDataLocation = comparisonDataLocation + "Validated/HapMap/3.3/"; - public static final String b37hapmapGenotypes = hapmapDataLocation + "genotypes_r27_nr.b37_fwd.vcf"; - public static final String b37hapmapSites = hapmapDataLocation + "sites_r27_nr.b37_fwd.vcf"; - - public static final String intervalsLocation = GATKDataLocation; - public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"; - public static final String hg19Chr20Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"; - - public static final boolean REQUIRE_NETWORK_CONNECTION = false; - private static final String networkTempDirRoot = "/broad/hptmp/"; - private static final boolean networkTempDirRootExists = new File(networkTempDirRoot).exists(); - private static final String networkTempDir; - private static final File networkTempDirFile; - - private static final String privateTestDirRelative = "private/testdata/"; - public static final String privateTestDir = new File(privateTestDirRelative).getAbsolutePath() + "/"; - protected static final String privateTestDirRoot = privateTestDir.replace(privateTestDirRelative, ""); - - private static final String publicTestDirRelative = "public/testdata/"; - public static final String publicTestDir = new File(publicTestDirRelative).getAbsolutePath() + "/"; - protected static final String publicTestDirRoot = publicTestDir.replace(publicTestDirRelative, ""); - - public static final String keysDataLocation = validationDataLocation + "keys/"; - public static final String gatkKeyFile = 
CryptUtils.GATK_USER_KEY_DIRECTORY + "gsamembers_broadinstitute.org.key"; - - public static final String exampleFASTA = publicTestDir + "exampleFASTA.fasta"; - - public final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam"; - public final static String NA12878_WEx = privateTestDir + "CEUTrio.HiSeq.WEx.b37_decoy.NA12878.20_10_11mb.bam"; - - public static final boolean pipelineTestRunModeIsSet = System.getProperty("pipeline.run").equals("run"); - - /** before the class starts up */ - static { - // setup a basic log configuration - CommandLineUtils.configureConsoleLogging(); - - // setup our log layout - PatternLayout layout = new PatternLayout(); - layout.setConversionPattern("TEST %C{1}.%M - %d{HH:mm:ss,SSS} - %m%n"); - - // now set the layout of all the loggers to our layout - CommandLineUtils.setLayout(logger, layout); - - // Set the Root logger to only output warnings. - logger.setLevel(Level.WARN); - - if (networkTempDirRootExists) { - networkTempDirFile = IOUtils.tempDir("temp.", ".dir", new File(networkTempDirRoot + System.getProperty("user.name"))); - networkTempDirFile.deleteOnExit(); - networkTempDir = networkTempDirFile.getAbsolutePath() + "/"; - } else { - networkTempDir = null; - networkTempDirFile = null; - } - - - if ( REQUIRE_NETWORK_CONNECTION ) { - // find our file sources - if (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) { - logger.fatal("We can't locate the reference directories. Aborting!"); - throw new RuntimeException("BaseTest setup failed: unable to locate the reference directories"); - } - } - } - - /** - * Simple generic utility class to creating TestNG data providers: - * - * 1: inherit this class, as in - * - * private class SummarizeDifferenceTest extends TestDataProvider { - * public SummarizeDifferenceTest() { - * super(SummarizeDifferenceTest.class); - * } - * ... - * } - * - * Provide a reference to your class to the TestDataProvider constructor. 
- * - * 2: Create instances of your subclass. Return from it the call to getTests, providing - * the class type of your test - * - * @DataProvider(name = "summaries" - * public Object[][] createSummaries() { - * new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2"); - * new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1"); - * return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class); - * } - * - * This class magically tracks created objects of this - */ - public static class TestDataProvider { - private static final Map> tests = new HashMap>(); - protected String name; - - /** - * Create a new TestDataProvider instance bound to the class variable C - * @param c - */ - public TestDataProvider(Class c, String name) { - if ( ! tests.containsKey(c) ) - tests.put(c, new ArrayList()); - tests.get(c).add(this); - this.name = name; - } - - public TestDataProvider(Class c) { - this(c, ""); - } - - public void setName(final String name) { - this.name = name; - } - - /** - * Return all of the data providers in the form expected by TestNG of type class C - * @param c - * @return - */ - public static Object[][] getTests(Class c) { - List params2 = new ArrayList(); - for ( Object x : tests.get(c) ) params2.add(new Object[]{x}); - return params2.toArray(new Object[][]{}); - } - - @Override - public String toString() { - return "TestDataProvider("+name+")"; - } - } - - /** - * test if the file exists - * - * @param file name as a string - * @return true if it exists - */ - public static boolean fileExist(String file) { - File temp = new File(file); - return temp.exists(); - } - - /** - * this appender looks for a specific message in the log4j stream. - * It can be used to verify that a specific message was generated to the logging system. 
- */ - public static class ValidationAppender extends AppenderSkeleton { - - private boolean foundString = false; - private String targetString = ""; - - public ValidationAppender(String target) { - targetString = target; - } - - @Override - protected void append(LoggingEvent loggingEvent) { - if (loggingEvent.getMessage().equals(targetString)) - foundString = true; - } - - public void close() { - // do nothing - } - - public boolean requiresLayout() { - return false; - } - - public boolean foundString() { - return foundString; - } - } - - /** - * Creates a temp file that will be deleted on exit after tests are complete. - * @param name Prefix of the file. - * @param extension Extension to concat to the end of the file. - * @return A file in the temporary directory starting with name, ending with extension, which will be deleted after the program exits. - */ - public static File createTempFile(String name, String extension) { - try { - File file = File.createTempFile(name, extension); - file.deleteOnExit(); - return file; - } catch (IOException ex) { - throw new ReviewedStingException("Cannot create temp file: " + ex.getMessage(), ex); - } - } - - /** - * Creates a temp file that will be deleted on exit after tests are complete. - * @param name Name of the file. - * @return A file in the network temporary directory with name, which will be deleted after the program exits. - * @throws SkipException when the network is not available. 
- */ - public static File tryCreateNetworkTempFile(String name) { - if (!networkTempDirRootExists) - throw new SkipException("Network temporary directory does not exist: " + networkTempDirRoot); - File file = new File(networkTempDirFile, name); - file.deleteOnExit(); - return file; - } - - /** - * Log this message so that it shows up inline during output as well as in html reports - * - * @param message - */ - public static void log(final String message) { - Reporter.log(message, true); - } - - private static final double DEFAULT_FLOAT_TOLERANCE = 1e-1; - - public static final void assertEqualsDoubleSmart(final Object actual, final Double expected) { - Assert.assertTrue(actual instanceof Double, "Not a double"); - assertEqualsDoubleSmart((double)(Double)actual, (double)expected); - } - - public static final void assertEqualsDoubleSmart(final Object actual, final Double expected, final double tolerance) { - Assert.assertTrue(actual instanceof Double, "Not a double"); - assertEqualsDoubleSmart((double)(Double)actual, (double)expected, tolerance); - } - - public static final void assertEqualsDoubleSmart(final double actual, final double expected) { - assertEqualsDoubleSmart(actual, expected, DEFAULT_FLOAT_TOLERANCE); - } - - public static final void assertEqualsSet(final Set actual, final Set expected, final String info) { - final Set actualSet = new HashSet(actual); - final Set expectedSet = new HashSet(expected); - Assert.assertTrue(actualSet.equals(expectedSet), info); // note this is necessary due to testng bug for set comps - } - - public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) { - assertEqualsDoubleSmart(actual, expected, tolerance, null); - } - - public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance, final String message) { - if ( Double.isNaN(expected) ) // NaN == NaN => false unfortunately - Assert.assertTrue(Double.isNaN(actual), "expected 
is nan, actual is not"); - else if ( Double.isInfinite(expected) ) // NaN == NaN => false unfortunately - Assert.assertTrue(Double.isInfinite(actual), "expected is infinite, actual is not"); - else { - final double delta = Math.abs(actual - expected); - final double ratio = Math.abs(actual / expected - 1.0); - Assert.assertTrue(delta < tolerance || ratio < tolerance, "expected = " + expected + " actual = " + actual - + " not within tolerance " + tolerance - + (message == null ? "" : "message: " + message)); - } - } - - public static void assertVariantContextsAreEqual( final VariantContext actual, final VariantContext expected ) { - Assert.assertNotNull(actual, "VariantContext expected not null"); - Assert.assertEquals(actual.getChr(), expected.getChr(), "chr"); - Assert.assertEquals(actual.getStart(), expected.getStart(), "start"); - Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end"); - Assert.assertEquals(actual.getID(), expected.getID(), "id"); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual); - - assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); - Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied"); - Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "isFiltered"); - assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters"); - assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual()); - - Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes"); - if ( expected.hasGenotypes() ) { - assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set"); - Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names"); - final Set samples = expected.getSampleNames(); - for ( final String sample : samples ) { - assertGenotypesAreEqual(actual.getGenotype(sample), 
expected.getGenotype(sample)); - } - } - } - - public static void assertVariantContextStreamsAreEqual(final Iterable actual, final Iterable expected) { - final Iterator actualIT = actual.iterator(); - final Iterator expectedIT = expected.iterator(); - - while ( expectedIT.hasNext() ) { - final VariantContext expectedVC = expectedIT.next(); - if ( expectedVC == null ) - continue; - - VariantContext actualVC; - do { - Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual"); - actualVC = actualIT.next(); - } while ( actualIT.hasNext() && actualVC == null ); - - if ( actualVC == null ) - Assert.fail("Too few records in actual"); - - assertVariantContextsAreEqual(actualVC, expectedVC); - } - Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual"); - } - - - public static void assertGenotypesAreEqual(final Genotype actual, final Genotype expected) { - Assert.assertEquals(actual.getSampleName(), expected.getSampleName(), "Genotype names"); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "Genotype alleles"); - Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString(), "Genotype string"); - Assert.assertEquals(actual.getType(), expected.getType(), "Genotype type"); - - // filters are the same - Assert.assertEquals(actual.getFilters(), expected.getFilters(), "Genotype fields"); - Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "Genotype isFiltered"); - - // inline attributes - Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp"); - Assert.assertTrue(Arrays.equals(actual.getAD(), expected.getAD())); - Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq"); - Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL"); - Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD"); - Assert.assertEquals(actual.hasGQ(), expected.hasGQ(), "Genotype hasGQ"); - Assert.assertEquals(actual.hasDP(), expected.hasDP(), "Genotype hasDP"); - - 
Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods(), "Genotype haslikelihoods"); - Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString(), "Genotype getlikelihoodsString"); - Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods(), "Genotype getLikelihoods"); - Assert.assertTrue(Arrays.equals(actual.getPL(), expected.getPL())); - - Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual(), "Genotype phredScaledQual"); - assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes()); - Assert.assertEquals(actual.isPhased(), expected.isPhased(), "Genotype isPhased"); - Assert.assertEquals(actual.getPloidy(), expected.getPloidy(), "Genotype getPloidy"); - } - - public static void assertVCFHeadersAreEqual(final VCFHeader actual, final VCFHeader expected) { - Assert.assertEquals(actual.getMetaDataInSortedOrder().size(), expected.getMetaDataInSortedOrder().size(), "No VCF header lines"); - - // for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted? 
- //Assert.assertEquals(actual.getMetaDataInInputOrder(), expected.getMetaDataInInputOrder()); - final List actualLines = new ArrayList(actual.getMetaDataInSortedOrder()); - final List expectedLines = new ArrayList(expected.getMetaDataInSortedOrder()); - for ( int i = 0; i < actualLines.size(); i++ ) { - Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines"); - } - } - - public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException { - final Pair> vcfData = GATKVCFUtils.readAllVCs(vcfFile, new VCFCodec()); - final Pair> bcfData = GATKVCFUtils.readAllVCs(bcfFile, new BCF2Codec()); - assertVCFHeadersAreEqual(bcfData.getFirst(), vcfData.getFirst()); - assertVariantContextStreamsAreEqual(bcfData.getSecond(), vcfData.getSecond()); - } - - private static void assertAttributeEquals(final String key, final Object actual, final Object expected) { - if ( expected instanceof Double ) { - // must be very tolerant because doubles are being rounded to 2 sig figs - assertEqualsDoubleSmart(actual, (Double) expected, 1e-2); - } else - Assert.assertEquals(actual, expected, "Attribute " + key); - } - - private static void assertAttributesEquals(final Map actual, Map expected) { - final Set expectedKeys = new HashSet(expected.keySet()); - - for ( final Map.Entry act : actual.entrySet() ) { - final Object actualValue = act.getValue(); - if ( expected.containsKey(act.getKey()) && expected.get(act.getKey()) != null ) { - final Object expectedValue = expected.get(act.getKey()); - if ( expectedValue instanceof List ) { - final List expectedList = (List)expectedValue; - Assert.assertTrue(actualValue instanceof List, act.getKey() + " should be a list but isn't"); - final List actualList = (List)actualValue; - Assert.assertEquals(actualList.size(), expectedList.size(), act.getKey() + " size"); - for ( int i = 0; i < expectedList.size(); i++ ) - assertAttributeEquals(act.getKey(), actualList.get(i), expectedList.get(i)); 
- } else - assertAttributeEquals(act.getKey(), actualValue, expectedValue); - } else { - // it's ok to have a binding in x -> null that's absent in y - Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other"); - } - expectedKeys.remove(act.getKey()); - } - - // now expectedKeys contains only the keys found in expected but not in actual, - // and they must all be null - for ( final String missingExpected : expectedKeys ) { - final Object value = expected.get(missingExpected); - Assert.assertTrue(isMissing(value), "Attribute " + missingExpected + " missing in one but not in other" ); - } - } - - private static final boolean isMissing(final Object value) { - if ( value == null ) return true; - else if ( value.equals(VCFConstants.MISSING_VALUE_v4) ) return true; - else if ( value instanceof List ) { - // handles the case where all elements are null or the list is empty - for ( final Object elt : (List)value) - if ( elt != null ) - return false; - return true; - } else - return false; - } -} diff --git a/public/java/test/org/broadinstitute/sting/MD5DB.java b/public/java/test/org/broadinstitute/sting/MD5DB.java deleted file mode 100644 index 7bd6f7bc4..000000000 --- a/public/java/test/org/broadinstitute/sting/MD5DB.java +++ /dev/null @@ -1,313 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting; - -import org.apache.commons.io.FileUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.walkers.diffengine.DiffEngine; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.io.*; -import java.util.Arrays; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 7/18/11 - * Time: 9:10 AM - * - * Utilities for manipulating the MD5 database of previous results - */ -public class MD5DB { - public static final Logger logger = Logger.getLogger(MD5DB.class); - - /** - * Subdirectory under the ant build directory where we store integration test md5 results - */ - private static final int MAX_RECORDS_TO_READ = 1000000; - private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = -1; - public static final String LOCAL_MD5_DB_DIR = "integrationtests"; - public static final String GLOBAL_MD5_DB_DIR = "/humgen/gsa-hpprojects/GATK/data/integrationtests"; - - // tracking and emitting a data file of origina and new md5s - private final File MD5MismatchesFile; - private final PrintStream md5MismatchStream; - - public MD5DB() { - this(new File(MD5DB.LOCAL_MD5_DB_DIR + "/md5mismatches.txt")); - } - - public MD5DB(final File MD5MismatchesFile) { - this.MD5MismatchesFile = MD5MismatchesFile; - - ensureMd5DbDirectory(); - - logger.debug("Creating md5 mismatch db at " + MD5MismatchesFile); - try { - md5MismatchStream = new PrintStream(new 
FileOutputStream(MD5MismatchesFile)); - md5MismatchStream.printf("%s\t%s\t%s%n", "expected", "observed", "test"); - } catch ( FileNotFoundException e ) { - throw new ReviewedStingException("Failed to open md5 mismatch file", e); - } - - } - - public void close() { - if ( md5MismatchStream != null ) { - logger.debug("Closeing md5 mismatch db at " + MD5MismatchesFile); - md5MismatchStream.close(); - } - } - - // ---------------------------------------------------------------------- - // - // MD5 DB stuff - // - // ---------------------------------------------------------------------- - - /** - * Create the MD5 file directories if necessary - */ - private void ensureMd5DbDirectory() { - File dir = new File(LOCAL_MD5_DB_DIR); - if ( ! dir.exists() ) { - System.out.printf("##### Creating MD5 db %s%n", LOCAL_MD5_DB_DIR); - if ( ! dir.mkdir() ) { - // Need to check AGAIN whether the dir exists, because we might be doing multi-process parallelism - // within the same working directory, and another GATK instance may have come along and created the - // directory between the calls to exists() and mkdir() above. - if ( ! dir.exists() ) { - throw new ReviewedStingException("Infrastructure failure: failed to create md5 directory " + LOCAL_MD5_DB_DIR); - } - } - } - } - - /** - * Returns the path to an already existing file with the md5 contents, or valueIfNotFound - * if no such file exists in the db. - * - * @param md5 - * @param valueIfNotFound - * @return - */ - public String getMD5FilePath(final String md5, final String valueIfNotFound) { - // we prefer the global db to the local DB, so match it first - for ( String dir : Arrays.asList(GLOBAL_MD5_DB_DIR, LOCAL_MD5_DB_DIR)) { - File f = getFileForMD5(md5, dir); - if ( f.exists() && f.canRead() ) - return f.getPath(); - } - - return valueIfNotFound; - } - - /** - * Utility function that given a file's md5 value and the path to the md5 db, - * returns the canonical name of the file. 
For example, if md5 is XXX and db is YYY, - * this will return YYY/XXX.integrationtest - * - * @param md5 - * @param dbPath - * @return - */ - private File getFileForMD5(final String md5, final String dbPath) { - final String basename = String.format("%s.integrationtest", md5); - return new File(dbPath + "/" + basename); - } - - /** - * Copies the results file with md5 value to its canonical file name and db places - * - * @param md5 - * @param resultsFile - */ - private void updateMD5Db(final String md5, final File resultsFile) { - copyFileToDB(getFileForMD5(md5, LOCAL_MD5_DB_DIR), resultsFile); - copyFileToDB(getFileForMD5(md5, GLOBAL_MD5_DB_DIR), resultsFile); - } - - /** - * Low-level utility routine that copies resultsFile to dbFile - * @param dbFile - * @param resultsFile - */ - private void copyFileToDB(File dbFile, final File resultsFile) { - if ( ! dbFile.exists() ) { - // the file isn't already in the db, copy it over - System.out.printf("##### Updating MD5 file: %s%n", dbFile.getPath()); - try { - FileUtils.copyFile(resultsFile, dbFile); - } catch ( IOException e ) { - System.out.printf("##### Skipping update, cannot write file %s%n", dbFile); - } - } else { - //System.out.printf("##### MD5 file is up to date: %s%n", dbFile.getPath()); - } - } - - /** - * Returns the byte[] of the entire contents of file, for md5 calculations - * @param file - * @return - * @throws IOException - */ - private static byte[] getBytesFromFile(File file) throws IOException { - InputStream is = new FileInputStream(file); - - // Get the size of the file - long length = file.length(); - - if (length > Integer.MAX_VALUE) { - // File is too large - } - - // Create the byte array to hold the data - byte[] bytes = new byte[(int) length]; - - // Read in the bytes - int offset = 0; - int numRead = 0; - while (offset < bytes.length - && (numRead = is.read(bytes, offset, bytes.length - offset)) >= 0) { - offset += numRead; - } - - // Ensure all the bytes have been read in - if (offset < 
bytes.length) { - throw new IOException("Could not completely read file " + file.getName()); - } - - // Close the input stream and return bytes - is.close(); - return bytes; - } - - public static class MD5Match { - public final String actualMD5, expectedMD5; - public final String failMessage; - public final String diffEngineOutput; - public final boolean failed; - - public MD5Match(final String actualMD5, final String expectedMD5, final String failMessage, final String diffEngineOutput, final boolean failed) { - this.actualMD5 = actualMD5; - this.expectedMD5 = expectedMD5; - this.failMessage = failMessage; - this.diffEngineOutput = diffEngineOutput; - this.failed = failed; - } - } - - /** - * Tests a file MD5 against an expected value, returning an MD5Match object containing a description of the - * match or mismatch. In case of a mismatch, outputs a description of the mismatch to various log files/streams. - * - * NOTE: This function WILL NOT throw an exception if the MD5s are different. - * - * @param testName Name of the test. - * @param testClassName Name of the class that contains the test. - * @param resultsFile File to MD5. - * @param expectedMD5 Expected MD5 value. - * @param parameterize If true or if expectedMD5 is an empty string, will print out the calculated MD5 instead of error text. - * @return an MD5Match object containing a description of the match/mismatch. 
Will have its "failed" field set - * to true if there was a mismatch (unless we're using the "parameterize" argument) - */ - public MD5Match testFileMD5(final String testName, final String testClassName, final File resultsFile, final String expectedMD5, final boolean parameterize) { - final String actualMD5 = calculateFileMD5(resultsFile); - String diffEngineOutput = ""; - String failMessage = ""; - boolean failed = false; - - // copy md5 to integrationtests - updateMD5Db(actualMD5, resultsFile); - - if (parameterize || expectedMD5.equals("")) { - BaseTest.log(String.format("PARAMETERIZATION: file %s has md5 = %s", resultsFile, actualMD5)); - } else if ( ! expectedMD5.equals(actualMD5) ) { - failed = true; - failMessage = String.format("%s:%s has mismatching MD5s: expected=%s observed=%s", testClassName, testName, expectedMD5, actualMD5); - diffEngineOutput = logMD5MismatchAndGetDiffEngineOutput(testName, testClassName, expectedMD5, actualMD5); - } - - return new MD5Match(actualMD5, expectedMD5, failMessage, diffEngineOutput, failed); - } - - /** - * Calculates the MD5 for the specified file and returns it as a String - * - * @param file file whose MD5 to calculate - * @return file's MD5 in String form - * @throws RuntimeException if the file could not be read - */ - public String calculateFileMD5( final File file ) { - try { - return Utils.calcMD5(getBytesFromFile(file)); - } - catch ( Exception e ) { - throw new RuntimeException("Failed to read bytes from file: " + file + " for MD5 calculation", e); - } - } - - /** - * Logs a description (including diff engine output) of the MD5 mismatch between the expectedMD5 - * and actualMD5 to a combination of BaseTest.log(), the md5MismatchStream, and stdout, then returns - * the diff engine output. 
- * - * @param testName name of the test that generated the mismatch - * @param testClassName name of the class containing the test that generated the mismatch - * @param expectedMD5 the MD5 we were expecting from this test - * @param actualMD5 the MD5 we actually calculated from the test output - * @return the diff engine output produced while logging the description of the mismatch - */ - private String logMD5MismatchAndGetDiffEngineOutput(final String testName, final String testClassName, final String expectedMD5, final String actualMD5) { - System.out.printf("##### Test %s:%s is going to fail #####%n", testClassName, testName); - String pathToExpectedMD5File = getMD5FilePath(expectedMD5, "[No DB file found]"); - String pathToFileMD5File = getMD5FilePath(actualMD5, "[No DB file found]"); - BaseTest.log(String.format("expected %s", expectedMD5)); - BaseTest.log(String.format("calculated %s", actualMD5)); - BaseTest.log(String.format("diff %s %s", pathToExpectedMD5File, pathToFileMD5File)); - - md5MismatchStream.printf("%s\t%s\t%s%n", expectedMD5, actualMD5, testName); - md5MismatchStream.flush(); - - // inline differences - String diffEngineOutput = ""; - final ByteArrayOutputStream baos = new ByteArrayOutputStream(); - final PrintStream ps = new PrintStream(baos); - DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0, MAX_RAW_DIFFS_TO_SUMMARIZE, false); - boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), MAX_RECORDS_TO_READ, params); - if ( success ) { - diffEngineOutput = baos.toString(); - BaseTest.log(diffEngineOutput); - System.out.printf("Note that the above list is not comprehensive. At most 20 lines of output, and 10 specific differences will be listed. 
Please use -T DiffObjects -R public/testdata/exampleFASTA.fasta -m %s -t %s to explore the differences more freely%n", - pathToExpectedMD5File, pathToFileMD5File); - } - ps.close(); - - return diffEngineOutput; - } -} diff --git a/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java deleted file mode 100644 index f08e04c56..000000000 --- a/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java +++ /dev/null @@ -1,949 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.commandline; - -import org.apache.commons.io.FileUtils; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.testng.Assert; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.EnumSet; -/** - * Test suite for the parsing engine. - */ -public class ParsingEngineUnitTest extends BaseTest { - /** we absolutely cannot have this file existing, or we'll fail the UnitTest */ - private final static String NON_EXISTANT_FILENAME_VCF = "this_file_should_not_exist_on_disk_123456789.vcf"; - private ParsingEngine parsingEngine; - - @BeforeMethod - public void setUp() { - parsingEngine = new ParsingEngine(null); - RodBinding.resetNameCounter(); - } - - private class InputFileArgProvider { - @Argument(fullName="input_file",doc="input file",shortName="I") - public String inputFile; - } - - @Test - public void shortNameArgumentTest() { - final String[] commandLine = new String[] {"-I","na12878.bam"}; - - parsingEngine.addArgumentSource( InputFileArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - InputFileArgProvider argProvider = new InputFileArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.inputFile,"na12878.bam","Argument is not correctly initialized"); - } - - @Test - public void multiCharShortNameArgumentTest() { - final String[] commandLine = new String[] {"-out","out.txt"}; - - parsingEngine.addArgumentSource( MultiCharShortNameArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - MultiCharShortNameArgProvider argProvider = new 
MultiCharShortNameArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.outputFile,"out.txt","Argument is not correctly initialized"); - } - - - private class MultiCharShortNameArgProvider { - @Argument(shortName="out", doc="output file") - public String outputFile; - } - - @Test - public void longNameArgumentTest() { - final String[] commandLine = new String[] {"--input_file", "na12878.bam"}; - - parsingEngine.addArgumentSource( InputFileArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - InputFileArgProvider argProvider = new InputFileArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.inputFile,"na12878.bam","Argument is not correctly initialized"); - } - - @Test - public void extraWhitespaceTest() { - final String[] commandLine = new String[] {" --input_file ", "na12878.bam"}; - - parsingEngine.addArgumentSource( InputFileArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - InputFileArgProvider argProvider = new InputFileArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.inputFile,"na12878.bam","Argument is not correctly initialized"); - } - - @Test - public void primitiveArgumentTest() { - final String[] commandLine = new String[] {"--foo", "5"}; - - parsingEngine.addArgumentSource( PrimitiveArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - PrimitiveArgProvider argProvider = new PrimitiveArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.foo, 5, "Argument is not correctly initialized"); - } - - @Test(expectedExceptions=MissingArgumentValueException.class) - public void primitiveArgumentNoValueTest() { - final String[] commandLine = new String[] {"--foo"}; - - parsingEngine.addArgumentSource( PrimitiveArgProvider.class ); - 
parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - PrimitiveArgProvider argProvider = new PrimitiveArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.foo, 5, "Argument is not correctly initialized"); - } - - private class PrimitiveArgProvider { - @Argument(doc="simple integer") - int foo; - } - - @Test - public void flagTest() { - final String[] commandLine = new String[] {"--all_loci"}; - - parsingEngine.addArgumentSource( AllLociArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - AllLociArgProvider argProvider = new AllLociArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertTrue(argProvider.allLoci,"Argument is not correctly initialized"); - } - - private class AllLociArgProvider { - @Argument(fullName="all_loci",shortName="A", doc="all loci") - public boolean allLoci = false; - } - - @Test - public void arrayTest() { - final String[] commandLine = new String[] {"-I", "foo.txt", "--input_file", "bar.txt"}; - - parsingEngine.addArgumentSource( MultiValueArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - MultiValueArgProvider argProvider = new MultiValueArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.inputFile.length, 2, "Argument array is of incorrect length"); - Assert.assertEquals(argProvider.inputFile[0],"foo.txt","1st filename is incorrect"); - Assert.assertEquals(argProvider.inputFile[1],"bar.txt","2nd filename is incorrect"); - } - - private class MultiValueArgProvider { - @Argument(fullName="input_file",shortName="I", doc="input file") - public String[] inputFile; - } - - @Test - public void enumTest() { - final String[] commandLine = new String[] { "--test_enum", "TWO" }; - - parsingEngine.addArgumentSource( EnumArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - 
EnumArgProvider argProvider = new EnumArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.testEnum, TestEnum.TWO, "Enum value is not correct"); - } - - @Test - public void enumMixedCaseTest() { - final String[] commandLine = new String[] { "--test_enum", "oNe" }; - - parsingEngine.addArgumentSource( EnumArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - EnumArgProvider argProvider = new EnumArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.testEnum, TestEnum.ONE, "Enum value is not correct"); - } - - @Test - public void enumDefaultTest() { - final String[] commandLine = new String[] {}; - - parsingEngine.addArgumentSource( EnumArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - EnumArgProvider argProvider = new EnumArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.testEnum, TestEnum.THREE, "Enum value is not correct"); - } - - public enum TestEnum { ONE, TWO, THREE } - - private class EnumArgProvider { - @Argument(fullName="test_enum",shortName="ti",doc="test enum",required=false) - public TestEnum testEnum = TestEnum.THREE; - } - - @Test - public void typedCollectionTest() { - final String[] commandLine = new String[] { "-N","2","-N","4","-N","6","-N","8","-N","10" }; - - parsingEngine.addArgumentSource( IntegerListArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - IntegerListArgProvider argProvider = new IntegerListArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertNotNull(argProvider.integers, "Argument array is null"); - Assert.assertEquals(argProvider.integers.size(), 5, "Argument array is of incorrect length"); - Assert.assertEquals(argProvider.integers.get(0).intValue(), 2, "1st integer is incorrect"); - 
Assert.assertEquals(argProvider.integers.get(1).intValue(), 4, "2nd integer is incorrect"); - Assert.assertEquals(argProvider.integers.get(2).intValue(), 6, "3rd integer is incorrect"); - Assert.assertEquals(argProvider.integers.get(3).intValue(), 8, "4th integer is incorrect"); - Assert.assertEquals(argProvider.integers.get(4).intValue(), 10, "5th integer is incorrect"); - } - - private class IntegerListArgProvider { - @Argument(fullName="integer_list",shortName="N",doc="integer list") - public List integers; - } - - @Test - public void untypedCollectionTest() { - final String[] commandLine = new String[] { "-N","2","-N","4","-N","6","-N","8","-N","10" }; - - parsingEngine.addArgumentSource( UntypedListArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - UntypedListArgProvider argProvider = new UntypedListArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertNotNull(argProvider.integers, "Argument array is null"); - Assert.assertEquals(argProvider.integers.size(), 5, "Argument array is of incorrect length"); - Assert.assertEquals(argProvider.integers.get(0), "2", "1st integer is incorrect"); - Assert.assertEquals(argProvider.integers.get(1), "4", "2nd integer is incorrect"); - Assert.assertEquals(argProvider.integers.get(2), "6", "3rd integer is incorrect"); - Assert.assertEquals(argProvider.integers.get(3), "8", "4th integer is incorrect"); - Assert.assertEquals(argProvider.integers.get(4), "10", "5th integer is incorrect"); - } - - private class UntypedListArgProvider { - @Argument(fullName="untyped_list",shortName="N", doc="untyped list") - public List integers; - } - - @Test(expectedExceptions=MissingArgumentException.class) - public void requiredArgTest() { - final String[] commandLine = new String[0]; - - parsingEngine.addArgumentSource( RequiredArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - } - - private class RequiredArgProvider { - 
@Argument(required=true,doc="value") - public Integer value; - } - - @Test - public void defaultValueTest() { - // First try getting the default. - String[] commandLine = new String[0]; - - parsingEngine.addArgumentSource( DefaultValueArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - DefaultValueArgProvider argProvider = new DefaultValueArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.value.intValue(), 42, "Default value is not correctly initialized"); - - // Then try to override it. - commandLine = new String[] { "--value", "27" }; - - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.value.intValue(), 27, "Default value is not correctly initialized"); - } - - private class DefaultValueArgProvider { - @Argument(doc="value",required=false) - public Integer value = 42; - } - - @Test - public void disableValidationOfRequiredArgTest() { - final String[] commandLine = new String[0]; - - parsingEngine.addArgumentSource( RequiredArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate( EnumSet.of(ParsingEngine.ValidationType.MissingRequiredArgument) ); - - RequiredArgProvider argProvider = new RequiredArgProvider(); - parsingEngine.loadArgumentsIntoObject(argProvider ); - - Assert.assertNull(argProvider.value, "Value should have remain unset"); - } - - @Test - public void unrequiredArgTest() { - final String[] commandLine = new String[0]; - - parsingEngine.addArgumentSource( UnrequiredArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - UnrequiredArgProvider argProvider = new UnrequiredArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertNull(argProvider.value, "Value was unrequired and unspecified; contents should be null"); - } - - private class UnrequiredArgProvider 
{ - @Argument(required=false,doc="unrequired value") - public Integer value; - } - - @Test(expectedExceptions=InvalidArgumentException.class) - public void invalidArgTest() { - final String[] commandLine = new String[] { "--foo" }; - - parsingEngine.addArgumentSource( UnrequiredArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - } - - @Test(expectedExceptions= ReviewedStingException.class) - public void duplicateLongNameTest() { - parsingEngine.addArgumentSource( DuplicateLongNameProvider.class ); - } - - private class DuplicateLongNameProvider { - @Argument(fullName="myarg",doc="my arg") - public Integer foo; - - @Argument(fullName="myarg", doc="my arg") - public Integer bar; - } - - @Test(expectedExceptions= ReviewedStingException.class) - public void duplicateShortNameTest() { - parsingEngine.addArgumentSource( DuplicateShortNameProvider.class ); - } - - - private class DuplicateShortNameProvider { - @Argument(shortName="myarg", doc="my arg") - public Integer foo; - - @Argument(shortName="myarg", doc="my arg") - public Integer bar; - } - - @Test(expectedExceptions=UnmatchedArgumentException.class) - public void missingArgumentNameTest() { - final String[] commandLine = new String[] {"foo.txt"}; - - parsingEngine.addArgumentSource( NoArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - } - - private class NoArgProvider { - - } - - @Test(expectedExceptions=UnmatchedArgumentException.class) - public void extraValueTest() { - final String[] commandLine = new String[] {"-I", "foo.txt", "bar.txt"}; - - parsingEngine.addArgumentSource( InputFileArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - } - - @Test(expectedExceptions=MissingArgumentException.class) - public void multipleInvalidArgTest() { - final String[] commandLine = new String[] {"-N1", "-N2", "-N3"}; - - parsingEngine.addArgumentSource( RequiredArgProvider.class ); - parsingEngine.parse( 
commandLine ); - parsingEngine.validate(); - } - - @Test(expectedExceptions=TooManyValuesForArgumentException.class) - public void invalidArgCountTest() { - final String[] commandLine = new String[] {"--value","1","--value","2","--value","3"}; - - parsingEngine.addArgumentSource( RequiredArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - } - - @Test - public void packageProtectedArgTest() { - final String[] commandLine = new String[] {"--foo", "1"}; - - parsingEngine.addArgumentSource( PackageProtectedArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - PackageProtectedArgProvider argProvider = new PackageProtectedArgProvider(); - parsingEngine.loadArgumentsIntoObject(argProvider); - - Assert.assertEquals(argProvider.foo.intValue(), 1, "Argument is not correctly initialized"); - } - - private class PackageProtectedArgProvider { - @Argument(doc="foo") - Integer foo; - } - - @Test - public void derivedArgTest() { - final String[] commandLine = new String[] {"--bar", "5"}; - - parsingEngine.addArgumentSource( DerivedArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - DerivedArgProvider argProvider = new DerivedArgProvider(); - parsingEngine.loadArgumentsIntoObject(argProvider); - - Assert.assertEquals(argProvider.bar.intValue(), 5, "Argument is not correctly initialized"); - } - - private class DerivedArgProvider extends BaseArgProvider { - } - - private class BaseArgProvider { - @Argument(doc="bar") - public Integer bar; - } - - @Test - public void correctDefaultArgNameTest() { - parsingEngine.addArgumentSource( CamelCaseArgProvider.class ); - - DefinitionMatcher matcher = ArgumentDefinitions.FullNameDefinitionMatcher; - ArgumentDefinition definition = parsingEngine.argumentDefinitions.findArgumentDefinition("myarg", matcher); - - Assert.assertNotNull(definition, "Invalid default argument name assigned"); - } - - @SuppressWarnings("unused") - private 
class CamelCaseArgProvider { - @Argument(doc="my arg") - Integer myArg; - } - - @Test(expectedExceptions=UnmatchedArgumentException.class) - public void booleanWithParameterTest() { - final String[] commandLine = new String[] {"--mybool", "true"}; - - parsingEngine.addArgumentSource( BooleanArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - } - - @SuppressWarnings("unused") - private class BooleanArgProvider { - @Argument(doc="my bool") - boolean myBool; - } - - @Test - public void validParseForAnalysisTypeTest() { - final String[] commandLine = new String[] {"--analysis_type", "Pileup" }; - - parsingEngine.addArgumentSource( AnalysisTypeArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate( EnumSet.of(ParsingEngine.ValidationType.MissingRequiredArgument) ); - - AnalysisTypeArgProvider argProvider = new AnalysisTypeArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.Analysis_Name,"Pileup","Argument is not correctly initialized"); - } - - private class AnalysisTypeArgProvider { - @Argument(fullName="analysis_type", shortName="T", doc="Type of analysis to run") - public String Analysis_Name = null; - } - - @Test(expectedExceptions=TooManyValuesForArgumentException.class) - public void invalidParseForAnalysisTypeTest() { - final String[] commandLine = new String[] {"--analysis_type", "Pileup", "-T", "CountReads" }; - - parsingEngine.addArgumentSource( AnalysisTypeArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate( EnumSet.of(ParsingEngine.ValidationType.MissingRequiredArgument) ); - } - - @Test(expectedExceptions=ArgumentsAreMutuallyExclusiveException.class) - public void mutuallyExclusiveArgumentsTest() { - // Passing only foo should work fine... 
- String[] commandLine = new String[] {"--foo","5"}; - - parsingEngine.addArgumentSource( MutuallyExclusiveArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - MutuallyExclusiveArgProvider argProvider = new MutuallyExclusiveArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.foo.intValue(), 5, "Argument is not correctly initialized"); - - // But when foo and bar come together, danger! - commandLine = new String[] {"--foo","5","--bar","6"}; - - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - } - - @SuppressWarnings("unused") - private class MutuallyExclusiveArgProvider { - @Argument(doc="foo",exclusiveOf="bar") - Integer foo; - - @Argument(doc="bar",required=false) - Integer bar; - } - - @Test(expectedExceptions=InvalidArgumentValueException.class) - public void argumentValidationTest() { - // Passing only foo should work fine... - String[] commandLine = new String[] {"--value","521"}; - - parsingEngine.addArgumentSource( ValidatingArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - ValidatingArgProvider argProvider = new ValidatingArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.value.intValue(), 521, "Argument is not correctly initialized"); - - // Try some invalid arguments - commandLine = new String[] {"--value","foo"}; - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - } - - private class ValidatingArgProvider { - @Argument(doc="value",validation="\\d+") - Integer value; - } - - @Test - public void argumentCollectionTest() { - String[] commandLine = new String[] { "--value", "5" }; - - parsingEngine.addArgumentSource( ArgumentCollectionProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - ArgumentCollectionProvider argProvider = new ArgumentCollectionProvider(); - 
parsingEngine.loadArgumentsIntoObject(argProvider); - - Assert.assertEquals(argProvider.rap.value.intValue(), 5, "Argument is not correctly initialized"); - } - - private class ArgumentCollectionProvider { - @ArgumentCollection - RequiredArgProvider rap = new RequiredArgProvider(); - } - - @Test(expectedExceptions= ReviewedStingException.class) - public void multipleArgumentCollectionTest() { - parsingEngine.addArgumentSource( MultipleArgumentCollectionProvider.class ); - } - - @SuppressWarnings("unused") - private class MultipleArgumentCollectionProvider { - @ArgumentCollection - RequiredArgProvider rap1 = new RequiredArgProvider(); - @ArgumentCollection - RequiredArgProvider rap2 = new RequiredArgProvider(); - } - - // -------------------------------------------------------------------------------- - // - // Tests of the RodBinding system - // - // -------------------------------------------------------------------------------- - - private class SingleRodBindingArgProvider { - @Input(fullName="binding", shortName="V", required=true) - public RodBinding binding; - } - - @Test - public void basicRodBindingArgumentTest() { - final String[] commandLine = new String[] {"-V:vcf",NON_EXISTANT_FILENAME_VCF}; - - parsingEngine.addArgumentSource( SingleRodBindingArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - SingleRodBindingArgProvider argProvider = new SingleRodBindingArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly"); - Assert.assertEquals(argProvider.binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value"); - Assert.assertEquals(argProvider.binding.getType(), Feature.class, "Type isn't set to its expected value"); - Assert.assertEquals(argProvider.binding.isBound(), true, "Bound() isn't returning its expected value"); - 
Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); - } - - private class ShortNameOnlyRodBindingArgProvider { - @Input(shortName="short", required=false) - public RodBinding binding; // = RodBinding.makeUnbound(Feature.class); - } - - @Test - public void shortNameOnlyRodBindingArgumentTest() { - final String[] commandLine = new String[] {"-short:vcf",NON_EXISTANT_FILENAME_VCF}; - - parsingEngine.addArgumentSource( ShortNameOnlyRodBindingArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - ShortNameOnlyRodBindingArgProvider argProvider = new ShortNameOnlyRodBindingArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly"); - Assert.assertEquals(argProvider.binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value"); - Assert.assertEquals(argProvider.binding.getType(), Feature.class, "Type isn't set to its expected value"); - Assert.assertEquals(argProvider.binding.isBound(), true, "Bound() isn't returning its expected value"); - Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); - } - - private class OptionalRodBindingArgProvider { - @Input(fullName="binding", shortName="V", required=false) - public RodBinding binding; - - @Input(fullName="bindingNull", shortName="VN", required=false) - public RodBinding bindingNull = null; - } - - @Test - public void optionalRodBindingArgumentTest() { - final String[] commandLine = new String[] {}; - - parsingEngine.addArgumentSource( OptionalRodBindingArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - OptionalRodBindingArgProvider argProvider = new OptionalRodBindingArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertNotNull(argProvider.binding, "Default value not 
applied corrected to RodBinding"); - Assert.assertEquals(argProvider.binding.getName(), RodBinding.UNBOUND_VARIABLE_NAME, "Name isn't set properly"); - Assert.assertEquals(argProvider.binding.getSource(), RodBinding.UNBOUND_SOURCE, "Source isn't set to its expected value"); - Assert.assertEquals(argProvider.binding.getType(), Feature.class, "Type isn't set to its expected value"); - Assert.assertEquals(argProvider.binding.isBound(), false, "Bound() isn't returning its expected value"); - Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 0, "Tags aren't correctly set"); - - Assert.assertNotNull(argProvider.bindingNull, "Default value not applied corrected to RodBinding"); - Assert.assertEquals(argProvider.bindingNull.getName(), RodBinding.UNBOUND_VARIABLE_NAME, "Name isn't set properly"); - Assert.assertEquals(argProvider.bindingNull.getSource(), RodBinding.UNBOUND_SOURCE, "Source isn't set to its expected value"); - Assert.assertEquals(argProvider.bindingNull.getType(), VariantContext.class, "Type isn't set to its expected value"); - Assert.assertEquals(argProvider.bindingNull.isBound(), false, "Bound() isn't returning its expected value"); - Assert.assertEquals(argProvider.bindingNull.getTags().getPositionalTags().size(), 0, "Tags aren't correctly set"); - } - - @Test(expectedExceptions = UserException.class) - public void rodBindingArgumentTestMissingType() { - final String[] commandLine = new String[] {"-V",NON_EXISTANT_FILENAME_VCF}; - - parsingEngine.addArgumentSource( SingleRodBindingArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - SingleRodBindingArgProvider argProvider = new SingleRodBindingArgProvider(); - parsingEngine.loadArgumentsIntoObject(argProvider); - } - - @Test(expectedExceptions = UserException.class) - public void rodBindingArgumentTestTooManyTags() { - final String[] commandLine = new String[] {"-V:x,y,z",NON_EXISTANT_FILENAME_VCF}; - - parsingEngine.addArgumentSource( 
SingleRodBindingArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - SingleRodBindingArgProvider argProvider = new SingleRodBindingArgProvider(); - parsingEngine.loadArgumentsIntoObject(argProvider); - } - - private class VariantContextRodBindingArgProvider { - @Input(fullName = "binding", shortName="V") - public RodBinding binding; - } - - @Test - public void variantContextBindingArgumentTest() { - final String[] commandLine = new String[] {"-V:vcf",NON_EXISTANT_FILENAME_VCF}; - - parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly"); - Assert.assertEquals(argProvider.binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value"); - Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value"); - Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); - } - - private class ListRodBindingArgProvider { - @Input(fullName = "binding", shortName="V", required=false) - public List> bindings; - } - - @Test - public void listRodBindingArgumentTest() { - final String[] commandLine = new String[] {"-V:vcf",NON_EXISTANT_FILENAME_VCF}; - - parsingEngine.addArgumentSource( ListRodBindingArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - ListRodBindingArgProvider argProvider = new ListRodBindingArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.bindings.size(), 1, "Unexpected number of bindings"); - RodBinding binding = argProvider.bindings.get(0); - 
Assert.assertEquals(binding.getName(), "binding", "Name isn't set properly"); - Assert.assertEquals(binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value"); - Assert.assertEquals(binding.getType(), Feature.class, "Type isn't set to its expected value"); - Assert.assertEquals(binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); - } - - @Test - public void listRodBindingArgumentTest2Args() { - final String[] commandLine = new String[] {"-V:vcf",NON_EXISTANT_FILENAME_VCF, "-V:vcf", "bar.vcf"}; - - parsingEngine.addArgumentSource( ListRodBindingArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - ListRodBindingArgProvider argProvider = new ListRodBindingArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.bindings.size(), 2, "Unexpected number of bindings"); - - RodBinding binding = argProvider.bindings.get(0); - Assert.assertEquals(binding.getName(), "binding", "Name isn't set properly"); - Assert.assertEquals(binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value"); - Assert.assertEquals(binding.getType(), Feature.class, "Type isn't set to its expected value"); - Assert.assertEquals(binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); - - RodBinding binding2 = argProvider.bindings.get(1); - Assert.assertEquals(binding2.getName(), "binding2", "Name isn't set properly"); - Assert.assertEquals(binding2.getSource(), "bar.vcf", "Source isn't set to its expected value"); - Assert.assertEquals(binding2.getType(), Feature.class, "Type isn't set to its expected value"); - Assert.assertEquals(binding2.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); - } - - @Test - public void listRodBindingArgumentTest0Args() { - final String[] commandLine = new String[] {}; - - parsingEngine.addArgumentSource( ListRodBindingArgProvider.class ); - 
parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - ListRodBindingArgProvider argProvider = new ListRodBindingArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertNull(argProvider.bindings, "Bindings were not null"); - } - - @Test - public void listRodBindingArgumentTestExplicitlyNamed() { - final String[] commandLine = new String[] {"-V:foo,vcf",NON_EXISTANT_FILENAME_VCF, "-V:foo,vcf", "bar.vcf"}; - - parsingEngine.addArgumentSource( ListRodBindingArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - ListRodBindingArgProvider argProvider = new ListRodBindingArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.bindings.size(), 2, "Unexpected number of bindings"); - Assert.assertEquals(argProvider.bindings.get(0).getName(), "foo", "Name isn't set properly"); - Assert.assertEquals(argProvider.bindings.get(1).getName(), "foo2", "Name isn't set properly"); - } - - private final static String HISEQ_VCF = privateTestDir + "HiSeq.10000.vcf"; - private final static String TRANCHES_FILE = privateTestDir + "tranches.6.txt"; - - @Test - public void variantContextBindingTestDynamicTyping1() { - final String[] commandLine = new String[] {"-V", HISEQ_VCF}; - - parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly"); - Assert.assertEquals(argProvider.binding.getSource(), HISEQ_VCF, "Source isn't set to its expected value"); - Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value"); - 
Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 0, "Tags aren't correctly set"); - } - - @Test - public void variantContextBindingTestDynamicTypingNameAsSingleArgument() { - final String[] commandLine = new String[] {"-V:name", HISEQ_VCF}; - - parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.binding.getName(), "name", "Name isn't set properly"); - Assert.assertEquals(argProvider.binding.getSource(), HISEQ_VCF, "Source isn't set to its expected value"); - Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value"); - Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); - } - - @Test() - public void variantContextBindingTestDynamicTypingTwoTagsPassing() { - final String[] commandLine = new String[] {"-V:name,vcf", HISEQ_VCF}; - - parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - - Assert.assertEquals(argProvider.binding.getName(), "name", "Name isn't set properly"); - Assert.assertEquals(argProvider.binding.getSource(), HISEQ_VCF, "Source isn't set to its expected value"); - Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value"); - Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 2, "Tags aren't correctly set"); - } - - @Test() - public void variantContextBindingTestDynamicTypingTwoTagsCausingTypeFailure() { - final 
String[] commandLine = new String[] {"-V:name,beagle", HISEQ_VCF}; - - parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); - parsingEngine.loadArgumentsIntoObject(argProvider); - - Assert.assertEquals(argProvider.binding.getName(), "name", "Name isn't set properly"); - Assert.assertEquals(argProvider.binding.getSource(), HISEQ_VCF, "Source isn't set to its expected value"); - Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value"); - Assert.assertEquals(argProvider.binding.getTribbleType(), "beagle", "Type isn't set to its expected value"); - Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 2, "Tags aren't correctly set"); - } - - @Test(expectedExceptions = UserException.class) - public void variantContextBindingTestDynamicTypingUnknownTribbleType() { - final String[] commandLine = new String[] {"-V", TRANCHES_FILE}; - - parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); - parsingEngine.parse( commandLine ); - parsingEngine.validate(); - - VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); - parsingEngine.loadArgumentsIntoObject( argProvider ); - } - - @Test - public void argumentListTest() throws IOException { - File argsFile = BaseTest.createTempFile("args.", ".list"); - try { - FileUtils.write(argsFile, "-I na12878.bam"); - final String[] commandLine = new String[] {"-args", argsFile.getPath()}; - parsingEngine.addArgumentSource(InputFileArgProvider.class); - parsingEngine.parse(commandLine); - parsingEngine.validate(); - - InputFileArgProvider argProvider = new InputFileArgProvider(); - parsingEngine.loadArgumentsIntoObject(argProvider); - - Assert.assertEquals(argProvider.inputFile, "na12878.bam", "Argument is not 
correctly initialized"); - } finally { - FileUtils.deleteQuietly(argsFile); - } - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java deleted file mode 100644 index 84bc6e080..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java +++ /dev/null @@ -1,248 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.commandline.ArgumentException; -import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.iterators.ReadTransformer; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.gatk.walkers.readutils.PrintReads; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.*; - -/** - * Tests selected functionality in the GenomeAnalysisEngine class - */ -public class GenomeAnalysisEngineUnitTest extends BaseTest { - - @Test(expectedExceptions=UserException.class) - public void testDuplicateSamFileHandlingSingleDuplicate() throws Exception { - GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - - Collection samFiles = new ArrayList(); - samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); - - testEngine.setSAMFileIDs(samFiles); - testEngine.checkForDuplicateSamFiles(); - } - - @Test(expectedExceptions=UserException.class) - public void testDuplicateSamFileHandlingMultipleDuplicates() throws Exception { - GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - - Collection samFiles = new ArrayList(); - samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); - samFiles.add(new SAMReaderID(new 
File("public/testdata/exampleNORG.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags())); - - testEngine.setSAMFileIDs(samFiles); - testEngine.checkForDuplicateSamFiles(); - } - - @Test(expectedExceptions=UserException.class) - public void testDuplicateSamFileHandlingAbsoluteVsRelativePath() { - GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - - final File relativePathToBAMFile = new File("public/testdata/exampleBAM.bam"); - final File absolutePathToBAMFile = new File(relativePathToBAMFile.getAbsolutePath()); - Collection samFiles = new ArrayList(); - samFiles.add(new SAMReaderID(relativePathToBAMFile, new Tags())); - samFiles.add(new SAMReaderID(absolutePathToBAMFile, new Tags())); - - testEngine.setSAMFileIDs(samFiles); - testEngine.checkForDuplicateSamFiles(); - } - - @Test - public void testEmptyIntervalSetHandling() throws Exception { - GenomeLocParser genomeLocParser = new GenomeLocParser(ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000).getSequenceDictionary()); - - GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - - testEngine.setWalker(new PrintReads()); - testEngine.setIntervals(new GenomeLocSortedSet(genomeLocParser)); - - testEngine.validateSuppliedIntervals(); - } - - @Test - public void testLoadWellFormedSampleRenameMapFile() throws IOException { - final File mapFile = createTestSampleRenameMapFile(Arrays.asList("/foo/bar/first.bam newSample1", - "/foo/bar/second.bam newSample2", - "/foo/bar2/third.bam newSample3")); - final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - final Map renameMap = engine.loadSampleRenameMap(mapFile); - - Assert.assertEquals(renameMap.size(), 3, "Sample rename map was wrong size after loading from file"); - - final Iterator expectedResultsIterator = Arrays.asList("/foo/bar/first.bam", "newSample1", "/foo/bar/second.bam", "newSample2", 
"/foo/bar2/third.bam", "newSample3").iterator(); - while ( expectedResultsIterator.hasNext() ) { - final String expectedKey = expectedResultsIterator.next(); - final String expectedValue = expectedResultsIterator.next(); - - Assert.assertNotNull(renameMap.get(new SAMReaderID(expectedKey, new Tags())), String.format("Entry for %s not found in sample rename map", expectedKey)); - Assert.assertEquals(renameMap.get(new SAMReaderID(expectedKey, new Tags())), expectedValue, "Wrong value in sample rename map for " + expectedKey); - } - } - - @DataProvider(name = "MalformedSampleRenameMapFileDataProvider") - public Object[][] generateMalformedSampleRenameMapFiles() throws IOException { - final List tests = new ArrayList(); - - tests.add(new Object[]{"testLoadSampleRenameMapFileNonExistentFile", - new File("/foo/bar/nonexistent")}); - tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine1", - createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam"))}); - tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine2", - createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam newSample extraField"))}); - tests.add(new Object[]{"testLoadSampleRenameMapFileNonAbsoluteBamPath", - createTestSampleRenameMapFile(Arrays.asList("relative/path/to/foo.bam newSample"))}); - tests.add(new Object[]{"testLoadSampleRenameMapFileDuplicateBamPath", - createTestSampleRenameMapFile(Arrays.asList("/path/to/dupe.bam newSample1", - "/path/to/dupe.bam newSample2"))}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "MalformedSampleRenameMapFileDataProvider", expectedExceptions = UserException.class) - public void testLoadMalformedSampleRenameMapFile( final String testName, final File mapFile ) { - logger.info("Executing test " + testName); - - final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - final Map renameMap = engine.loadSampleRenameMap(mapFile); - } - - private File createTestSampleRenameMapFile( final List contents ) throws 
IOException { - final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp"); - final PrintWriter writer = new PrintWriter(mapFile); - - for ( final String line : contents ) { - writer.println(line); - } - writer.close(); - - return mapFile; - } - - /////////////////////////////////////////////////// - // Test the ReadTransformer ordering enforcement // - /////////////////////////////////////////////////// - - public static class TestReadTransformer extends ReadTransformer { - - private OrderingConstraint orderingConstraint = OrderingConstraint.DO_NOT_CARE; - private boolean enabled; - - protected TestReadTransformer(final OrderingConstraint orderingConstraint) { - this.orderingConstraint = orderingConstraint; - enabled = true; - } - - // need this because PackageUtils will pick up this class as a possible ReadTransformer - protected TestReadTransformer() { - enabled = false; - } - - @Override - public OrderingConstraint getOrderingConstraint() { return orderingConstraint; } - - @Override - public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { return ApplicationTime.HANDLED_IN_WALKER; } - - @Override - public boolean enabled() { return enabled; } - - @Override - public GATKSAMRecord apply(final GATKSAMRecord read) { return read; } - - } - - @DataProvider(name = "ReadTransformerData") - public Object[][] makeReadTransformerData() { - List tests = new ArrayList(); - - for ( final ReadTransformer.OrderingConstraint orderingConstraint1 : ReadTransformer.OrderingConstraint.values() ) { - for ( final ReadTransformer.OrderingConstraint orderingConstraint2 : ReadTransformer.OrderingConstraint.values() ) { - for ( final ReadTransformer.OrderingConstraint orderingConstraint3 : ReadTransformer.OrderingConstraint.values() ) { - tests.add(new Object[]{orderingConstraint1, orderingConstraint2, orderingConstraint3}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ReadTransformerData") - 
public void testReadTransformer(final ReadTransformer.OrderingConstraint oc1, final ReadTransformer.OrderingConstraint oc2, final ReadTransformer.OrderingConstraint oc3) { - - final GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - final List readTransformers = new ArrayList(3); - readTransformers.add(new TestReadTransformer(oc1)); - readTransformers.add(new TestReadTransformer(oc2)); - readTransformers.add(new TestReadTransformer(oc3)); - - final boolean shouldThrowException = numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_FIRST, oc1, oc2, oc3) > 1 || - numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_LAST, oc1, oc2, oc3) > 1; - - try { - testEngine.setReadTransformers(readTransformers); - - Assert.assertFalse(shouldThrowException); - Assert.assertEquals(testEngine.getReadTransformers().size(), 3); - - Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_FIRST); - Assert.assertTrue(testEngine.getReadTransformers().get(2).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_FIRST); - Assert.assertTrue(testEngine.getReadTransformers().get(0).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); - Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); - } catch (UserException.IncompatibleReadFiltersException e) { - Assert.assertTrue(shouldThrowException); - } - } - - private int numWithConstraint(final ReadTransformer.OrderingConstraint target, final ReadTransformer.OrderingConstraint... 
constraints ) { - int count = 0; - for ( final ReadTransformer.OrderingConstraint constraint : constraints ) { - if ( constraint == target ) - count++; - } - return count; - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java deleted file mode 100644 index 52285fb2e..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java +++ /dev/null @@ -1,213 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.*; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.filters.ReadFilter; -import org.broadinstitute.sting.gatk.iterators.ReadTransformer; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.testng.annotations.AfterMethod; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import static org.testng.Assert.*; - -/** - *

- * Class SAMDataSourceUnitTest - *

- * The test of the SAMBAM simple data source. - */ -public class SAMDataSourceUnitTest extends BaseTest { - - // TODO: These legacy tests should really be replaced with a more comprehensive suite of tests for SAMDataSource - - private List readers; - private IndexedFastaSequenceFile seq; - private GenomeLocParser genomeLocParser; - - /** - * This function does the setup of our parser, before each method call. - *

- * Called before every test case method. - */ - @BeforeMethod - public void doForEachTest() throws FileNotFoundException { - readers = new ArrayList(); - - // sequence - seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference)); - genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); - } - - /** - * Tears down the test fixture after each call. - *

- * Called after every test case method. - */ - @AfterMethod - public void undoForEachTest() { - seq = null; - readers.clear(); - } - - - /** Test out that we can shard the file and iterate over every read */ - @Test - public void testLinearBreakIterateAll() { - logger.warn("Executing testLinearBreakIterateAll"); - - // setup the data - readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - - // the sharding strat. - SAMDataSource data = new SAMDataSource(readers, - new ThreadAllocation(), - null, - genomeLocParser, - false, - SAMFileReader.ValidationStringency.SILENT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - false); - - Iterable strat = data.createShardIteratorOverMappedReads(new LocusShardBalancer()); - int count = 0; - - try { - for (Shard sh : strat) { - int readCount = 0; - count++; - - GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1); - logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig()); - logger.debug("count = " + count); - StingSAMIterator datum = data.seek(sh); - - // for the first couple of shards make sure we can see the reads - if (count < 5) { - for (SAMRecord r : datum) { - } - readCount++; - } - datum.close(); - - // if we're over 100 shards, break out - if (count > 100) { - break; - } - } - } - catch (UserException.CouldNotReadInputFile e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. 
- fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); - } - } - - /** Test that we clear program records when requested */ - @Test - public void testRemoveProgramRecords() { - logger.warn("Executing testRemoveProgramRecords"); - - // setup the data - readers.add(new SAMReaderID(new File(b37GoodBAM),new Tags())); - - // use defaults - SAMDataSource data = new SAMDataSource(readers, - new ThreadAllocation(), - null, - genomeLocParser, - false, - SAMFileReader.ValidationStringency.SILENT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - false); - - List defaultProgramRecords = data.getHeader().getProgramRecords(); - assertTrue(defaultProgramRecords.size() != 0, "testRemoveProgramRecords: No program records found when using default constructor"); - - boolean removeProgramRecords = false; - data = new SAMDataSource(readers, - new ThreadAllocation(), - null, - genomeLocParser, - false, - SAMFileReader.ValidationStringency.SILENT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - Collections.emptyList(), - false, - (byte) -1, - removeProgramRecords, - false, - null); - - List dontRemoveProgramRecords = data.getHeader().getProgramRecords(); - assertEquals(dontRemoveProgramRecords, defaultProgramRecords, "testRemoveProgramRecords: default program records differ from removeProgramRecords = false"); - - removeProgramRecords = true; - data = new SAMDataSource(readers, - new ThreadAllocation(), - null, - genomeLocParser, - false, - SAMFileReader.ValidationStringency.SILENT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - Collections.emptyList(), - false, - (byte) -1, - removeProgramRecords, - false, - null); - - List doRemoveProgramRecords = data.getHeader().getProgramRecords(); - assertTrue(doRemoveProgramRecords.isEmpty(), "testRemoveProgramRecords: program records not cleared when removeProgramRecords = true"); - } -} diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java deleted file mode 100644 index 6e908a3bf..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java +++ /dev/null @@ -1,235 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMFileHeader; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.*; - - -/** - * Basic unit test for AlleleBiasedDownsamplingUtils - */ -public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest { - - - @Test - public void testSmartDownsampling() { - - final int[] idealHetAlleleCounts = new int[]{0, 50, 0, 50}; - final int[] idealHomAlleleCounts = new int[]{0, 100, 0, 0}; - - // no contamination, no removal - testOneCase(0, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - - // hom sample, het contaminant, different alleles - testOneCase(5, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - testOneCase(0, 0, 5, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - testOneCase(0, 0, 0, 5, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - - // hom sample, hom contaminant, different alleles - testOneCase(10, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - testOneCase(0, 0, 10, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - testOneCase(0, 0, 0, 10, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - - // het sample, het contaminant, different alleles - testOneCase(5, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 0, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - - // het 
sample, hom contaminant, different alleles - testOneCase(10, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 0, 10, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - - // hom sample, het contaminant, overlapping alleles - final int[] enhancedHomAlleleCounts = new int[]{0, 105, 0, 0}; - testOneCase(5, 5, 0, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); - testOneCase(0, 5, 5, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); - testOneCase(0, 5, 0, 5, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); - - // hom sample, hom contaminant, overlapping alleles - testOneCase(0, 10, 0, 0, 0.1, 100, idealHomAlleleCounts, new int[]{0, 110, 0, 0}); - - // het sample, het contaminant, overlapping alleles - testOneCase(5, 5, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 5, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 5, 0, 5, 0.1, 100, idealHetAlleleCounts, new int[]{0, 55, 0, 55}); - testOneCase(5, 0, 0, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 0, 5, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - - // het sample, hom contaminant, overlapping alleles - testOneCase(0, 10, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 0, 0, 10, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - } - - private static void testOneCase(final int addA, final int addC, final int addG, final int addT, final double contaminationFraction, - final int pileupSize, final int[] initialCounts, final int[] targetCounts) { - - final int[] actualCounts = initialCounts.clone(); - actualCounts[0] += addA; - actualCounts[1] += addC; - actualCounts[2] += addG; - actualCounts[3] += addT; - - final int[] results = AlleleBiasedDownsamplingUtils.runSmartDownsampling(actualCounts, (int)(pileupSize * contaminationFraction)); - Assert.assertTrue(countsAreEqual(results, targetCounts)); - } - - private 
static boolean countsAreEqual(final int[] counts1, final int[] counts2) { - for ( int i = 0; i < 4; i++ ) { - if ( counts1[i] != counts2[i] ) - return false; - } - return true; - } - - @DataProvider(name = "BiasedDownsamplingTest") - public Object[][] makeBiasedDownsamplingTest() { - final List tests = new LinkedList(); - - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - - for ( final int originalNormalCount : Arrays.asList(0, 1, 2, 10, 1000) ) { - for ( final int originalReducedCount : Arrays.asList(0, 1, 2, 10, 100) ) { - for ( final int indexToPutReducedRead : Arrays.asList(0, 2, originalNormalCount) ) { - if ( originalReducedCount == 0 || indexToPutReducedRead > originalNormalCount ) - continue; - for ( final int toRemove : Arrays.asList(0, 1, 2, 10, 1000) ) { - if ( toRemove <= originalNormalCount + originalReducedCount ) - tests.add(new Object[]{header, originalNormalCount, originalReducedCount, indexToPutReducedRead, toRemove}); - } - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "BiasedDownsamplingTest") - public void testBiasedDownsampling(final SAMFileHeader header, final int originalNormalCount, final int originalReducedCount, final int indexToPutReducedRead, final int toRemove) { - - final LinkedList elements = new LinkedList(); - for ( int i = 0; i < originalNormalCount; i++ ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1); - elements.add(new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0)); - } - if ( originalReducedCount > 0 ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1); - read.setReducedReadCountsTag(new int[]{originalReducedCount}); - elements.add(indexToPutReducedRead, new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0)); - } - - final List result = AlleleBiasedDownsamplingUtils.downsampleElements(elements, originalNormalCount + 
originalReducedCount, toRemove); - int pileupCount = 0; - for ( final PileupElement pe : elements ) // reduced reads may have gotten modified - pileupCount += pe.getRepresentativeCount(); - for ( final PileupElement pe : result ) - pileupCount -= pe.getRepresentativeCount(); - - Assert.assertEquals(pileupCount, originalNormalCount + originalReducedCount - toRemove); - } - - @Test - public void testLoadContaminationFileDetails(){ - Logger logger=org.apache.log4j.Logger.getRootLogger(); - - final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; - final File ContamFile1=new File(ArtificalBAMLocation+"contamination.case.1.txt"); - - Map Contam1=new HashMap(); - Set Samples1=new HashSet(); - - Contam1.put("NA11918",0.15); - Samples1.addAll(Contam1.keySet()); - testLoadFile(ContamFile1,Samples1,Contam1,logger); - - Contam1.put("NA12842",0.13); - Samples1.addAll(Contam1.keySet()); - testLoadFile(ContamFile1,Samples1,Contam1,logger); - - Samples1.add("DUMMY"); - testLoadFile(ContamFile1,Samples1,Contam1,logger); - } - - private static void testLoadFile(final File file, final Set Samples, final Map map, Logger logger){ - Map loadedMap = AlleleBiasedDownsamplingUtils.loadContaminationFile(file,0.0,Samples,logger); - Assert.assertTrue(loadedMap.equals(map)); - } - - @DataProvider(name = "goodContaminationFiles") - public Integer[][] goodContaminationFiles() { - return new Integer[][]{ - {1, 2}, - {2, 3}, - {3, 2}, - {4, 2}, - {5, 3}, - {6, 2}, - {7, 2}, - {8, 2} - }; - } - - @Test(dataProvider = "goodContaminationFiles") - public void testLoadContaminationFile(final Integer ArtificalBAMnumber, final Integer numberOfSamples) { - final String ArtificialBAM = String.format("ArtificallyContaminatedBams/contamination.case.%d.txt", ArtificalBAMnumber); - Logger logger = org.apache.log4j.Logger.getRootLogger(); - - File ContamFile = new File(privateTestDir, ArtificialBAM); - 
Assert.assertTrue(AlleleBiasedDownsamplingUtils.loadContaminationFile(ContamFile, 0.0, null, logger).size() == numberOfSamples); - - } - - - @DataProvider(name = "badContaminationFiles") - public Integer[][] badContaminationFiles() { - return new Integer[][]{{1}, {2}, {3}, {4}, {5}}; - } - - @Test(dataProvider = "badContaminationFiles", expectedExceptions = UserException.MalformedFile.class) - public void testLoadBrokenContaminationFile(final int i) { - Logger logger = org.apache.log4j.Logger.getRootLogger(); - final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; - - File ContaminationFile = new File(ArtificalBAMLocation + String.format("contamination.case.broken.%d.txt", i)); - AlleleBiasedDownsamplingUtils.loadContaminationFile(ContaminationFile, 0.0, null, logger); - - } - - -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java deleted file mode 100644 index 8f0eee069..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java +++ /dev/null @@ -1,191 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.List; - -public class FractionalDownsamplerUnitTest extends BaseTest { - - private static class FractionalDownsamplerTest extends TestDataProvider { - double fraction; - int totalReads; - int expectedMinNumReadsAfterDownsampling; - int expectedMaxNumReadsAfterDownsampling; - int expectedMinDiscardedItems; - int expectedMaxDiscardedItems; - - private static final double EXPECTED_ACCURACY = 0.05; // should be accurate to within +/- this percent - - public FractionalDownsamplerTest( double fraction, int totalReads ) { - super(FractionalDownsamplerTest.class); - - this.fraction = fraction; - this.totalReads = totalReads; - - calculateExpectations(); - - setName(String.format("%s: fraction=%.2f totalReads=%d expectedMinNumReadsAfterDownsampling=%d expectedMaxNumReadsAfterDownsampling=%d", - getClass().getSimpleName(), fraction, totalReads, expectedMinNumReadsAfterDownsampling, expectedMaxNumReadsAfterDownsampling)); - } - - 
private void calculateExpectations() { - // Require an exact match in the 0% and 100% cases - if ( fraction == 0.0 ) { - expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = 0; - expectedMinDiscardedItems = expectedMaxDiscardedItems = totalReads; - } - else if ( fraction == 1.0 ) { - expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = totalReads; - expectedMinDiscardedItems = expectedMaxDiscardedItems = 0; - } - else { - expectedMinNumReadsAfterDownsampling = Math.max((int)((fraction - EXPECTED_ACCURACY) * totalReads), 0); - expectedMaxNumReadsAfterDownsampling = Math.min((int) ((fraction + EXPECTED_ACCURACY) * totalReads), totalReads); - expectedMinDiscardedItems = totalReads - expectedMaxNumReadsAfterDownsampling; - expectedMaxDiscardedItems = totalReads - expectedMinNumReadsAfterDownsampling; - } - } - - public Collection createReads() { - Collection reads = new ArrayList(totalReads); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); - - return reads; - } - } - - @DataProvider(name = "FractionalDownsamplerTestDataProvider") - public Object[][] createFractionalDownsamplerTestData() { - for ( double fraction : Arrays.asList(0.0, 0.25, 0.5, 0.75, 1.0) ) { - for ( int totalReads : Arrays.asList(0, 1000, 10000) ) { - new FractionalDownsamplerTest(fraction, totalReads); - } - } - - return FractionalDownsamplerTest.getTests(FractionalDownsamplerTest.class); - } - - @Test(dataProvider = "FractionalDownsamplerTestDataProvider") - public void runFractionalDownsamplerTest( FractionalDownsamplerTest test ) { - logger.warn("Running test: " + test); - - GenomeAnalysisEngine.resetRandomGenerator(); - - ReadsDownsampler downsampler = new FractionalDownsampler(test.fraction); - - downsampler.submit(test.createReads()); - - if ( test.totalReads > 0 ) { - if ( test.fraction > 
FractionalDownsamplerTest.EXPECTED_ACCURACY ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - } - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - downsampler.signalEndOfInput(); - - if ( test.totalReads > 0 ) { - if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - } - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - List downsampledReads = downsampler.consumeFinalizedItems(); - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - - Assert.assertTrue(downsampledReads.size() >= test.expectedMinNumReadsAfterDownsampling && - downsampledReads.size() <= test.expectedMaxNumReadsAfterDownsampling); - - Assert.assertTrue(downsampler.getNumberOfDiscardedItems() >= test.expectedMinDiscardedItems && - downsampler.getNumberOfDiscardedItems() <= test.expectedMaxDiscardedItems); - - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.totalReads - downsampledReads.size()); - - downsampler.resetStats(); - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); - } - - @Test - public void testDoNotDiscardReducedReads() { - GenomeAnalysisEngine.resetRandomGenerator(); - final ReadsDownsampler downsampler = new 
FractionalDownsampler(0.0); - - final Collection reads = new ArrayList(); - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - final int[] baseCounts = { 10, 10, 10, 10, 10 }; - - for ( int i = 1; i <= 10; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, 1, 5, baseCounts)); - } - for ( int i = 1; i <= 5; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5)); - } - - downsampler.submit(reads); - downsampler.signalEndOfInput(); - - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 5, "wrong number of items discarded by the downsampler"); - Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't"); - Assert.assertEquals(downsampler.size(), 10, "downsampler size() reports wrong number of items"); - - final Collection readsReturned = downsampler.consumeFinalizedItems(); - - Assert.assertEquals(readsReturned.size(), 10, "wrong number of items returned by the downsampler"); - - for ( GATKSAMRecord readReturned : readsReturned ) { - Assert.assertTrue(readReturned.isReducedRead(), "non-reduced read survived the downsampling process, but shouldn't have"); - } - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java deleted file mode 100644 index 8cf0fd2a1..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java +++ /dev/null @@ -1,204 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* 
copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.locusiterator.AlignmentStateMachine; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.annotations.Test; -import org.testng.annotations.DataProvider; -import org.testng.Assert; - -import java.util.*; - -public class LevelingDownsamplerUnitTest extends BaseTest { - - private static class LevelingDownsamplerUniformStacksTest extends TestDataProvider { - public enum DataStructure { LINKED_LIST, ARRAY_LIST } - - int targetSize; - int numStacks; - int stackSize; - DataStructure dataStructure; - int expectedSize; - - public LevelingDownsamplerUniformStacksTest( int targetSize, int numStacks, int stackSize, DataStructure dataStructure ) { - super(LevelingDownsamplerUniformStacksTest.class); - - this.targetSize = targetSize; - this.numStacks = numStacks; - this.stackSize = stackSize; - this.dataStructure = dataStructure; - expectedSize = calculateExpectedDownsampledStackSize(); - - 
setName(String.format("%s: targetSize=%d numStacks=%d stackSize=%d dataStructure=%s expectedSize=%d", - getClass().getSimpleName(), targetSize, numStacks, stackSize, dataStructure, expectedSize)); - } - - public Collection> createStacks() { - Collection> stacks = new ArrayList>(); - - for ( int i = 1; i <= numStacks; i++ ) { - List stack = dataStructure == DataStructure.LINKED_LIST ? new LinkedList() : new ArrayList(); - - for ( int j = 1; j <= stackSize; j++ ) { - stack.add(new Object()); - } - - stacks.add(stack); - } - - return stacks; - } - - private int calculateExpectedDownsampledStackSize() { - int numItemsToRemove = numStacks * stackSize - targetSize; - - if ( numStacks == 0 ) { - return 0; - } - else if ( numItemsToRemove <= 0 ) { - return stackSize; - } - - return Math.max(1, stackSize - (numItemsToRemove / numStacks)); - } - } - - @DataProvider(name = "UniformStacksDataProvider") - public Object[][] createUniformStacksTestData() { - for ( int targetSize = 1; targetSize <= 10000; targetSize *= 10 ) { - for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { - for ( int stackSize = 1; stackSize <= 1000; stackSize *= 10 ) { - for ( LevelingDownsamplerUniformStacksTest.DataStructure dataStructure : LevelingDownsamplerUniformStacksTest.DataStructure.values() ) { - new LevelingDownsamplerUniformStacksTest(targetSize, numStacks, stackSize, dataStructure); - } - } - } - } - - return LevelingDownsamplerUniformStacksTest.getTests(LevelingDownsamplerUniformStacksTest.class); - } - - @Test( dataProvider = "UniformStacksDataProvider" ) - public void testLevelingDownsamplerWithUniformStacks( LevelingDownsamplerUniformStacksTest test ) { - logger.warn("Running test: " + test); - - GenomeAnalysisEngine.resetRandomGenerator(); - - Downsampler> downsampler = new LevelingDownsampler, Object>(test.targetSize); - - downsampler.submit(test.createStacks()); - - if ( test.numStacks > 0 ) { - Assert.assertFalse(downsampler.hasFinalizedItems()); - 
Assert.assertTrue(downsampler.peekFinalized() == null); - Assert.assertTrue(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() != null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - downsampler.signalEndOfInput(); - - if ( test.numStacks > 0 ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - final int sizeFromDownsampler = downsampler.size(); - List> downsampledStacks = downsampler.consumeFinalizedItems(); - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - - Assert.assertEquals(downsampledStacks.size(), test.numStacks); - - int totalRemainingItems = 0; - for ( List stack : downsampledStacks ) { - Assert.assertTrue(Math.abs(stack.size() - test.expectedSize) <= 1); - totalRemainingItems += stack.size(); - } - - Assert.assertEquals(sizeFromDownsampler, totalRemainingItems); - int numItemsReportedDiscarded = downsampler.getNumberOfDiscardedItems(); - int numItemsActuallyDiscarded = test.numStacks * test.stackSize - totalRemainingItems; - - Assert.assertEquals(numItemsReportedDiscarded, numItemsActuallyDiscarded); - - downsampler.resetStats(); - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); - - Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks)); - } - - @Test - public void testDoNotDiscardReducedReads() { - 
GenomeAnalysisEngine.resetRandomGenerator(); - final Downsampler> downsampler = new LevelingDownsampler, AlignmentStateMachine>(1); - - final Collection> groups = new LinkedList>(); - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - final int[] baseCounts = { 10, 10, 10, 10, 10 }; - - for ( int alignmentStart : Arrays.asList(1, 2, 3) ) { - final LinkedList group = new LinkedList(); - for ( int i = 1; i <= 10; i++ ) { - group.add(new AlignmentStateMachine(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, alignmentStart, 5, baseCounts))); - } - groups.add(group); - } - - downsampler.submit(groups); - downsampler.signalEndOfInput(); - - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0, "wrong number of items discarded by the downsampler"); - Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't"); - Assert.assertEquals(downsampler.size(), 30, "downsampler size() reports wrong number of items"); - - final Collection> groupsReturned = downsampler.consumeFinalizedItems(); - - Assert.assertEquals(groupsReturned.size(), 3, "wrong number of groups returned by the downsampler"); - - for ( LinkedList group : groupsReturned ) { - Assert.assertEquals(group.size(), 10, "group has wrong size after downsampling"); - - for ( AlignmentStateMachine state : group ) { - Assert.assertTrue(state.isReducedRead()); - } - } - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java deleted file mode 100644 index a50201efd..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java +++ /dev/null @@ -1,174 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated 
documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -public class ReservoirDownsamplerUnitTest extends BaseTest { - - private static class ReservoirDownsamplerTest extends TestDataProvider { - int reservoirSize; - int totalReads; - int expectedNumReadsAfterDownsampling; - int expectedNumDiscardedItems; - - public ReservoirDownsamplerTest( int reservoirSize, int totalReads ) { - super(ReservoirDownsamplerTest.class); - - this.reservoirSize = reservoirSize; - this.totalReads = totalReads; - - expectedNumReadsAfterDownsampling 
= Math.min(reservoirSize, totalReads); - expectedNumDiscardedItems = totalReads <= reservoirSize ? 0 : totalReads - reservoirSize; - - setName(String.format("%s: reservoirSize=%d totalReads=%d expectedNumReadsAfterDownsampling=%d expectedNumDiscardedItems=%d", - getClass().getSimpleName(), reservoirSize, totalReads, expectedNumReadsAfterDownsampling, expectedNumDiscardedItems)); - } - - public Collection createReads() { - Collection reads = new ArrayList(totalReads); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); - - return reads; - } - } - - @DataProvider(name = "ReservoirDownsamplerTestDataProvider") - public Object[][] createReservoirDownsamplerTestData() { - for ( int reservoirSize = 1; reservoirSize <= 10000; reservoirSize *= 10 ) { - new ReservoirDownsamplerTest(reservoirSize, 0); - for ( int totalReads = 1; totalReads <= 10000; totalReads *= 10 ) { - new ReservoirDownsamplerTest(reservoirSize, totalReads); - } - } - - return ReservoirDownsamplerTest.getTests(ReservoirDownsamplerTest.class); - } - - @Test(dataProvider = "ReservoirDownsamplerTestDataProvider") - public void testReservoirDownsampler( ReservoirDownsamplerTest test ) { - logger.warn("Running test: " + test); - - GenomeAnalysisEngine.resetRandomGenerator(); - - ReadsDownsampler downsampler = new ReservoirDownsampler(test.reservoirSize); - - downsampler.submit(test.createReads()); - - if ( test.totalReads > 0 ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - 
downsampler.signalEndOfInput(); - - if ( test.totalReads > 0 ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - Assert.assertEquals(downsampler.size(), test.expectedNumReadsAfterDownsampling); - List downsampledReads = downsampler.consumeFinalizedItems(); - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - - Assert.assertEquals(downsampledReads.size(), test.expectedNumReadsAfterDownsampling); - - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.expectedNumDiscardedItems); - Assert.assertEquals(test.totalReads - downsampledReads.size(), test.expectedNumDiscardedItems); - - downsampler.resetStats(); - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); - } - - @Test - public void testDoNotDiscardReducedReads() { - GenomeAnalysisEngine.resetRandomGenerator(); - final ReadsDownsampler downsampler = new ReservoirDownsampler(1); - - final Collection reads = new ArrayList(); - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - final int[] baseCounts = { 10, 10, 10, 10, 10 }; - - for ( int i = 1; i <= 10; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, 1, 5, baseCounts)); - } - for ( int i = 1; i <= 5; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5)); - } - - downsampler.submit(reads); - downsampler.signalEndOfInput(); - - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 4, "wrong number of items discarded by the 
downsampler"); - Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't"); - Assert.assertEquals(downsampler.size(), 11, "downsampler size() reports wrong number of items"); - - final Collection readsReturned = downsampler.consumeFinalizedItems(); - - Assert.assertEquals(readsReturned.size(), 11, "wrong number of items returned by the downsampler"); - - int numReducedReadsReturned = 0; - int numNormalReadsReturned = 0; - for ( GATKSAMRecord readReturned : readsReturned ) { - if ( readReturned.isReducedRead() ) { - numReducedReadsReturned++; - } - else { - numNormalReadsReturned++; - } - } - - Assert.assertEquals(numReducedReadsReturned, 10, "wrong number of reduced reads returned by the downsampler"); - Assert.assertEquals(numNormalReadsReturned, 1, "wrong number of non-reduced reads returned by the downsampler"); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java deleted file mode 100644 index bec0030d0..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java +++ /dev/null @@ -1,375 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.util.*; - -public class SimplePositionalDownsamplerUnitTest extends BaseTest { - - private static class SimplePositionalDownsamplerTest extends TestDataProvider { - int targetCoverage; - int numStacks; - List stackSizes; - List expectedStackSizes; - boolean multipleContigs; - int totalInitialReads; - - public SimplePositionalDownsamplerTest( int targetCoverage, List stackSizes, boolean multipleContigs ) { - super(SimplePositionalDownsamplerTest.class); - - this.targetCoverage = targetCoverage; - this.numStacks = stackSizes.size(); - this.stackSizes = stackSizes; - this.multipleContigs = multipleContigs; - - calculateExpectedDownsampledStackSizes(); - - totalInitialReads = 0; - for ( Integer stackSize : stackSizes ) { - totalInitialReads += stackSize; - } - - setName(String.format("%s: targetCoverage=%d numStacks=%d stackSizes=%s expectedSizes=%s multipleContigs=%b", - getClass().getSimpleName(), targetCoverage, numStacks, stackSizes, expectedStackSizes, multipleContigs)); - } - - public Collection createReads() { 
- Collection reads = new ArrayList(); - SAMFileHeader header = multipleContigs ? - ArtificialSAMUtils.createArtificialSamHeader(2, 1, 1000000) : - ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - int refIndex = 0; - int alignmentStart = 1; - int readLength = 100; - - for ( int i = 0; i < numStacks; i++ ) { - if ( multipleContigs && refIndex == 0 && i >= numStacks / 2 ) { - refIndex++; - } - - reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(stackSizes.get(i), header, "foo", - refIndex, alignmentStart, readLength)); - - alignmentStart += 10; - } - - return reads; - } - - private void calculateExpectedDownsampledStackSizes() { - expectedStackSizes = new ArrayList(numStacks); - - for ( Integer stackSize : stackSizes ) { - int expectedSize = targetCoverage >= stackSize ? stackSize : targetCoverage; - expectedStackSizes.add(expectedSize); - } - } - } - - @DataProvider(name = "SimplePositionalDownsamplerTestDataProvider") - public Object[][] createSimplePositionalDownsamplerTestData() { - GenomeAnalysisEngine.resetRandomGenerator(); - - for ( int targetCoverage = 1; targetCoverage <= 10000; targetCoverage *= 10 ) { - for ( int contigs = 1; contigs <= 2; contigs++ ) { - for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { - List stackSizes = new ArrayList(numStacks); - for ( int stack = 1; stack <= numStacks; stack++ ) { - stackSizes.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(targetCoverage * 2) + 1); - } - new SimplePositionalDownsamplerTest(targetCoverage, stackSizes, contigs > 1); - } - } - } - - return SimplePositionalDownsamplerTest.getTests(SimplePositionalDownsamplerTest.class); - } - - @Test( dataProvider = "SimplePositionalDownsamplerTestDataProvider" ) - public void testSimplePostionalDownsampler( SimplePositionalDownsamplerTest test ) { - logger.warn("Running test: " + test); - - GenomeAnalysisEngine.resetRandomGenerator(); - - ReadsDownsampler downsampler = new 
SimplePositionalDownsampler(test.targetCoverage); - - downsampler.submit(test.createReads()); - - if ( test.numStacks > 1 ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertTrue(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() != null); - } - else if ( test.numStacks == 1 ) { - Assert.assertFalse(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() == null); - Assert.assertTrue(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() != null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - downsampler.signalEndOfInput(); - - if ( test.numStacks > 0 ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - List downsampledReads = downsampler.consumeFinalizedItems(); - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - - if ( test.numStacks == 0 ) { - Assert.assertTrue(downsampledReads.isEmpty()); - } - else { - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampledReads); - - Assert.assertEquals(downsampledStackSizes.size(), test.numStacks); - Assert.assertEquals(downsampledStackSizes, test.expectedStackSizes); - - int numReadsActuallyEliminated = test.totalInitialReads - downsampledReads.size(); - int numReadsReportedEliminated = 
downsampler.getNumberOfDiscardedItems(); - Assert.assertEquals(numReadsActuallyEliminated, numReadsReportedEliminated); - } - - downsampler.resetStats(); - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); - } - - private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { - List stackSizes = new ArrayList(); - - if ( downsampledReads.isEmpty() ) { - return stackSizes; - } - - Iterator iter = downsampledReads.iterator(); - Assert.assertTrue(iter.hasNext()); - - SAMRecord previousRead = iter.next(); - int currentStackSize = 1; - - while ( iter.hasNext() ) { - SAMRecord currentRead = iter.next(); - - if ( currentRead.getReferenceIndex() > previousRead.getReferenceIndex() || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { - stackSizes.add(currentStackSize); - currentStackSize = 1; - } - else if ( currentRead.getReferenceIndex() < previousRead.getReferenceIndex() || currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { - Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); - } - else { - currentStackSize++; - } - - previousRead = currentRead; - } - - stackSizes.add(currentStackSize); - return stackSizes; - } - - @Test - public void testSimplePositionalDownsamplerSignalNoMoreReadsBefore() { - ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - Collection readStack = new ArrayList(); - readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(50, header, "foo", 0, 1, 100)); - downsampler.submit(readStack); - - Assert.assertFalse(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() == null); - Assert.assertTrue(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() != null); - - SAMRecord laterRead = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 2, 100); - 
downsampler.signalNoMoreReadsBefore(laterRead); - - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - - List downsampledReads = downsampler.consumeFinalizedItems(); - - Assert.assertEquals(downsampledReads.size(), readStack.size()); - } - - @Test - public void testBasicUnmappedReadsSupport() { - ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - Collection readStack = new ArrayList(); - readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, - SAMRecord.NO_ALIGNMENT_START, 100)); - for ( SAMRecord read : readStack ) { - Assert.assertTrue(read.getReadUnmappedFlag()); - } - - downsampler.submit(readStack); - downsampler.signalEndOfInput(); - - List downsampledReads = downsampler.consumeFinalizedItems(); - - // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler - Assert.assertEquals(downsampledReads.size(), readStack.size()); - - for ( SAMRecord read: downsampledReads ) { - Assert.assertTrue(read.getReadUnmappedFlag()); - } - } - - @Test - public void testMixedMappedAndUnmappedReadsSupport() { - ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - Collection mappedReadStack = new ArrayList(); - mappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", 0, 1, 100)); - for ( SAMRecord read : mappedReadStack ) { - Assert.assertFalse(read.getReadUnmappedFlag()); - } - - Collection unmappedReadStack = new ArrayList(); - unmappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", 
SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, - SAMRecord.NO_ALIGNMENT_START, 100)); - for ( SAMRecord read : unmappedReadStack ) { - Assert.assertTrue(read.getReadUnmappedFlag()); - } - - downsampler.submit(mappedReadStack); - downsampler.submit(unmappedReadStack); - downsampler.signalEndOfInput(); - - List downsampledReads = downsampler.consumeFinalizedItems(); - - // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler - Assert.assertEquals(downsampledReads.size(), 300); - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 100); - - int count = 1; - for ( SAMRecord read: downsampledReads ) { - if ( count <= 100 ) { - Assert.assertFalse(read.getReadUnmappedFlag()); - } - else { - Assert.assertTrue(read.getReadUnmappedFlag()); - } - - count++; - } - } - - @Test - public void testGATKSAMRecordSupport() { - ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - List reads = new ArrayList(); - for ( int i = 0; i < 10; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); - } - - downsampler.submit(reads); - downsampler.signalEndOfInput(); - List downsampledReads = downsampler.consumeFinalizedItems(); - - Assert.assertEquals(downsampledReads.size(), 10); - } - - @Test - public void testDoNotDiscardReducedReads() { - GenomeAnalysisEngine.resetRandomGenerator(); - final ReadsDownsampler downsampler = new SimplePositionalDownsampler(1); - - final Collection reads = new ArrayList(); - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - final int[] baseCounts = { 10, 10, 10, 10, 10 }; - - for ( int alignmentStart : Arrays.asList(1, 2, 3) ) { - for ( int i = 1; i <= 10; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, alignmentStart, 5, baseCounts)); - } - for ( int i = 1; i <= 5; i++ ) { - 
reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, alignmentStart, 5)); - } - } - - downsampler.submit(reads); - downsampler.signalEndOfInput(); - - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 12, "wrong number of items discarded by the downsampler"); - Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't"); - Assert.assertEquals(downsampler.size(), 33, "downsampler size() reports wrong number of items"); - - final Collection readsReturned = downsampler.consumeFinalizedItems(); - - Assert.assertEquals(readsReturned.size(), 33, "wrong number of items returned by the downsampler"); - - int numReducedReadsReturned = 0; - int numNormalReadsReturned = 0; - for ( GATKSAMRecord readReturned : readsReturned ) { - if ( readReturned.isReducedRead() ) { - numReducedReadsReturned++; - } - else { - numNormalReadsReturned++; - } - } - - Assert.assertEquals(numReducedReadsReturned, 30, "wrong number of reduced reads returned by the downsampler"); - Assert.assertEquals(numNormalReadsReturned, 3, "wrong number of non-reduced reads returned by the downsampler"); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java deleted file mode 100644 index 4d85997b3..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to 
do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.traversals; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.List; - -public class TAROrderedReadCacheUnitTest extends BaseTest { - // example fasta index file, can be deleted if you don't use the reference - private IndexedFastaSequenceFile seq; - - @BeforeClass - public void setup() throws FileNotFoundException { - // sequence - seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); - } - - @DataProvider(name = "ReadCacheTestData") - public Object[][] makeReadCacheTestData() { - List tests = new ArrayList(); - - for ( final int nReadsPerLocus : Arrays.asList(0, 1, 10, 
100) ) { - for ( final int nLoci : Arrays.asList(1, 10, 100) ) { - for ( final int max : Arrays.asList(10, 50, 1000) ) { - for ( final boolean addAllAtOnce : Arrays.asList(true, false) ) { - tests.add(new Object[]{nReadsPerLocus, nLoci, max, addAllAtOnce}); - } - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ReadCacheTestData") - public void testReadCache(final int nReadsPerLocus, final int nLoci, final int max, final boolean addAllAtOnce) { - final TAROrderedReadCache cache = new TAROrderedReadCache(max); - - Assert.assertEquals(cache.getMaxCapacity(), max); - Assert.assertEquals(cache.getNumDiscarded(), 0); - Assert.assertEquals(cache.size(), 0); - - final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(seq, nReadsPerLocus, nLoci); - final List reads = bamBuilder.makeReads(); - - if ( addAllAtOnce ) { - cache.addAll(reads); - } else { - for ( final GATKSAMRecord read : reads ) { - cache.add(read); - } - } - - final int nTotalReads = reads.size(); - final int nExpectedToKeep = Math.min(nTotalReads, max); - final int nExpectedToDiscard = nTotalReads - nExpectedToKeep; - Assert.assertEquals(cache.getNumDiscarded(), nExpectedToDiscard, "wrong number of reads discarded"); - Assert.assertEquals(cache.size(), nExpectedToKeep, "wrong number of reads kept"); - - final List cacheReads = cache.popCurrentReads(); - Assert.assertEquals(cache.size(), 0, "Should be no reads left"); - Assert.assertEquals(cache.getNumDiscarded(), 0, "should have reset stats"); - Assert.assertEquals(cacheReads.size(), nExpectedToKeep, "should have 1 read for every read we expected to keep"); - - verifySortednessOfReads(cacheReads); - } - - @Test - public void testReadCacheWithReducedReads() { - final List reads = new ArrayList(); - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - final int[] baseCounts = { 10, 10, 10, 10, 10 }; - - for ( int i = 1; i <= 100; i++ ) { - 
reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, i, 5, baseCounts)); - reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, i, 5)); - } - - final TAROrderedReadCache cache = new TAROrderedReadCache(50); - - cache.addAll(reads); - - // Our cache should have kept all of the reduced reads (which are retained unconditionally and do not count - // towards the capacity limit), and discarded half of the 100 non-reduced reads due to the cache capacity - // limit of 50. - Assert.assertEquals(cache.size(), 150, "wrong number of reads in the cache at the end"); - Assert.assertEquals(cache.getNumDiscarded(), 50, "wrong number of reads discarded from the cache"); - - final List cacheReads = cache.popCurrentReads(); - - int numReducedReadsRetained = 0; - int numNormalReadsRetained = 0; - - for ( GATKSAMRecord read : cacheReads ) { - if ( read.isReducedRead() ) { - numReducedReadsRetained++; - } - else { - numNormalReadsRetained++; - } - } - - Assert.assertEquals(numReducedReadsRetained, 100, "wrong number of reduced reads retained in the cache"); - Assert.assertEquals(numNormalReadsRetained, 50, "wrong number of non-reduced reads retained in the cache"); - - verifySortednessOfReads(cacheReads); - } - - private void verifySortednessOfReads( final List reads) { - int lastStart = -1; - for ( GATKSAMRecord read : reads ) { - Assert.assertTrue(lastStart <= read.getAlignmentStart(), "Reads should be sorted but weren't. 
Found read with start " + read.getAlignmentStart() + " while last was " + lastStart); - lastStart = read.getAlignmentStart(); - } - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java deleted file mode 100644 index 336c15ccc..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java +++ /dev/null @@ -1,79 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.coverage; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class CallableLociIntegrationTest extends WalkerTest { - final static String commonArgs = "-R " + b36KGReference + " -T CallableLoci -I " + validationDataLocation + "/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s"; - final static String reduceReadArgs = "-R " + b37KGReference + " -T CallableLoci -I " + " private/testdata/NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s"; - - final static String SUMMARY_MD5 = "a6f5963669f19d9d137ced87d65834b0"; - - @Test - public void testCallableLociWalkerBed() { - String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -summary %s"; - WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("9b4ffea1dbcfefadeb1c9fa74b0e0e59", SUMMARY_MD5)); - executeTest("formatBed", spec); - } - - @Test - public void testCallableLociWalkerPerBase() { - String gatk_args = commonArgs + " -format STATE_PER_BASE -L 1:10,000,000-11,000,000 -summary %s"; - WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("d6505e489899e80c08a7168777f6e07b", SUMMARY_MD5)); - executeTest("format_state_per_base", spec); - } - - @Test - public void testCallableLociWalker2() { - String gatk_args = commonArgs + " -format BED -L 1:10,000,000-10,000,100 -L 1:10,000,110-10,000,120 -summary %s"; - WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("330f476085533db92a9dbdb3a127c041", "d287510eac04acf5a56f5cde2cba0e4a")); - executeTest("formatBed by interval", spec); - } - - @Test - public void testCallableLociWalker3() { - String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 --minMappingQuality 20 -summary %s"; - WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("7f79ad8195c4161060463eeb21d2bb11", "7ee269e5f4581a924529a356cc806e55")); - 
executeTest("formatBed lots of arguments", spec); - } - - @Test(enabled=true) - public void testWithReducedRead() { - String gatk_args = reduceReadArgs + " -L 20:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 --minMappingQuality 20 -summary %s"; - WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, - Arrays.asList("69fc303c888fd1fa2937b9518dc82f9e", "f512a85c373087ce03a24ab0f98522c0")); - executeTest("CallableLoci with ReducedRead", spec); - } - -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java deleted file mode 100644 index 4d3741228..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java +++ /dev/null @@ -1,50 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.testng.annotations.Test; -import org.broadinstitute.sting.WalkerTest; - -import java.util.Collections; - -/** - * Run validating pileup across a set of core data as proof of the integrity of the GATK core. - * - * @author mhanna - * @version 0.1 - */ -public class CheckPileupIntegrationTest extends WalkerTest { - @Test(enabled = true) - public void testEcoliThreaded() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T CheckPileup" + - " -I " + validationDataLocation + "MV1994.selected.bam" + - " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + - " --pileup:SAMPileup "+ validationDataLocation + "MV1994.selected.pileup" + - " -S SILENT -nt 8",0, Collections.emptyList()); - executeTest("testEcoliThreaded",spec); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java deleted file mode 100644 index 9d4c562c7..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java +++ /dev/null @@ -1,131 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.collections.Pair; - -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; -import org.testng.Assert; - -/** - * Created by IntelliJ IDEA. - * User: Ghost - * Date: 3/5/11 - * Time: 2:06 PM - * To change this template use File | Settings | File Templates. - */ -public class MWUnitTest extends BaseTest { - @BeforeClass - public void init() { } - - @Test - private void testMWU() { - logger.warn("Testing MWU"); - MannWhitneyU mwu = new MannWhitneyU(); - mwu.add(0, MannWhitneyU.USet.SET1); - mwu.add(1,MannWhitneyU.USet.SET2); - mwu.add(2,MannWhitneyU.USet.SET2); - mwu.add(3,MannWhitneyU.USet.SET2); - mwu.add(4,MannWhitneyU.USet.SET2); - mwu.add(5,MannWhitneyU.USet.SET2); - mwu.add(6,MannWhitneyU.USet.SET1); - mwu.add(7,MannWhitneyU.USet.SET1); - mwu.add(8,MannWhitneyU.USet.SET1); - mwu.add(9,MannWhitneyU.USet.SET1); - mwu.add(10,MannWhitneyU.USet.SET1); - mwu.add(11,MannWhitneyU.USet.SET2); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),25L); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),11L); - - MannWhitneyU mwu2 = new MannWhitneyU(); - MannWhitneyU mwuNoDither = new MannWhitneyU(false); - for ( int dp : new int[]{2,4,5,6,8} ) { - mwu2.add(dp,MannWhitneyU.USet.SET1); - mwuNoDither.add(dp,MannWhitneyU.USet.SET1); - } - - for ( int dp : new int[]{1,3,7,9,10,11,12,13} ) { - mwu2.add(dp,MannWhitneyU.USet.SET2); - mwuNoDither.add(dp,MannWhitneyU.USet.SET2); - } - - MannWhitneyU.ExactMode pm = MannWhitneyU.ExactMode.POINT; - MannWhitneyU.ExactMode 
cm = MannWhitneyU.ExactMode.CUMULATIVE; - - // tests using the hypothesis that set 2 dominates set 1 (U value = 10) - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET1),10L); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET2),30L); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET1),10L); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET2),30L); - - Pair sizes = mwu2.getSetSizes(); - - Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.first,sizes.second,10L),0.4180519701814064,1e-14); - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.first,sizes.second,10L,false,pm).second,0.021756021756021756,1e-14); - Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.first,sizes.second,10L,false).second,0.06214143703127617,1e-14); - logger.warn("Testing two-sided"); - Assert.assertEquals((double)mwu2.runTwoSidedTest().second,2*0.021756021756021756,1e-8); - - // tests using the hypothesis that set 1 dominates set 2 (U value = 30) -- empirical should be identical, normall approx close, uniform way off - Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.second,sizes.first,30L,true).second,2.0*0.08216463976903321,1e-14); - Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.second,sizes.first,30L),0.0023473625009328147,1e-14); - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,30L,false,pm).second,0.021756021756021756,1e-14); // note -- exactly same value as above - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,cm).second,1.0-0.08547008547008,1e-14); // r does a correction, subtracting 1 from U - 
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,cm).second,0.08547008547008,1e-14); // r does a correction, subtracting 1 from U - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,cm).first,-1.36918910442,1e-2); // apache inversion set to be good only to 1e-2 - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,cm).first,1.36918910442,1e-2); // apache inversion set to be good only to 1e-2 - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,pm).first,1.2558754796642067,1e-8); // PDF should be similar - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,pm).first,-1.2558754796642067,1e-8); // PDF should be similar - Assert.assertEquals(MannWhitneyU.calculatePRecursively(4,5,10L,false,pm).second,0.0952381,1e-5); - Assert.assertEquals(MannWhitneyU.calculatePRecursively(4,5,10L,false,pm).first,0.0,1e-14); - - logger.warn("Set 1"); - Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET1).second,0.021756021756021756,1e-8); - logger.warn("Set 2"); - Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET2).second,0.021756021756021756,1e-8); - - MannWhitneyU mwu3 = new MannWhitneyU(); - for ( int dp : new int[]{0,2,4} ) { - mwu3.add(dp,MannWhitneyU.USet.SET1); - } - for ( int dp : new int[]{1,5,6,7,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34} ) { - mwu3.add(dp,MannWhitneyU.USet.SET2); - } - long u = MannWhitneyU.calculateOneSidedU(mwu3.getObservations(),MannWhitneyU.USet.SET1); - //logger.warn(String.format("U is: %d",u)); - Pair nums = mwu3.getSetSizes(); - //logger.warn(String.format("Corrected p is: %.4e",MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first,nums.second,u))); - //logger.warn(String.format("Counted sequences: %d",MannWhitneyU.countSequences(nums.first, nums.second, 
u))); - //logger.warn(String.format("Possible sequences: %d", (long) Arithmetic.binomial(nums.first+nums.second,nums.first))); - //logger.warn(String.format("Ratio: %.4e",MannWhitneyU.countSequences(nums.first,nums.second,u)/Arithmetic.binomial(nums.first+nums.second,nums.first))); - Assert.assertEquals(MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first, nums.second, u), 3.665689149560116E-4, 1e-14); - Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(nums.first,nums.second,u,false).second,0.0032240865760884696,1e-14); - Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(nums.first,nums.second,u),0.0026195003025784036,1e-14); - - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java deleted file mode 100644 index de049fe89..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ /dev/null @@ -1,859 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils; - -import cern.jet.random.Normal; -import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * Basic unit test for MathUtils - */ -public class MathUtilsUnitTest extends BaseTest { - @BeforeClass - public void init() { - } - - /** - * Tests that we get unqiue values for the valid (non-null-producing) input space for {@link MathUtils#fastGenerateUniqueHashFromThreeIntegers(int, int, int)}. - */ - @Test - public void testGenerateUniqueHashFromThreePositiveIntegers() { - logger.warn("Executing testGenerateUniqueHashFromThreePositiveIntegers"); - - final Set observedLongs = new HashSet(); - for (short i = 0; i < Byte.MAX_VALUE; i++) { - for (short j = 0; j < Byte.MAX_VALUE; j++) { - for (short k = 0; k < Byte.MAX_VALUE; k++) { - final Long aLong = MathUtils.fastGenerateUniqueHashFromThreeIntegers(i, j, k); - //System.out.println(String.format("%s, %s, %s: %s", i, j, k, aLong)); - Assert.assertTrue(observedLongs.add(aLong)); - } - } - } - - for (short i = Byte.MAX_VALUE; i <= Short.MAX_VALUE && i > 0; i += 128) { - for (short j = Byte.MAX_VALUE; j <= Short.MAX_VALUE && j > 0; j += 128) { - for (short k = Byte.MAX_VALUE; k <= Short.MAX_VALUE && k > 0; k += 128) { - final Long aLong = MathUtils.fastGenerateUniqueHashFromThreeIntegers(i, j, k); - // System.out.println(String.format("%s, %s, %s: %s", i, j, k, aLong)); - Assert.assertTrue(observedLongs.add(aLong)); - } - } - } - } - - /** - * Tests that we get the right values from the binomial 
distribution - */ - @Test - public void testBinomialProbability() { - logger.warn("Executing testBinomialProbability"); - - Assert.assertEquals(MathUtils.binomialProbability(3, 2, 0.5), 0.375, 0.0001); - Assert.assertEquals(MathUtils.binomialProbability(100, 10, 0.5), 1.365543e-17, 1e-18); - Assert.assertEquals(MathUtils.binomialProbability(217, 73, 0.02), 4.521904e-67, 1e-68); - Assert.assertEquals(MathUtils.binomialProbability(300, 100, 0.02), 9.27097e-91, 1e-92); - Assert.assertEquals(MathUtils.binomialProbability(300, 150, 0.98), 6.462892e-168, 1e-169); - Assert.assertEquals(MathUtils.binomialProbability(300, 120, 0.98), 3.090054e-221, 1e-222); - Assert.assertEquals(MathUtils.binomialProbability(300, 112, 0.98), 2.34763e-236, 1e-237); - } - - /** - * Tests that we get the right values from the binomial distribution - */ - @Test - public void testCumulativeBinomialProbability() { - logger.warn("Executing testCumulativeBinomialProbability"); - - for (int j = 0; j < 2; j++) { // Test memoizing functionality, as well. 
- final int numTrials = 10; - for ( int i = 0; i < numTrials; i++ ) - Assert.assertEquals(MathUtils.binomialCumulativeProbability(numTrials, i, i), MathUtils.binomialProbability(numTrials, i), 1e-10, String.format("k=%d, n=%d", i, numTrials)); - - Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 2), 0.05468750, 1e-7); - Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 5), 0.62304687, 1e-7); - Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 10), 1.0, 1e-7); - } - } - - /** - * Tests that we get the right values from the multinomial distribution - */ - @Test - public void testMultinomialProbability() { - logger.warn("Executing testMultinomialProbability"); - - int[] counts0 = {2, 0, 1}; - double[] probs0 = {0.33, 0.33, 0.34}; - Assert.assertEquals(MathUtils.multinomialProbability(counts0, probs0), 0.111078, 1e-6); - - int[] counts1 = {10, 20, 30}; - double[] probs1 = {0.25, 0.25, 0.50}; - Assert.assertEquals(MathUtils.multinomialProbability(counts1, probs1), 0.002870301, 1e-9); - - int[] counts2 = {38, 82, 50, 36}; - double[] probs2 = {0.25, 0.25, 0.25, 0.25}; - Assert.assertEquals(MathUtils.multinomialProbability(counts2, probs2), 1.88221e-09, 1e-10); - - int[] counts3 = {1, 600, 1}; - double[] probs3 = {0.33, 0.33, 0.34}; - Assert.assertEquals(MathUtils.multinomialProbability(counts3, probs3), 5.20988e-285, 1e-286); - } - - /** - * Tests that the random index selection is working correctly - */ - @Test - public void testRandomIndicesWithReplacement() { - logger.warn("Executing testRandomIndicesWithReplacement"); - - // Check that the size of the list returned is correct - Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 0).size() == 0); - Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 1).size() == 1); - Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 5).size() == 5); - Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 1000).size() == 1000); - - // Check that the 
list contains only the k element range that as asked for - no more, no less - List Five = new ArrayList(); - Collections.addAll(Five, 0, 1, 2, 3, 4); - List BigFive = MathUtils.sampleIndicesWithReplacement(5, 10000); - Assert.assertTrue(BigFive.containsAll(Five)); - Assert.assertTrue(Five.containsAll(BigFive)); - } - - /** - * Tests that we get the right values from the multinomial distribution - */ - @Test - public void testSliceListByIndices() { - logger.warn("Executing testSliceListByIndices"); - - // Check that the list contains only the k element range that as asked for - no more, no less but now - // use the index list to pull elements from another list using sliceListByIndices - List Five = new ArrayList(); - Collections.addAll(Five, 0, 1, 2, 3, 4); - List FiveAlpha = new ArrayList(); - Collections.addAll(FiveAlpha, 'a', 'b', 'c', 'd', 'e'); - List BigFive = MathUtils.sampleIndicesWithReplacement(5, 10000); - List BigFiveAlpha = MathUtils.sliceListByIndices(BigFive, FiveAlpha); - Assert.assertTrue(BigFiveAlpha.containsAll(FiveAlpha)); - Assert.assertTrue(FiveAlpha.containsAll(BigFiveAlpha)); - } - - /** - * Tests that we correctly compute mean and standard deviation from a stream of numbers - */ - @Test - public void testRunningAverage() { - logger.warn("Executing testRunningAverage"); - - int[] numbers = {1, 2, 4, 5, 3, 128, 25678, -24}; - MathUtils.RunningAverage r = new MathUtils.RunningAverage(); - - for (int i = 0; i < numbers.length; i++) - r.add((double) numbers[i]); - - Assert.assertEquals((long) numbers.length, r.observationCount()); - Assert.assertTrue(r.mean() - 3224.625 < 2e-10); - Assert.assertTrue(r.stddev() - 9072.6515881128 < 2e-10); - } - - @Test - public void testLog10Gamma() { - logger.warn("Executing testLog10Gamma"); - - Assert.assertEquals(MathUtils.log10Gamma(4.0), 0.7781513, 1e-6); - Assert.assertEquals(MathUtils.log10Gamma(10), 5.559763, 1e-6); - Assert.assertEquals(MathUtils.log10Gamma(10654), 38280.53, 1e-2); - } - - @Test - public 
void testLog10BinomialCoefficient() { - logger.warn("Executing testLog10BinomialCoefficient"); - // note that we can test the binomial coefficient calculation indirectly via Newton's identity - // (1+z)^m = sum (m choose k)z^k - double[] z_vals = new double[]{0.999,0.9,0.8,0.5,0.2,0.01,0.0001}; - int[] exponent = new int[]{5,15,25,50,100}; - for ( double z : z_vals ) { - double logz = Math.log10(z); - for ( int exp : exponent ) { - double expected_log = exp*Math.log10(1+z); - double[] newtonArray_log = new double[1+exp]; - for ( int k = 0 ; k <= exp; k++ ) { - newtonArray_log[k] = MathUtils.log10BinomialCoefficient(exp,k)+k*logz; - } - Assert.assertEquals(MathUtils.log10sumLog10(newtonArray_log),expected_log,1e-6); - } - } - - Assert.assertEquals(MathUtils.log10BinomialCoefficient(4, 2), 0.7781513, 1e-6); - Assert.assertEquals(MathUtils.log10BinomialCoefficient(10, 3), 2.079181, 1e-6); - Assert.assertEquals(MathUtils.log10BinomialCoefficient(103928, 119), 400.2156, 1e-4); - } - - @Test - public void testFactorial() { - logger.warn("Executing testFactorial"); - Assert.assertEquals((int) MathUtils.factorial(4), 24); - Assert.assertEquals((int) MathUtils.factorial(10), 3628800); - Assert.assertEquals((int) MathUtils.factorial(12), 479001600); - } - - @Test - public void testLog10Factorial() { - logger.warn("Executing testLog10Factorial"); - Assert.assertEquals(MathUtils.log10Factorial(4), 1.380211, 1e-6); - Assert.assertEquals(MathUtils.log10Factorial(10), 6.559763, 1e-6); - Assert.assertEquals(MathUtils.log10Factorial(12), 8.680337, 1e-6); - Assert.assertEquals(MathUtils.log10Factorial(200), 374.8969, 1e-3); - Assert.assertEquals(MathUtils.log10Factorial(12342), 45138.26, 1e-1); - double log10factorial_small = 0; - double log10factorial_middle = 374.8969; - double log10factorial_large = 45138.26; - int small_start = 1; - int med_start = 200; - int large_start = 12342; - for ( int i = 1; i < 1000; i++ ) { - log10factorial_small += Math.log10(i+small_start); - 
log10factorial_middle += Math.log10(i+med_start); - log10factorial_large += Math.log10(i+large_start); - Assert.assertEquals(MathUtils.log10Factorial(small_start+i),log10factorial_small,1e-6); - Assert.assertEquals(MathUtils.log10Factorial(med_start+i),log10factorial_middle,1e-3); - Assert.assertEquals(MathUtils.log10Factorial(large_start+i),log10factorial_large,1e-1); - } - } - - /** - * Private functions used by testArrayShuffle() - */ - private boolean hasUniqueElements(Object[] x) { - for (int i = 0; i < x.length; i++) - for (int j = i + 1; j < x.length; j++) - if (x[i].equals(x[j]) || x[i] == x[j]) - return false; - return true; - } - - private boolean hasAllElements(final Object[] expected, final Object[] actual) { - HashSet set = new HashSet(); - set.addAll(Arrays.asList(expected)); - set.removeAll(Arrays.asList(actual)); - return set.isEmpty(); - } - - @Test - public void testApproximateLog10SumLog10() { - - final double requiredPrecision = 1E-4; - - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0}), 0.0, requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-5.15}), -5.15, requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {130.0}), 130.0, requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.145}), -0.145, requiredPrecision); - - Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), requiredPrecision); - 
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, Double.NEGATIVE_INFINITY), -0.12345, requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, Double.NEGATIVE_INFINITY), -15.7654, requiredPrecision); - - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new 
double[] {-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), requiredPrecision); - - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0, -2.5}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), requiredPrecision); - 
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), requiredPrecision); - - Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - 
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0, -2.5), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5, -1.1), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1, 0.5), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2, 1.3), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2, 18.1), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2, 26.6), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1, -45.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6, -26.2), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456, -0.34567), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101, -17.9341), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), requiredPrecision); - - // magnitude of 
the sum doesn't matter, so we can combinatorially test this via partitions of unity - double[] mult_partitionFactor = new double[]{0.999,0.98,0.95,0.90,0.8,0.5,0.3,0.1,0.05,0.001}; - int[] n_partitions = new int[] {2,4,8,16,32,64,128,256,512,1028}; - for ( double alpha : mult_partitionFactor ) { - double log_alpha = Math.log10(alpha); - double log_oneMinusAlpha = Math.log10(1-alpha); - for ( int npart : n_partitions ) { - double[] multiplicative = new double[npart]; - double[] equal = new double[npart]; - double remaining_log = 0.0; // realspace = 1 - for ( int i = 0 ; i < npart-1; i++ ) { - equal[i] = -Math.log10(npart); - double piece = remaining_log + log_alpha; // take a*remaining, leaving remaining-a*remaining = (1-a)*remaining - multiplicative[i] = piece; - remaining_log = remaining_log + log_oneMinusAlpha; - } - equal[npart-1] = -Math.log10(npart); - multiplicative[npart-1] = remaining_log; - Assert.assertEquals(MathUtils.approximateLog10SumLog10(equal),0.0,requiredPrecision,String.format("Did not sum to one: k=%d equal partitions.",npart)); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(multiplicative),0.0,requiredPrecision, String.format("Did not sum to one: k=%d multiplicative partitions with alpha=%f",npart,alpha)); - } - } - } - - @Test - public void testLog10sumLog10() { - final double requiredPrecision = 1E-14; - - final double log3 = 0.477121254719662; - Assert.assertEquals(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}), log3, requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0), log3, requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}, 0, 3), log3, requiredPrecision); - - final double log2 = 0.301029995663981; - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 2), log2, requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 1), 0.0, requiredPrecision); - - 
Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0}), 0.0, requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-5.15}), -5.15, requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {130.0}), 130.0, requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.145}), -0.145, requiredPrecision); - - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.12345, -0.23456}), Math.log10(Math.pow(10.0, 
-0.12345) + Math.pow(10.0, -0.23456)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), requiredPrecision); - - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, -1.0, -2.5}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 
requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), requiredPrecision); - - // magnitude of the sum doesn't matter, so we can combinatorially test this via partitions of unity - double[] mult_partitionFactor = new double[]{0.999,0.98,0.95,0.90,0.8,0.5,0.3,0.1,0.05,0.001}; - int[] n_partitions = new int[] {2,4,8,16,32,64,128,256,512,1028}; - for ( double alpha : mult_partitionFactor ) { - double log_alpha = Math.log10(alpha); - double log_oneMinusAlpha = Math.log10(1-alpha); - for ( int npart : n_partitions ) { - double[] multiplicative = new double[npart]; - double[] equal = new double[npart]; - double remaining_log = 0.0; // realspace = 1 - for ( int i = 0 ; i < npart-1; i++ ) { - equal[i] = -Math.log10(npart); - double piece = remaining_log + log_alpha; // take a*remaining, leaving remaining-a*remaining = (1-a)*remaining - multiplicative[i] = piece; - remaining_log = remaining_log + log_oneMinusAlpha; - } - equal[npart-1] = -Math.log10(npart); - multiplicative[npart-1] = remaining_log; - Assert.assertEquals(MathUtils.log10sumLog10(equal),0.0,requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(multiplicative),0.0,requiredPrecision,String.format("Did not sum to one: nPartitions=%d, alpha=%f",npart,alpha)); - } - } - } - - @Test - public void testLogDotProduct() { - Assert.assertEquals(MathUtils.logDotProduct(new double[]{-5.0,-3.0,2.0}, new double[]{6.0,7.0,8.0}),10.0,1e-3); - Assert.assertEquals(MathUtils.logDotProduct(new double[]{-5.0}, new double[]{6.0}),1.0,1e-3); - } - - @Test - public void testNormalDistribution() { - final double requiredPrecision = 1E-10; - - final Normal n = new Normal(0.0, 
1.0, null); - for( final double mu : new double[]{-5.0, -3.2, -1.5, 0.0, 1.2, 3.0, 5.8977} ) { - for( final double sigma : new double[]{1.2, 3.0, 5.8977} ) { - for( final double x : new double[]{-5.0, -3.2, -1.5, 0.0, 1.2, 3.0, 5.8977} ) { - n.setState(mu, sigma); - Assert.assertEquals(n.pdf(x), MathUtils.normalDistribution(mu, sigma, x), requiredPrecision); - Assert.assertEquals(Math.log10(n.pdf(x)), MathUtils.normalDistributionLog10(mu, sigma, x), requiredPrecision); - } - } - } - } - - @DataProvider(name = "ArrayMinData") - public Object[][] makeArrayMinData() { - List tests = new ArrayList(); - - // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{Arrays.asList(10), 10}); - tests.add(new Object[]{Arrays.asList(-10), -10}); - - for ( final List values : Utils.makePermutations(Arrays.asList(1,2,3), 3, false) ) { - tests.add(new Object[]{values, 1}); - } - - for ( final List values : Utils.makePermutations(Arrays.asList(1,2,-3), 3, false) ) { - tests.add(new Object[]{values, -3}); - } - - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ArrayMinData") - public void testArrayMinList(final List values, final int expected) { - final int actual = MathUtils.arrayMin(values); - Assert.assertEquals(actual, expected, "Failed with " + values); - } - - @Test(dataProvider = "ArrayMinData") - public void testArrayMinIntArray(final List values, final int expected) { - final int[] asArray = ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])); - final int actual = MathUtils.arrayMin(asArray); - Assert.assertEquals(actual, expected, "Failed with " + values); - } - - @Test(dataProvider = "ArrayMinData") - public void testArrayMinByteArray(final List values, final int expected) { - final byte[] asArray = new byte[values.size()]; - for ( int i = 0; i < values.size(); i++ ) asArray[i] = (byte)(values.get(i) & 0xFF); - final byte actual = MathUtils.arrayMin(asArray); - 
Assert.assertEquals(actual, (byte)(expected & 0xFF), "Failed with " + values); - } - - @Test(dataProvider = "ArrayMinData") - public void testArrayMinDoubleArray(final List values, final int expected) { - final double[] asArray = new double[values.size()]; - for ( int i = 0; i < values.size(); i++ ) asArray[i] = (double)(values.get(i)); - final double actual = MathUtils.arrayMin(asArray); - Assert.assertEquals(actual, (double)expected, "Failed with " + values); - } - - @DataProvider(name = "MedianData") - public Object[][] makeMedianData() { - final List tests = new ArrayList<>(); - - // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{Arrays.asList(10), 10}); - tests.add(new Object[]{Arrays.asList(1, 10), 10}); - - for ( final List values : Utils.makePermutations(Arrays.asList(1,2,-3), 3, false) ) { - tests.add(new Object[]{values, 1}); - } - - for ( final List values : Utils.makePermutations(Arrays.asList(1.1,2.1,-3.1), 3, false) ) { - tests.add(new Object[]{values, 1.1}); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "MedianData") - public void testMedian(final List values, final Comparable expected) { - final Comparable actual = MathUtils.median(values); - Assert.assertEquals(actual, expected, "Failed with " + values); - } - - - - // man. All this to test dirichlet. - - private double[] unwrap(List stuff) { - double[] unwrapped = new double[stuff.size()]; - int idx = 0; - for ( Double d : stuff ) { - unwrapped[idx++] = d == null ? 0.0 : d; - } - - return unwrapped; - } - - /** - * The PartitionGenerator generates all of the partitions of a number n, e.g. 
- * 5 + 0 - * 4 + 1 - * 3 + 2 - * 3 + 1 + 1 - * 2 + 2 + 1 - * 2 + 1 + 1 + 1 - * 1 + 1 + 1 + 1 + 1 - * - * This is used to help enumerate the state space over which the Dirichlet-Multinomial is defined, - * to ensure that the distribution function is properly implemented - */ - class PartitionGenerator implements Iterator> { - // generate the partitions of an integer, each partition sorted numerically - int n; - List a; - int y; - int k; - int state; - int x; - int l; - - public PartitionGenerator(int n) { - this.n = n; - this.y = n - 1; - this.k = 1; - this.a = new ArrayList(); - for ( int i = 0; i < n; i++ ) { - this.a.add(i); - } - this.state = 0; - } - - public void remove() { /* do nothing */ } - - public boolean hasNext() { return ! ( this.k == 0 && state == 0 ); } - - private String dataStr() { - return String.format("a = [%s] k = %d y = %d state = %d x = %d l = %d", - Utils.join(",",a), k, y, state, x, l); - } - - public List next() { - if ( this.state == 0 ) { - this.x = a.get(k-1)+1; - k -= 1; - this.state = 1; - } - - if ( this.state == 1 ) { - while ( 2*x <= y ) { - this.a.set(k,x); - this.y -= x; - this.k++; - } - this.l = 1+this.k; - this.state = 2; - } - - if ( this.state == 2 ) { - if ( x <= y ) { - this.a.set(k,x); - this.a.set(l,y); - x += 1; - y -= 1; - return this.a.subList(0, this.k + 2); - } else { - this.state =3; - } - } - - if ( this.state == 3 ) { - this.a.set(k,x+y); - this.y = x + y - 1; - this.state = 0; - return a.subList(0, k + 1); - } - - throw new IllegalStateException("Cannot get here"); - } - - public String toString() { - StringBuffer buf = new StringBuffer(); - buf.append("{ "); - while ( hasNext() ) { - buf.append("["); - buf.append(Utils.join(",",next())); - buf.append("],"); - } - buf.deleteCharAt(buf.lastIndexOf(",")); - buf.append(" }"); - return buf.toString(); - } - - } - - /** - * NextCounts is the enumerator over the state space of the multinomial dirichlet. 
- * - * It filters the partition of the total sum to only those with a number of terms - * equal to the number of categories. - * - * It then generates all permutations of that partition. - * - * In so doing it enumerates over the full state space. - */ - class NextCounts implements Iterator { - - private PartitionGenerator partitioner; - private int numCategories; - private int[] next; - - public NextCounts(int numCategories, int totalCounts) { - partitioner = new PartitionGenerator(totalCounts); - this.numCategories = numCategories; - next = nextFromPartitioner(); - } - - public void remove() { /* do nothing */ } - - public boolean hasNext() { return next != null; } - - public int[] next() { - int[] toReturn = clone(next); - next = nextPermutation(); - if ( next == null ) { - next = nextFromPartitioner(); - } - - return toReturn; - } - - private int[] clone(int[] arr) { - int[] a = new int[arr.length]; - for ( int idx = 0; idx < a.length ; idx ++) { - a[idx] = arr[idx]; - } - - return a; - } - - private int[] nextFromPartitioner() { - if ( partitioner.hasNext() ) { - List nxt = partitioner.next(); - while ( partitioner.hasNext() && nxt.size() > numCategories ) { - nxt = partitioner.next(); - } - - if ( nxt.size() > numCategories ) { - return null; - } else { - int[] buf = new int[numCategories]; - for ( int idx = 0; idx < nxt.size(); idx++ ) { - buf[idx] = nxt.get(idx); - } - Arrays.sort(buf); - return buf; - } - } - - return null; - } - - public int[] nextPermutation() { - return MathUtilsUnitTest.nextPermutation(next); - } - - } - - public static int[] nextPermutation(int[] next) { - // the counts can swap among each other. 
The int[] is originally in ascending order - // this generates the next array in lexicographic order descending - - // locate the last occurrence where next[k] < next[k+1] - int gt = -1; - for ( int idx = 0; idx < next.length-1; idx++) { - if ( next[idx] < next[idx+1] ) { - gt = idx; - } - } - - if ( gt == -1 ) { - return null; - } - - int largestLessThan = gt+1; - for ( int idx = 1 + largestLessThan; idx < next.length; idx++) { - if ( next[gt] < next[idx] ) { - largestLessThan = idx; - } - } - - int val = next[gt]; - next[gt] = next[largestLessThan]; - next[largestLessThan] = val; - - // reverse the tail of the array - int[] newTail = new int[next.length-gt-1]; - int ctr = 0; - for ( int idx = next.length-1; idx > gt; idx-- ) { - newTail[ctr++] = next[idx]; - } - - for ( int idx = 0; idx < newTail.length; idx++) { - next[gt+idx+1] = newTail[idx]; - } - - return next; - } - - - // before testing the dirichlet multinomial, we need to test the - // classes used to test the dirichlet multinomial - - @Test - public void testPartitioner() { - int[] numsToTest = new int[]{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20}; - int[] expectedSizes = new int[]{1, 2, 3, 5, 7, 11, 15, 22, 30, 42, 56, 77, 101, 135, 176, 231, 297, 385, 490, 627}; - for ( int testNum = 0; testNum < numsToTest.length; testNum++ ) { - PartitionGenerator gen = new PartitionGenerator(numsToTest[testNum]); - int size = 0; - while ( gen.hasNext() ) { - logger.debug(gen.dataStr()); - size += 1; - gen.next(); - } - Assert.assertEquals(size,expectedSizes[testNum], - String.format("Expected %d partitions, observed %s",expectedSizes[testNum],new PartitionGenerator(numsToTest[testNum]).toString())); - } - } - - @Test - public void testNextPermutation() { - int[] arr = new int[]{1,2,3,4}; - int[][] gens = new int[][] { - new int[]{1,2,3,4}, - new int[]{1,2,4,3}, - new int[]{1,3,2,4}, - new int[]{1,3,4,2}, - new int[]{1,4,2,3}, - new int[]{1,4,3,2}, - new int[]{2,1,3,4}, - new int[]{2,1,4,3}, - new 
int[]{2,3,1,4}, - new int[]{2,3,4,1}, - new int[]{2,4,1,3}, - new int[]{2,4,3,1}, - new int[]{3,1,2,4}, - new int[]{3,1,4,2}, - new int[]{3,2,1,4}, - new int[]{3,2,4,1}, - new int[]{3,4,1,2}, - new int[]{3,4,2,1}, - new int[]{4,1,2,3}, - new int[]{4,1,3,2}, - new int[]{4,2,1,3}, - new int[]{4,2,3,1}, - new int[]{4,3,1,2}, - new int[]{4,3,2,1} }; - for ( int gen = 0; gen < gens.length; gen ++ ) { - for ( int idx = 0; idx < 3; idx++ ) { - Assert.assertEquals(arr[idx],gens[gen][idx], - String.format("Error at generation %d, expected %s, observed %s",gen,Arrays.toString(gens[gen]),Arrays.toString(arr))); - } - arr = nextPermutation(arr); - } - } - - private double[] addEpsilon(double[] counts) { - double[] d = new double[counts.length]; - for ( int i = 0; i < counts.length; i ++ ) { - d[i] = counts[i] + 1e-3; - } - return d; - } - - @Test - public void testDirichletMultinomial() { - List testAlleles = Arrays.asList( - new double[]{80,240}, - new double[]{1,10000}, - new double[]{0,500}, - new double[]{5140,20480}, - new double[]{5000,800,200}, - new double[]{6,3,1000}, - new double[]{100,400,300,800}, - new double[]{8000,100,20,80,2}, - new double[]{90,20000,400,20,4,1280,720,1} - ); - - Assert.assertTrue(! Double.isInfinite(MathUtils.log10Gamma(1e-3)) && ! Double.isNaN(MathUtils.log10Gamma(1e-3))); - - int[] numAlleleSampled = new int[]{2,5,10,20,25}; - for ( double[] alleles : testAlleles ) { - for ( int count : numAlleleSampled ) { - // test that everything sums to one. Generate all multinomial draws - List likelihoods = new ArrayList(100000); - NextCounts generator = new NextCounts(alleles.length,count); - double maxLog = Double.MIN_VALUE; - //List countLog = new ArrayList(200); - while ( generator.hasNext() ) { - int[] thisCount = generator.next(); - //countLog.add(Arrays.toString(thisCount)); - Double likelihood = MathUtils.dirichletMultinomial(addEpsilon(alleles),thisCount); - Assert.assertTrue(! Double.isNaN(likelihood) && ! 
Double.isInfinite(likelihood), - String.format("Likelihood for counts %s and nAlleles %d was %s", - Arrays.toString(thisCount),alleles.length,Double.toString(likelihood))); - if ( likelihood > maxLog ) - maxLog = likelihood; - likelihoods.add(likelihood); - } - //System.out.printf("%d likelihoods and max is (probability) %e\n",likelihoods.size(),Math.pow(10,maxLog)); - Assert.assertEquals(MathUtils.sumLog10(unwrap(likelihoods)),1.0,1e-7, - String.format("Counts %d and alleles %d have nLikelihoods %d. \n Counts: %s", - count,alleles.length,likelihoods.size(), "NODEBUG"/*,countLog*/)); - } - } - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/QualityUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/QualityUtilsUnitTest.java deleted file mode 100644 index f5c7a14df..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/QualityUtilsUnitTest.java +++ /dev/null @@ -1,188 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: 3/21/12 - */ - -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * Basic unit test for QualityUtils class - */ -public class QualityUtilsUnitTest extends BaseTest { - final private static double TOLERANCE = 1e-9; - - @BeforeClass - public void init() { - } - - @DataProvider(name = "QualTest") - public Object[][] makeMyDataProvider() { - List tests = new ArrayList(); - - for ( int qual = 0; qual < 255; qual++ ) { - tests.add(new Object[]{(byte)(qual & 0xFF), Math.pow(10.0, ((double)qual)/-10.0)}); - } - - return tests.toArray(new Object[][]{}); - } - - /** - * Example testng test using MyDataProvider - */ - @Test(dataProvider = "QualTest") - public void testMyData(final byte qual, final double errorRate) { - final double trueRate = 1 - errorRate; - - final double actualErrorRate = QualityUtils.qualToErrorProb(qual); - Assert.assertEquals(actualErrorRate, errorRate, TOLERANCE); - final double actualTrueRate = QualityUtils.qualToProb(qual); - Assert.assertEquals(actualTrueRate, trueRate, TOLERANCE); - - // log10 tests - final double actualLog10ErrorRate = QualityUtils.qualToErrorProbLog10(qual); - Assert.assertEquals(actualLog10ErrorRate, Math.log10(errorRate), TOLERANCE); - final double actualLog10TrueRate = QualityUtils.qualToProbLog10(qual); - Assert.assertEquals(actualLog10TrueRate, Math.log10(trueRate), TOLERANCE); - - // test that we can convert our error rates to quals, accounting for boundaries - final int 
expectedQual = Math.max(Math.min(qual & 0xFF, QualityUtils.MAX_SAM_QUAL_SCORE), 1); - final byte actualQual = QualityUtils.trueProbToQual(trueRate); - Assert.assertEquals(actualQual, expectedQual & 0xFF); - final byte actualQualFromErrorRate = QualityUtils.errorProbToQual(errorRate); - Assert.assertEquals(actualQualFromErrorRate, expectedQual & 0xFF); - - for ( int maxQual = 10; maxQual < QualityUtils.MAX_SAM_QUAL_SCORE; maxQual++ ) { - final byte maxAsByte = (byte)(maxQual & 0xFF); - final byte expectedQual2 = (byte)(Math.max(Math.min(qual & 0xFF, maxQual), 1) & 0xFF); - final byte actualQual2 = QualityUtils.trueProbToQual(trueRate, maxAsByte); - Assert.assertEquals(actualQual2, expectedQual2, "Failed with max " + maxQual); - final byte actualQualFromErrorRate2 = QualityUtils.errorProbToQual(errorRate, maxAsByte); - Assert.assertEquals(actualQualFromErrorRate2, expectedQual2, "Failed with max " + maxQual); - - // test the integer routines - final byte actualQualInt2 = QualityUtils.trueProbToQual(trueRate, maxQual); - Assert.assertEquals(actualQualInt2, expectedQual2, "Failed with max " + maxQual); - final byte actualQualFromErrorRateInt2 = QualityUtils.errorProbToQual(errorRate, maxQual); - Assert.assertEquals(actualQualFromErrorRateInt2, expectedQual2, "Failed with max " + maxQual); - } - } - - @Test - public void testTrueProbWithMinDouble() { - final byte actual = QualityUtils.trueProbToQual(Double.MIN_VALUE); - Assert.assertEquals(actual, 1, "Failed to convert true prob of min double to 1 qual"); - } - - @Test - public void testTrueProbWithVerySmallValue() { - final byte actual = QualityUtils.trueProbToQual(1.7857786272673852E-19); - Assert.assertEquals(actual, 1, "Failed to convert true prob of very small value 1.7857786272673852E-19 to 1 qual"); - } - - @Test - public void testQualCaches() { - Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 20), 0.01, 1e-6); - Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 20), -2.0, 1e-6); - 
Assert.assertEquals(QualityUtils.qualToProb((byte) 20), 0.99, 1e-6); - Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 20), -0.0043648054, 1e-6); - - Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 30), 0.001, 1e-6); - Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 30), -3.0, 1e-6); - Assert.assertEquals(QualityUtils.qualToProb((byte) 30), 0.999, 1e-6); - Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 30), -0.000434511774, 1e-6); - - Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 40), 0.0001, 1e-6); - Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 40), -4.0, 1e-6); - Assert.assertEquals(QualityUtils.qualToProb((byte) 40), 0.9999, 1e-6); - Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 40), -4.34316198e-5, 1e-6); - } - - @Test() - public void testBoundingDefault() { - for ( int qual = 0; qual < 1000; qual++ ) { - final byte expected = (byte)Math.max(Math.min(qual, QualityUtils.MAX_SAM_QUAL_SCORE), 1); - Assert.assertEquals(QualityUtils.boundQual(qual), expected); - } - } - - @Test() - public void testBoundingWithMax() { - for ( int max = 10; max < 255; max += 50 ) { - for ( int qual = 0; qual < 1000; qual++ ) { - final int expected = Math.max(Math.min(qual, max), 1); - Assert.assertEquals(QualityUtils.boundQual(qual, (byte)(max & 0xFF)) & 0xFF, expected & 0xFF, "qual " + qual + " max " + max); - } - } - } - - @DataProvider(name = "PhredScaleDoubleOps") - public Object[][] makePhredDoubleTest() { - List tests = new ArrayList(); - - tests.add(new Object[]{0.0, -10 * Math.log10(Double.MIN_VALUE)}); - tests.add(new Object[]{1.0, 0.0}); - for ( int pow = 1; pow < 20; pow++ ) { - tests.add(new Object[]{Math.pow(10.0, -1.0 * pow), pow * 10}); - tests.add(new Object[]{Math.pow(10.0, -1.5 * pow), pow * 15}); - } - - return tests.toArray(new Object[][]{}); - } - - @Test() - public void testQualToErrorProbDouble() { - for ( double qual = 3.0; qual < 255.0; qual += 0.1 ) { - final double expected = 
Math.pow(10.0, qual / -10.0); - Assert.assertEquals(QualityUtils.qualToErrorProb(qual), expected, TOLERANCE, "failed qual->error prob for double qual " + qual); - } - } - - - @Test(dataProvider = "PhredScaleDoubleOps") - public void testPhredScaleDoubleOps(final double errorRate, final double expectedPhredScaled) { - final double actualError = QualityUtils.phredScaleErrorRate(errorRate); - Assert.assertEquals(actualError, expectedPhredScaled, TOLERANCE); - final double trueRate = 1 - errorRate; - final double actualTrue = QualityUtils.phredScaleCorrectRate(trueRate); - if ( trueRate == 1.0 ) { - Assert.assertEquals(actualTrue, QualityUtils.MIN_PHRED_SCALED_QUAL); - } else { - final double tol = errorRate < 1e-10 ? 10.0 : 1e-3; - Assert.assertEquals(actualTrue, expectedPhredScaled, tol); - } - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java deleted file mode 100644 index f92cd4bcf..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java +++ /dev/null @@ -1,129 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils; - -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.Arrays; -import java.util.List; -import java.util.concurrent.TimeUnit; - -public class SimpleTimerUnitTest extends BaseTest { - private final static String NAME = "unit.test.timer"; - - @Test - public void testSimpleTimer() { - SimpleTimer t = new SimpleTimer(NAME); - Assert.assertEquals(t.getName(), NAME, "Name is not the provided one"); - Assert.assertFalse(t.isRunning(), "Initial state of the timer is running"); - Assert.assertEquals(t.getElapsedTime(), 0.0, "New timer elapsed time should be 0"); - Assert.assertEquals(t.getElapsedTimeNano(), 0l, "New timer elapsed time nano should be 0"); - - t.start(); - Assert.assertTrue(t.isRunning(), "Started timer isn't running"); - Assert.assertTrue(t.getElapsedTime() >= 0.0, "Elapsed time should be >= 0"); - Assert.assertTrue(t.getElapsedTimeNano() >= 0.0, "Elapsed time nano should be >= 0"); - long n1 = t.getElapsedTimeNano(); - double t1 = t.getElapsedTime(); - idleLoop(); // idle loop to wait a tiny bit of time - long n2 = t.getElapsedTimeNano(); - double t2 = t.getElapsedTime(); - Assert.assertTrue(t2 >= t1, "T2 >= T1 for a running time"); - Assert.assertTrue(n2 >= n1, "T2 >= T1 nano for a running time"); - - t.stop(); - Assert.assertFalse(t.isRunning(), "Stopped timer still running"); - long n3 = t.getElapsedTimeNano(); - double t3 = t.getElapsedTime(); - 
idleLoop(); // idle loop to wait a tiny bit of time - double t4 = t.getElapsedTime(); - long n4 = t.getElapsedTimeNano(); - Assert.assertTrue(t4 == t3, "Elapsed times for two calls of stop timer not the same"); - Assert.assertTrue(n4 == n3, "Elapsed times for two calls of stop timer not the same"); - - t.restart(); - idleLoop(); // idle loop to wait a tiny bit of time - double t5 = t.getElapsedTime(); - long n5 = t.getElapsedTimeNano(); - Assert.assertTrue(t.isRunning(), "Restarted timer should be running"); - idleLoop(); // idle loop to wait a tiny bit of time - double t6 = t.getElapsedTime(); - long n6 = t.getElapsedTimeNano(); - Assert.assertTrue(t5 >= t4, "Restarted timer elapsed time should be after elapsed time preceding the restart"); - Assert.assertTrue(t6 >= t5, "Second elapsed time not after the first in restarted timer"); - Assert.assertTrue(n5 >= n4, "Restarted timer elapsed time nano should be after elapsed time preceding the restart"); - Assert.assertTrue(n6 >= n5, "Second elapsed time nano not after the first in restarted timer"); - - final List secondTimes = Arrays.asList(t1, t2, t3, t4, t5, t6); - final List nanoTimes = Arrays.asList(n1, n2, n3, n4, n5, n6); - for ( int i = 0; i < nanoTimes.size(); i++ ) - Assert.assertEquals( - SimpleTimer.nanoToSecondsAsDouble(nanoTimes.get(i)), - secondTimes.get(i), 1e-1, "Nanosecond and second timer disagree"); - } - - @Test - public void testNanoResolution() { - SimpleTimer t = new SimpleTimer(NAME); - - // test the nanosecond resolution - long n7 = t.currentTimeNano(); - int sum = 0; - for ( int i = 0; i < 100; i++) sum += i; - long n8 = t.currentTimeNano(); - final long delta = n8 - n7; - final long oneMilliInNano = TimeUnit.MILLISECONDS.toNanos(1); - logger.warn("nanoTime before nano operation " + n7); - logger.warn("nanoTime after nano operation of summing 100 ints " + n8 + ", sum = " + sum + " time delta " + delta + " vs. 
1 millsecond in nano " + oneMilliInNano); - Assert.assertTrue(n8 > n7, "SimpleTimer doesn't appear to have nanoSecond resolution: n8 " + n8 + " <= n7 " + n7); - Assert.assertTrue(delta < oneMilliInNano, - "SimpleTimer doesn't appear to have nanoSecond resolution: time delta is " + delta + " vs 1 millisecond in nano " + oneMilliInNano); - } - - @Test - public void testMeaningfulTimes() { - SimpleTimer t = new SimpleTimer(NAME); - - t.start(); - for ( int i = 0; i < 100; i++ ) ; - long nano = t.getElapsedTimeNano(); - double secs = t.getElapsedTime(); - - Assert.assertTrue(secs > 0, "Seconds timer doesn't appear to count properly: elapsed time is " + secs); - Assert.assertTrue(secs < 0.01, "Fast operation said to take longer than 10 milliseconds: elapsed time in seconds " + secs); - - Assert.assertTrue(nano > 0, "Nanosecond timer doesn't appear to count properly: elapsed time is " + nano); - final long maxTimeInMicro = 10000; - final long maxTimeInNano = TimeUnit.MICROSECONDS.toNanos(maxTimeInMicro); - Assert.assertTrue(nano < maxTimeInNano, "Fast operation said to take longer than " + maxTimeInMicro + " microseconds: elapsed time in nano " + nano + " micro " + TimeUnit.NANOSECONDS.toMicros(nano)); - } - - private static void idleLoop() { - for ( int i = 0; i < 100000; i++ ) ; // idle loop to wait a tiny bit of time - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java deleted file mode 100644 index cbbc8252b..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java +++ /dev/null @@ -1,234 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without 
limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.clipping; - -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.TextCigarCodec; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; - -import java.util.LinkedList; -import java.util.List; -import java.util.Stack; - -public class ReadClipperTestUtils { - //Should contain all the utils needed for tests to mass produce - //reads, cigars, and other needed classes - - final static byte [] BASES = {'A', 'C', 'T', 'G'}; - final static byte [] QUALS = {2, 15, 25, 30}; - final static String CIGAR = "4M"; - final static CigarElement[] cigarElements = { new CigarElement(1, CigarOperator.HARD_CLIP), - new CigarElement(1, CigarOperator.SOFT_CLIP), - new CigarElement(1, CigarOperator.INSERTION), - new CigarElement(1, CigarOperator.DELETION), - new CigarElement(1, CigarOperator.MATCH_OR_MISMATCH)}; - - - public static GATKSAMRecord makeReadFromCigar(Cigar cigar) { - return 
ArtificialSAMUtils.createArtificialRead(Utils.arrayFromArrayWithLength(BASES, cigar.getReadLength()), Utils.arrayFromArrayWithLength(QUALS, cigar.getReadLength()), cigar.toString()); - } - - public static GATKSAMRecord makeReadFromCigar(String cigarString) { - return makeReadFromCigar(cigarFromString(cigarString)); - } - - /** - * This function generates every valid permutation of cigar strings with a given length. - * - * A valid cigar object obeys the following rules: - * - No Hard/Soft clips in the middle of the read - * - No deletions in the beginning / end of the read - * - No repeated adjacent element (e.g. 1M2M -> this should be 3M) - * - No consecutive I/D elements - * - * @param maximumLength the maximum number of elements in the cigar - * @return a list with all valid Cigar objects - */ - public static List generateCigarList(int maximumLength) { - int numCigarElements = cigarElements.length; - LinkedList cigarList = new LinkedList(); - byte [] cigarCombination = new byte[maximumLength]; - - Utils.fillArrayWithByte(cigarCombination, (byte) 0); // we start off with all 0's in the combination array. - int currentIndex = 0; - while (true) { - Cigar cigar = createCigarFromCombination(cigarCombination); // create the cigar - cigar = combineAdjacentCigarElements(cigar); // combine adjacent elements - if (isCigarValid(cigar)) { // check if it's valid - cigarList.add(cigar); // add it - } - - boolean currentIndexChanged = false; - while (currentIndex < maximumLength && cigarCombination[currentIndex] == numCigarElements - 1) { - currentIndex++; // find the next index to increment - currentIndexChanged = true; // keep track of the fact that we have changed indices! - } - - if (currentIndex == maximumLength) // if we hit the end of the array, we're done. - break; - - cigarCombination[currentIndex]++; // otherwise advance the current index - - if (currentIndexChanged) { // if we have changed index, then... 
- for (int i = 0; i < currentIndex; i++) - cigarCombination[i] = 0; // reset everything from 0->currentIndex - currentIndex = 0; // go back to the first index - } - } - - return cigarList; - } - - private static boolean isCigarValid(Cigar cigar) { - if (cigar.isValid(null, -1) == null) { // This should take care of most invalid Cigar Strings (picard's "exhaustive" implementation) - - Stack cigarElementStack = new Stack(); // Stack to invert cigar string to find ending operator - CigarOperator startingOp = null; - CigarOperator endingOp = null; - - // check if it doesn't start with deletions - boolean readHasStarted = false; // search the list of elements for the starting operator - for (CigarElement cigarElement : cigar.getCigarElements()) { - if (!readHasStarted) { - if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) { - readHasStarted = true; - startingOp = cigarElement.getOperator(); - } - } - cigarElementStack.push(cigarElement); - } - - while (!cigarElementStack.empty()) { - CigarElement cigarElement = cigarElementStack.pop(); - if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) { - endingOp = cigarElement.getOperator(); - break; - } - } - - if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION) - return true; // we don't accept reads starting or ending in deletions (add any other constraint here) - } - - return false; - } - - private static Cigar createCigarFromCombination(byte[] cigarCombination) { - Cigar cigar = new Cigar(); - for (byte i : cigarCombination) { - cigar.add(cigarElements[i]); - } - return cigar; - } - - - /** - * Combines equal adjacent elements of a Cigar object - * - * @param rawCigar the cigar object - * @return a combined cigar object - */ - private static Cigar combineAdjacentCigarElements(Cigar rawCigar) { - Cigar combinedCigar = new Cigar(); - CigarElement lastElement = null; - int 
lastElementLength = 0; - for (CigarElement cigarElement : rawCigar.getCigarElements()) { - if (lastElement != null && - ((lastElement.getOperator() == cigarElement.getOperator()) || - (lastElement.getOperator() == CigarOperator.I && cigarElement.getOperator() == CigarOperator.D) || - (lastElement.getOperator() == CigarOperator.D && cigarElement.getOperator() == CigarOperator.I))) - lastElementLength += cigarElement.getLength(); - else - { - if (lastElement != null) - combinedCigar.add(new CigarElement(lastElementLength, lastElement.getOperator())); - - lastElement = cigarElement; - lastElementLength = cigarElement.getLength(); - } - } - if (lastElement != null) - combinedCigar.add(new CigarElement(lastElementLength, lastElement.getOperator())); - - return combinedCigar; - } - - public static GATKSAMRecord makeRead() { - return ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR); - } - - /** - * Asserts that the two reads have the same bases, qualities and cigar strings - * - * @param actual the calculated read - * @param expected the expected read - */ - public static void assertEqualReads(GATKSAMRecord actual, GATKSAMRecord expected) { - // If they're both not empty, test their contents - if(!actual.isEmpty() && !expected.isEmpty()) { - Assert.assertEquals(actual.getReadBases(), expected.getReadBases()); - Assert.assertEquals(actual.getBaseQualities(), expected.getBaseQualities()); - Assert.assertEquals(actual.getCigarString(), expected.getCigarString()); - } - // Otherwise test if they're both empty - else - Assert.assertEquals(actual.isEmpty(), expected.isEmpty()); - } - - public static Cigar invertCigar (Cigar cigar) { - Stack cigarStack = new Stack(); - for (CigarElement cigarElement : cigar.getCigarElements()) - cigarStack.push(cigarElement); - - Cigar invertedCigar = new Cigar(); - while (!cigarStack.isEmpty()) - invertedCigar.add(cigarStack.pop()); - - return invertedCigar; - } - - /** - * Checks whether or not the read has any cigar element that 
is not H or S - * - * @param read the read - * @return true if it has any M, I or D, false otherwise - */ - public static boolean readHasNonClippedBases(GATKSAMRecord read) { - for (CigarElement cigarElement : read.getCigar().getCigarElements()) - if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) - return true; - return false; - } - - public static Cigar cigarFromString(String cigarString) { - return TextCigarCodec.getSingleton().decode(cigarString); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java deleted file mode 100644 index d6bd0d4d2..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java +++ /dev/null @@ -1,436 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.clipping; - -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; - -/** - * User: roger - * Date: 9/28/11 - */ -public class ReadClipperUnitTest extends BaseTest { - private final static boolean DEBUG = false; - - List cigarList; - int maximumCigarSize = 10; // 6 is the minimum necessary number to try all combinations of cigar types with guarantee of clipping an element with length = 2 - - @BeforeClass - public void init() { - cigarList = ReadClipperTestUtils.generateCigarList(maximumCigarSize); - } - - @Test(enabled = !DEBUG) - public void testHardClipBothEndsByReferenceCoordinates() { - for (Cigar cigar : cigarList) { - GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); - int alnStart = read.getAlignmentStart(); - int alnEnd = read.getAlignmentEnd(); - int readLength = alnStart - alnEnd; - for (int i = 0; i < readLength / 2; i++) { - GATKSAMRecord clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, alnStart + i, alnEnd - i); - Assert.assertTrue(clippedRead.getAlignmentStart() >= alnStart + i, String.format("Clipped alignment start is less than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString())); - 
Assert.assertTrue(clippedRead.getAlignmentEnd() <= alnEnd + i, String.format("Clipped alignment end is greater than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString())); - assertUnclippedLimits(read, clippedRead); - } - } - } - - @Test(enabled = !DEBUG) - public void testHardClipByReadCoordinates() { - for (Cigar cigar : cigarList) { - GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); - int readLength = read.getReadLength(); - for (int i = 0; i < readLength; i++) { - GATKSAMRecord clipLeft = ReadClipper.hardClipByReadCoordinates(read, 0, i); - Assert.assertTrue(clipLeft.getReadLength() <= readLength - i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipLeft.getCigarString())); - assertUnclippedLimits(read, clipLeft); - - GATKSAMRecord clipRight = ReadClipper.hardClipByReadCoordinates(read, i, readLength - 1); - Assert.assertTrue(clipRight.getReadLength() <= i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipRight.getCigarString())); - assertUnclippedLimits(read, clipRight); - } - } - } - - @DataProvider(name = "ClippedReadLengthData") - public Object[][] makeClippedReadLengthData() { - List tests = new ArrayList(); - - // this functionality can be adapted to provide input data for whatever you might want in your data - final int originalReadLength = 50; - for ( int nToClip = 1; nToClip < originalReadLength - 1; nToClip++ ) { - tests.add(new Object[]{originalReadLength, nToClip}); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ClippedReadLengthData", enabled = !DEBUG) - public void testHardClipReadLengthIsRight(final int originalReadLength, final int nToClip) { - GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(originalReadLength + "M"); - read.getReadLength(); // provoke the caching of the read length - final int 
expectedReadLength = originalReadLength - nToClip; - GATKSAMRecord clipped = ReadClipper.hardClipByReadCoordinates(read, 0, nToClip - 1); - Assert.assertEquals(clipped.getReadLength(), expectedReadLength, - String.format("Clipped read length %d with cigar %s not equal to the expected read length %d after clipping %d bases from the left from a %d bp read with cigar %s", - clipped.getReadLength(), clipped.getCigar(), expectedReadLength, nToClip, read.getReadLength(), read.getCigar())); - } - - @Test(enabled = !DEBUG) - public void testHardClipByReferenceCoordinates() { - for (Cigar cigar : cigarList) { - GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); - int start = read.getSoftStart(); - int stop = read.getSoftEnd(); - - for (int i = start; i <= stop; i++) { - GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, i); - if (!clipLeft.isEmpty()) { - Assert.assertTrue(clipLeft.getAlignmentStart() >= Math.min(read.getAlignmentEnd(), i + 1), String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); - assertUnclippedLimits(read, clipLeft); - } - - GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, -1); - if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. 
- Assert.assertTrue(clipRight.getAlignmentEnd() <= Math.max(read.getAlignmentStart(), i - 1), String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); - assertUnclippedLimits(read, clipRight); - } - } - } - } - - @Test(enabled = !DEBUG) - public void testHardClipByReferenceCoordinatesLeftTail() { - for (Cigar cigar : cigarList) { - GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); - int alnStart = read.getAlignmentStart(); - int alnEnd = read.getAlignmentEnd(); - if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side - for (int i = alnStart; i <= alnEnd; i++) { - GATKSAMRecord clipLeft = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, i); - - if (!clipLeft.isEmpty()) { - Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); - assertUnclippedLimits(read, clipLeft); - } - } - } - } - } - - @Test(enabled = !DEBUG) - public void testHardClipByReferenceCoordinatesRightTail() { - for (Cigar cigar : cigarList) { - GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); - int alnStart = read.getAlignmentStart(); - int alnEnd = read.getAlignmentEnd(); - if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side - for (int i = alnStart; i <= alnEnd; i++) { - GATKSAMRecord clipRight = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, i); - if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. 
- Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); - assertUnclippedLimits(read, clipRight); - } - } - } - } - } - - @Test(enabled = !DEBUG) - public void testHardClipLowQualEnds() { - final byte LOW_QUAL = 2; - final byte HIGH_QUAL = 30; - - /** create a read for every cigar permutation */ - for (Cigar cigar : cigarList) { - GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); - int readLength = read.getReadLength(); - byte[] quals = new byte[readLength]; - - for (int nLowQualBases = 0; nLowQualBases < readLength; nLowQualBases++) { - - /** create a read with nLowQualBases in the left tail */ - Utils.fillArrayWithByte(quals, HIGH_QUAL); - for (int addLeft = 0; addLeft < nLowQualBases; addLeft++) - quals[addLeft] = LOW_QUAL; - read.setBaseQualities(quals); - GATKSAMRecord clipLeft = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); - checkClippedReadsForLowQualEnds(read, clipLeft, LOW_QUAL, nLowQualBases); - - /** create a read with nLowQualBases in the right tail */ - Utils.fillArrayWithByte(quals, HIGH_QUAL); - for (int addRight = 0; addRight < nLowQualBases; addRight++) - quals[readLength - addRight - 1] = LOW_QUAL; - read.setBaseQualities(quals); - GATKSAMRecord clipRight = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); - checkClippedReadsForLowQualEnds(read, clipRight, LOW_QUAL, nLowQualBases); - - /** create a read with nLowQualBases on both tails */ - if (nLowQualBases <= readLength / 2) { - Utils.fillArrayWithByte(quals, HIGH_QUAL); - for (int addBoth = 0; addBoth < nLowQualBases; addBoth++) { - quals[addBoth] = LOW_QUAL; - quals[readLength - addBoth - 1] = LOW_QUAL; - } - read.setBaseQualities(quals); - GATKSAMRecord clipBoth = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); - checkClippedReadsForLowQualEnds(read, clipBoth, LOW_QUAL, 2*nLowQualBases); - } - } - } 
- } - - @Test(enabled = !DEBUG) - public void testHardClipSoftClippedBases() { - for (Cigar cigar : cigarList) { - GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); - GATKSAMRecord clippedRead = ReadClipper.hardClipSoftClippedBases(read); - CigarCounter original = new CigarCounter(read); - CigarCounter clipped = new CigarCounter(clippedRead); - - assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed - original.assertHardClippingSoftClips(clipped); // Make sure we have only clipped SOFT_CLIPS - } - } - - @Test(enabled = false) - public void testHardClipLeadingInsertions() { - for (Cigar cigar : cigarList) { - if (startsWithInsertion(cigar)) { - GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); - GATKSAMRecord clippedRead = ReadClipper.hardClipLeadingInsertions(read); - - assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed - - int expectedLength = read.getReadLength() - leadingCigarElementLength(read.getCigar(), CigarOperator.INSERTION); - if (cigarHasElementsDifferentThanInsertionsAndHardClips(read.getCigar())) - expectedLength -= leadingCigarElementLength(ReadClipperTestUtils.invertCigar(read.getCigar()), CigarOperator.INSERTION); - - if (!clippedRead.isEmpty()) { - Assert.assertEquals(expectedLength, clippedRead.getReadLength(), String.format("%s -> %s", read.getCigarString(), clippedRead.getCigarString())); // check that everything else is still there - Assert.assertFalse(startsWithInsertion(clippedRead.getCigar())); // check that the insertions are gone - } else - Assert.assertTrue(expectedLength == 0, String.format("expected length: %d", expectedLength)); // check that the read was expected to be fully clipped - } - } - } - - @Test(enabled = !DEBUG) - public void testRevertSoftClippedBases() { - for (Cigar cigar : cigarList) { - final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP); - final int tailSoftClips = 
leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP); - - final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); - final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read); - - assertUnclippedLimits(read, unclipped); // Make sure limits haven't changed - - if (leadingSoftClips > 0 || tailSoftClips > 0) { - final int expectedStart = read.getAlignmentStart() - leadingSoftClips; - final int expectedEnd = read.getAlignmentEnd() + tailSoftClips; - - Assert.assertEquals(unclipped.getAlignmentStart(), expectedStart); - Assert.assertEquals(unclipped.getAlignmentEnd(), expectedEnd); - } else - Assert.assertEquals(read.getCigarString(), unclipped.getCigarString()); - } - } - - @Test(enabled = !DEBUG) - public void testRevertSoftClippedBasesWithThreshold() { - for (Cigar cigar : cigarList) { - final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP); - final int tailSoftClips = leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP); - - final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); - final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read); - - assertUnclippedLimits(read, unclipped); // Make sure limits haven't changed - Assert.assertNull(read.getCigar().isValid(null, -1)); - Assert.assertNull(unclipped.getCigar().isValid(null, -1)); - - if (!(leadingSoftClips > 0 || tailSoftClips > 0)) - Assert.assertEquals(read.getCigarString(), unclipped.getCigarString()); - - } - } - - @DataProvider(name = "RevertSoftClipsBeforeContig") - public Object[][] makeRevertSoftClipsBeforeContig() { - List tests = new ArrayList<>(); - - // this functionality can be adapted to provide input data for whatever you might want in your data - for ( int softStart : Arrays.asList(-10, -1, 0) ) { - for ( int alignmentStart : Arrays.asList(1, 10) ) { - tests.add(new Object[]{softStart, alignmentStart}); - } - } - - return 
tests.toArray(new Object[][]{}); - } - - @Test(enabled = true, dataProvider = "RevertSoftClipsBeforeContig") - public void testRevertSoftClippedBasesBeforeStartOfContig(final int softStart, final int alignmentStart) { - final int nMatches = 10; - final int nSoft = -1 * (softStart - alignmentStart); - final String cigar = nSoft + "S" + nMatches + "M"; - final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); - read.setAlignmentStart(alignmentStart); - - Assert.assertEquals(read.getSoftStart(), softStart); - Assert.assertEquals(read.getAlignmentStart(), alignmentStart); - Assert.assertEquals(read.getCigarString(), cigar); - - final GATKSAMRecord reverted = ReadClipper.revertSoftClippedBases(read); - - final int expectedAlignmentStart = 1; - final String expectedCigar = (1 - softStart) + "H" + read.getAlignmentEnd() + "M"; - Assert.assertEquals(reverted.getSoftStart(), expectedAlignmentStart); - Assert.assertEquals(reverted.getAlignmentStart(), expectedAlignmentStart); - Assert.assertEquals(reverted.getCigarString(), expectedCigar); - } - - private void assertNoLowQualBases(GATKSAMRecord read, byte low_qual) { - if (!read.isEmpty()) { - byte[] quals = read.getBaseQualities(); - for (int i = 0; i < quals.length; i++) - Assert.assertFalse(quals[i] <= low_qual, String.format("Found low qual (%d) base after hard clipping. 
Position: %d -- %s", low_qual, i, read.getCigarString())); - } - } - - private void checkClippedReadsForLowQualEnds(GATKSAMRecord read, GATKSAMRecord clippedRead, byte lowQual, int nLowQualBases) { - assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed - assertNoLowQualBases(clippedRead, lowQual); // Make sure the low qualities are gone - } - - /** - * Asserts that clipping doesn't change the getUnclippedStart / getUnclippedEnd - * - * @param original original read - * @param clipped clipped read - */ - private void assertUnclippedLimits(GATKSAMRecord original, GATKSAMRecord clipped) { - if (ReadClipperTestUtils.readHasNonClippedBases(clipped)) { - Assert.assertEquals(original.getUnclippedStart(), clipped.getUnclippedStart()); - Assert.assertEquals(original.getUnclippedEnd(), clipped.getUnclippedEnd()); - } - } - - private boolean startsWithInsertion(Cigar cigar) { - return leadingCigarElementLength(cigar, CigarOperator.INSERTION) > 0; - } - - private int leadingCigarElementLength(Cigar cigar, CigarOperator operator) { - for (CigarElement cigarElement : cigar.getCigarElements()) { - if (cigarElement.getOperator() == operator) - return cigarElement.getLength(); - if (cigarElement.getOperator() != CigarOperator.HARD_CLIP) - break; - } - return 0; - } - - private boolean cigarHasElementsDifferentThanInsertionsAndHardClips(Cigar cigar) { - for (CigarElement cigarElement : cigar.getCigarElements()) - if (cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.HARD_CLIP) - return true; - return false; - } - - private class CigarCounter { - private HashMap counter; - - public Integer getCounterForOp(CigarOperator operator) { - return counter.get(operator); - } - - public CigarCounter(GATKSAMRecord read) { - CigarOperator[] operators = CigarOperator.values(); - counter = new HashMap(operators.length); - - for (CigarOperator op : operators) - counter.put(op, 0); - - for (CigarElement cigarElement : 
read.getCigar().getCigarElements()) - counter.put(cigarElement.getOperator(), counter.get(cigarElement.getOperator()) + cigarElement.getLength()); - } - - public boolean assertHardClippingSoftClips(CigarCounter clipped) { - for (CigarOperator op : counter.keySet()) { - if (op == CigarOperator.HARD_CLIP || op == CigarOperator.SOFT_CLIP) { - int counterTotal = counter.get(CigarOperator.HARD_CLIP) + counter.get(CigarOperator.SOFT_CLIP); - int clippedHard = clipped.getCounterForOp(CigarOperator.HARD_CLIP); - int clippedSoft = clipped.getCounterForOp(CigarOperator.SOFT_CLIP); - - Assert.assertEquals(counterTotal, clippedHard); - Assert.assertTrue(clippedSoft == 0); - } else - Assert.assertEquals(counter.get(op), clipped.getCounterForOp(op)); - } - return true; - } - - } - - @Test(enabled = !DEBUG) - public void testHardClipReducedRead() { - GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar("10M"); - final int[] counts = new int[read.getReadLength()]; - for ( int i = 0; i < counts.length; i++ ) counts[i] = i; - read.setReducedReadCounts(counts); - int alnStart = read.getAlignmentStart(); - int alnEnd = read.getAlignmentEnd(); - int readLength = read.getReadLength(); - for (int i = 0; i < readLength / 2; i++) { - GATKSAMRecord clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, alnStart + i, alnEnd - i); - final int[] expectedReducedCounts = Arrays.copyOfRange(counts, i + 1, readLength - i - 1); - Assert.assertEquals(clippedRead.getReducedReadCounts(), expectedReducedCounts); - } - } - - @Test(enabled = !DEBUG) - public void testRevertEntirelySoftclippedReads() { - GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar("2H1S3H"); - GATKSAMRecord clippedRead = ReadClipper.revertSoftClippedBases(read); - Assert.assertEquals(clippedRead.getAlignmentStart(), read.getSoftStart()); - } - -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java 
b/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java deleted file mode 100644 index 64a71f060..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java +++ /dev/null @@ -1,326 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils.io; - -import org.apache.commons.io.FileUtils; -import org.broadinstitute.sting.BaseTest; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import java.util.Random; - -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -public class IOUtilsUnitTest extends BaseTest { - @Test - public void testGoodTempDir() { - IOUtils.checkTempDir(new File("/tmp/queue")); - } - - @Test(expectedExceptions=UserException.BadTmpDir.class) - public void testBadTempDir() { - IOUtils.checkTempDir(new File("/tmp")); - } - - @Test - public void testAbsoluteSubDir() { - File subDir = IOUtils.absolute(new File("."), new File("/path/to/file")); - Assert.assertEquals(subDir, new File("/path/to/file")); - - subDir = IOUtils.absolute(new File("/different/path"), new File("/path/to/file")); - Assert.assertEquals(subDir, new File("/path/to/file")); - - subDir = IOUtils.absolute(new File("/different/path"), new File(".")); - Assert.assertEquals(subDir, new File("/different/path")); - } - - @Test - public void testRelativeSubDir() throws IOException { - File subDir = IOUtils.absolute(new File("."), new File("path/to/file")); - Assert.assertEquals(subDir.getCanonicalFile(), new File("path/to/file").getCanonicalFile()); - - subDir = IOUtils.absolute(new File("/different/path"), new File("path/to/file")); - Assert.assertEquals(subDir, new File("/different/path/path/to/file")); - } - - @Test - public void testDottedSubDir() throws IOException { - File subDir = IOUtils.absolute(new File("."), new File("path/../to/file")); - Assert.assertEquals(subDir.getCanonicalFile(), new 
File("path/../to/./file").getCanonicalFile()); - - subDir = IOUtils.absolute(new File("."), new File("/path/../to/file")); - Assert.assertEquals(subDir, new File("/path/../to/file")); - - subDir = IOUtils.absolute(new File("/different/../path"), new File("path/to/file")); - Assert.assertEquals(subDir, new File("/different/../path/path/to/file")); - - subDir = IOUtils.absolute(new File("/different/./path"), new File("/path/../to/file")); - Assert.assertEquals(subDir, new File("/path/../to/file")); - } - - @Test - public void testTempDir() { - File tempDir = IOUtils.tempDir("Q-Unit-Test", "", new File("queueTempDirToDelete")); - Assert.assertTrue(tempDir.exists()); - Assert.assertFalse(tempDir.isFile()); - Assert.assertTrue(tempDir.isDirectory()); - boolean deleted = IOUtils.tryDelete(tempDir); - Assert.assertTrue(deleted); - Assert.assertFalse(tempDir.exists()); - } - - @Test - public void testDirLevel() { - File dir = IOUtils.dirLevel(new File("/path/to/directory"), 1); - Assert.assertEquals(dir, new File("/path")); - - dir = IOUtils.dirLevel(new File("/path/to/directory"), 2); - Assert.assertEquals(dir, new File("/path/to")); - - dir = IOUtils.dirLevel(new File("/path/to/directory"), 3); - Assert.assertEquals(dir, new File("/path/to/directory")); - - dir = IOUtils.dirLevel(new File("/path/to/directory"), 4); - Assert.assertEquals(dir, new File("/path/to/directory")); - } - - @Test - public void testAbsolute() { - File dir = IOUtils.absolute(new File("/path/./to/./directory/.")); - Assert.assertEquals(dir, new File("/path/to/directory")); - - dir = IOUtils.absolute(new File("/")); - Assert.assertEquals(dir, new File("/")); - - dir = IOUtils.absolute(new File("/.")); - Assert.assertEquals(dir, new File("/")); - - dir = IOUtils.absolute(new File("/././.")); - Assert.assertEquals(dir, new File("/")); - - dir = IOUtils.absolute(new File("/./directory/.")); - Assert.assertEquals(dir, new File("/directory")); - - dir = IOUtils.absolute(new File("/./directory/./")); - 
Assert.assertEquals(dir, new File("/directory")); - - dir = IOUtils.absolute(new File("/./directory./")); - Assert.assertEquals(dir, new File("/directory.")); - - dir = IOUtils.absolute(new File("/./.directory/")); - Assert.assertEquals(dir, new File("/.directory")); - } - - @Test - public void testTail() throws IOException { - List lines = Arrays.asList( - "chr18_random 4262 3154410390 50 51", - "chr19_random 301858 3154414752 50 51", - "chr21_random 1679693 3154722662 50 51", - "chr22_random 257318 3156435963 50 51", - "chrX_random 1719168 3156698441 50 51"); - List tail = IOUtils.tail(new File(BaseTest.hg18Reference + ".fai"), 5); - Assert.assertEquals(tail.size(), 5); - for (int i = 0; i < 5; i++) - Assert.assertEquals(tail.get(i), lines.get(i)); - } - - @Test - public void testWriteSystemFile() throws IOException { - File temp = createTempFile("temp.", ".properties"); - try { - IOUtils.writeResource(new Resource("StingText.properties", null), temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test - public void testWriteSystemTempFile() throws IOException { - File temp = IOUtils.writeTempResource(new Resource("StingText.properties", null)); - try { - Assert.assertTrue(temp.getName().startsWith("StingText"), "File does not start with 'StingText.': " + temp); - Assert.assertTrue(temp.getName().endsWith(".properties"), "File does not end with '.properties': " + temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testMissingSystemFile() throws IOException { - File temp = createTempFile("temp.", ".properties"); - try { - IOUtils.writeResource(new Resource("MissingStingText.properties", null), temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test - public void testWriteRelativeFile() throws IOException { - File temp = createTempFile("temp.", ".properties"); - try { - IOUtils.writeResource(new Resource("/StingText.properties", IOUtils.class), 
temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test - public void testWriteRelativeTempFile() throws IOException { - File temp = IOUtils.writeTempResource(new Resource("/StingText.properties", IOUtils.class)); - try { - Assert.assertTrue(temp.getName().startsWith("StingText"), "File does not start with 'StingText.': " + temp); - Assert.assertTrue(temp.getName().endsWith(".properties"), "File does not end with '.properties': " + temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testMissingRelativeFile() throws IOException { - File temp = createTempFile("temp.", ".properties"); - try { - // Looking for /org/broadinstitute/sting/utils/file/StingText.properties - IOUtils.writeResource(new Resource("StingText.properties", IOUtils.class), temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test - public void testResourceProperties() { - Resource resource = new Resource("foo", Resource.class); - Assert.assertEquals(resource.getPath(), "foo"); - Assert.assertEquals(resource.getRelativeClass(), Resource.class); - } - - @Test - public void testIsSpecialFile() { - Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev"))); - Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/null"))); - Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/full"))); - Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/stdout"))); - Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/stderr"))); - Assert.assertFalse(IOUtils.isSpecialFile(null)); - Assert.assertFalse(IOUtils.isSpecialFile(new File("/home/user/my.file"))); - Assert.assertFalse(IOUtils.isSpecialFile(new File("/devfake/null"))); - } - - @DataProvider( name = "ByteArrayIOTestData") - public Object[][] byteArrayIOTestDataProvider() { - return new Object[][] { - // file size, read buffer size - { 0, 4096 }, - { 1, 4096 }, - { 2000, 4096 }, - { 4095, 4096 }, - { 4096, 4096 }, - { 4097, 4096 }, - { 
6000, 4096 }, - { 8191, 4096 }, - { 8192, 4096 }, - { 8193, 4096 }, - { 10000, 4096 } - }; - } - - @Test( dataProvider = "ByteArrayIOTestData" ) - public void testWriteThenReadFileIntoByteArray ( int fileSize, int readBufferSize ) throws Exception { - File tempFile = createTempFile(String.format("testWriteThenReadFileIntoByteArray_%d_%d", fileSize, readBufferSize), "tmp"); - - byte[] dataWritten = getDeterministicRandomData(fileSize); - IOUtils.writeByteArrayToFile(dataWritten, tempFile); - byte[] dataRead = IOUtils.readFileIntoByteArray(tempFile, readBufferSize); - - Assert.assertEquals(dataRead.length, dataWritten.length); - Assert.assertTrue(Arrays.equals(dataRead, dataWritten)); - } - - @Test( dataProvider = "ByteArrayIOTestData" ) - public void testWriteThenReadStreamIntoByteArray ( int fileSize, int readBufferSize ) throws Exception { - File tempFile = createTempFile(String.format("testWriteThenReadStreamIntoByteArray_%d_%d", fileSize, readBufferSize), "tmp"); - - byte[] dataWritten = getDeterministicRandomData(fileSize); - IOUtils.writeByteArrayToStream(dataWritten, new FileOutputStream(tempFile)); - byte[] dataRead = IOUtils.readStreamIntoByteArray(new FileInputStream(tempFile), readBufferSize); - - Assert.assertEquals(dataRead.length, dataWritten.length); - Assert.assertTrue(Arrays.equals(dataRead, dataWritten)); - } - - @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) - public void testReadNonExistentFileIntoByteArray() { - File nonExistentFile = new File("djfhsdkjghdfk"); - Assert.assertFalse(nonExistentFile.exists()); - - IOUtils.readFileIntoByteArray(nonExistentFile); - } - - @Test( expectedExceptions = ReviewedStingException.class ) - public void testReadNullStreamIntoByteArray() { - IOUtils.readStreamIntoByteArray(null); - } - - @Test( expectedExceptions = ReviewedStingException.class ) - public void testReadStreamIntoByteArrayInvalidBufferSize() throws Exception { - IOUtils.readStreamIntoByteArray(new 
FileInputStream(createTempFile("testReadStreamIntoByteArrayInvalidBufferSize", "tmp")), - -1); - } - - @Test( expectedExceptions = UserException.CouldNotCreateOutputFile.class ) - public void testWriteByteArrayToUncreatableFile() { - IOUtils.writeByteArrayToFile(new byte[]{0}, new File("/dev/foo/bar")); - } - - @Test( expectedExceptions = ReviewedStingException.class ) - public void testWriteNullByteArrayToFile() { - IOUtils.writeByteArrayToFile(null, createTempFile("testWriteNullByteArrayToFile", "tmp")); - } - - @Test( expectedExceptions = ReviewedStingException.class ) - public void testWriteByteArrayToNullStream() { - IOUtils.writeByteArrayToStream(new byte[]{0}, null); - } - - private byte[] getDeterministicRandomData ( int size ) { - GenomeAnalysisEngine.resetRandomGenerator(); - Random rand = GenomeAnalysisEngine.getRandomGenerator(); - - byte[] randomData = new byte[size]; - rand.nextBytes(randomData); - - return randomData; - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/PileupElementUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/pileup/PileupElementUnitTest.java deleted file mode 100644 index 888ab7f7f..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/pileup/PileupElementUnitTest.java +++ /dev/null @@ -1,191 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.pileup; - -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.locusiterator.AlignmentStateMachine; -import org.broadinstitute.sting.utils.locusiterator.LIBS_position; -import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByStateBaseTest; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; - -/** - * testing of the new (non-legacy) version of LocusIteratorByState - */ -public class PileupElementUnitTest extends LocusIteratorByStateBaseTest { - @DataProvider(name = "PileupElementTest") - public Object[][] makePileupElementTest() { -// return new Object[][]{{new LIBSTest("2X2D2P2X")}}; -// return createLIBSTests( -// Arrays.asList(2), -// Arrays.asList(2)); - return createLIBSTests( - Arrays.asList(1, 2), - Arrays.asList(1, 2, 3, 4)); - } - - @Test(dataProvider = "PileupElementTest") - public void testPileupElementTest(LIBSTest params) { - final GATKSAMRecord read = params.makeRead(); - final AlignmentStateMachine state = new AlignmentStateMachine(read); - final LIBS_position tester = new LIBS_position(read); - - 
while ( state.stepForwardOnGenome() != null ) { - tester.stepForwardOnGenome(); - final PileupElement pe = state.makePileupElement(); - - Assert.assertEquals(pe.getRead(), read); - Assert.assertEquals(pe.getMappingQual(), read.getMappingQuality()); - Assert.assertEquals(pe.getOffset(), state.getReadOffset()); - - Assert.assertEquals(pe.isDeletion(), state.getCigarOperator() == CigarOperator.D); - Assert.assertEquals(pe.isAfterInsertion(), tester.isAfterInsertion); - Assert.assertEquals(pe.isBeforeInsertion(), tester.isBeforeInsertion); - Assert.assertEquals(pe.isNextToSoftClip(), tester.isNextToSoftClip); - - if ( ! hasNeighboringPaddedOps(params.getElements(), pe.getCurrentCigarOffset()) ) { - Assert.assertEquals(pe.isAfterDeletionEnd(), tester.isAfterDeletionEnd); - Assert.assertEquals(pe.isBeforeDeletionStart(), tester.isBeforeDeletionStart); - } - - - - Assert.assertEquals(pe.atEndOfCurrentCigar(), state.getOffsetIntoCurrentCigarElement() == state.getCurrentCigarElement().getLength() - 1, "atEndOfCurrentCigar failed"); - Assert.assertEquals(pe.atStartOfCurrentCigar(), state.getOffsetIntoCurrentCigarElement() == 0, "atStartOfCurrentCigar failed"); - - Assert.assertEquals(pe.getBase(), pe.isDeletion() ? PileupElement.DELETION_BASE : read.getReadBases()[state.getReadOffset()]); - Assert.assertEquals(pe.getQual(), pe.isDeletion() ? 
PileupElement.DELETION_QUAL : read.getBaseQualities()[state.getReadOffset()]); - - Assert.assertEquals(pe.getCurrentCigarElement(), state.getCurrentCigarElement()); - Assert.assertEquals(pe.getCurrentCigarOffset(), state.getCurrentCigarElementOffset()); - - // tested in libs - //pe.getLengthOfImmediatelyFollowingIndel(); - //pe.getBasesOfImmediatelyFollowingInsertion(); - - // Don't test -- pe.getBaseIndex(); - if ( pe.atEndOfCurrentCigar() && state.getCurrentCigarElementOffset() < read.getCigarLength() - 1 ) { - final CigarElement nextElement = read.getCigar().getCigarElement(state.getCurrentCigarElementOffset() + 1); - if ( nextElement.getOperator() == CigarOperator.I ) { - Assert.assertTrue(pe.getBetweenNextPosition().size() >= 1); - Assert.assertEquals(pe.getBetweenNextPosition().get(0), nextElement); - } - if ( nextElement.getOperator() == CigarOperator.M ) { - Assert.assertTrue(pe.getBetweenNextPosition().isEmpty()); - } - } else { - Assert.assertTrue(pe.getBetweenNextPosition().isEmpty()); - } - - if ( pe.atStartOfCurrentCigar() && state.getCurrentCigarElementOffset() > 0 ) { - final CigarElement prevElement = read.getCigar().getCigarElement(state.getCurrentCigarElementOffset() - 1); - if ( prevElement.getOperator() == CigarOperator.I ) { - Assert.assertTrue(pe.getBetweenPrevPosition().size() >= 1); - Assert.assertEquals(pe.getBetweenPrevPosition().getLast(), prevElement); - } - if ( prevElement.getOperator() == CigarOperator.M ) { - Assert.assertTrue(pe.getBetweenPrevPosition().isEmpty()); - } - } else { - Assert.assertTrue(pe.getBetweenPrevPosition().isEmpty()); - } - - // TODO -- add meaningful tests - pe.getBaseInsertionQual(); - pe.getBaseDeletionQual(); - pe.getRepresentativeCount(); - } - } - - - @DataProvider(name = "PrevAndNextTest") - public Object[][] makePrevAndNextTest() { - final List tests = new LinkedList(); - - final List operators = Arrays.asList(CigarOperator.I, CigarOperator.P, CigarOperator.S); - - for ( final CigarOperator firstOp : 
Arrays.asList(CigarOperator.M) ) { - for ( final CigarOperator lastOp : Arrays.asList(CigarOperator.M, CigarOperator.D) ) { - for ( final int nIntermediate : Arrays.asList(1, 2, 3) ) { - for ( final List combination : Utils.makePermutations(operators, nIntermediate, false) ) { - final int readLength = 2 + combination.size(); - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, readLength); - read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); - read.setBaseQualities(Utils.dupBytes((byte) 30, readLength)); - - String cigar = "1" + firstOp; - for ( final CigarOperator op : combination ) cigar += "1" + op; - cigar += "1" + lastOp; - read.setCigarString(cigar); - - tests.add(new Object[]{read, firstOp, lastOp, combination}); - } - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "PrevAndNextTest") - public void testPrevAndNextTest(final GATKSAMRecord read, final CigarOperator firstOp, final CigarOperator lastOp, final List ops) { - final AlignmentStateMachine state = new AlignmentStateMachine(read); - - state.stepForwardOnGenome(); - final PileupElement pe = state.makePileupElement(); - Assert.assertEquals(pe.getBetweenNextPosition().size(), ops.size()); - Assert.assertEquals(pe.getBetweenPrevPosition().size(), 0); - assertEqualsOperators(pe.getBetweenNextPosition(), ops); - Assert.assertEquals(pe.getPreviousOnGenomeCigarElement(), null); - Assert.assertNotNull(pe.getNextOnGenomeCigarElement()); - Assert.assertEquals(pe.getNextOnGenomeCigarElement().getOperator(), lastOp); - - state.stepForwardOnGenome(); - final PileupElement pe2 = state.makePileupElement(); - Assert.assertEquals(pe2.getBetweenPrevPosition().size(), ops.size()); - Assert.assertEquals(pe2.getBetweenNextPosition().size(), 0); - assertEqualsOperators(pe2.getBetweenPrevPosition(), ops); - Assert.assertNotNull(pe2.getPreviousOnGenomeCigarElement()); - Assert.assertEquals(pe2.getPreviousOnGenomeCigarElement().getOperator(), firstOp); - 
Assert.assertEquals(pe2.getNextOnGenomeCigarElement(), null); - } - - private void assertEqualsOperators(final List elements, final List ops) { - for ( int i = 0; i < elements.size(); i++ ) { - Assert.assertEquals(elements.get(i).getOperator(), ops.get(i), "elements doesn't have expected operator at position " + i); - } - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java deleted file mode 100644 index 767646963..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java +++ /dev/null @@ -1,121 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils.progressmeter; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; - -/** - * UnitTests for the ProgressMeterDaemon - * - * User: depristo - * Date: 8/24/12 - * Time: 11:25 AM - * To change this template use File | Settings | File Templates. - */ -public class ProgressMeterDaemonUnitTest extends BaseTest { - private GenomeLocParser genomeLocParser; - - @BeforeClass - public void init() throws FileNotFoundException { - genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(b37KGReference))); - } - - // capture and count calls to progress - private class TestingProgressMeter extends ProgressMeter { - final List progressCalls = new LinkedList(); - - private TestingProgressMeter(final long poll) { - super(null, "test", new GenomeLocSortedSet(genomeLocParser), poll); - super.start(); - } - - @Override - protected synchronized void printProgress(boolean mustPrint) { - progressCalls.add(System.currentTimeMillis()); - } - } - - @DataProvider(name = "PollingData") - public Object[][] makePollingData() { - List tests = new ArrayList(); - for ( final int ticks : Arrays.asList(1, 5, 10) ) { - for ( final int poll : Arrays.asList(10, 100) ) { - tests.add(new Object[]{poll, ticks}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test - public void testPeriodUpdateNano() { - final ProgressMeter meter = new TestingProgressMeter(10); - final long currentTime = meter.getRuntimeInNanoseconds(); - 
meter.updateElapsedTimeInNanoseconds(); - Assert.assertTrue( meter.getRuntimeInNanosecondsUpdatedPeriodically() > currentTime, "Updating the periodic runtime failed" ); - } - - @Test(dataProvider = "PollingData", invocationCount = 10, successPercentage = 90) - public void testProgressMeterDaemon(final long poll, final int ticks) throws InterruptedException { - final TestingProgressMeter meter = new TestingProgressMeter(poll); - final ProgressMeterDaemon daemon = meter.getProgressMeterDaemon(); - - Assert.assertTrue(daemon.isDaemon()); - - Assert.assertFalse(daemon.isDone()); - Thread.sleep(ticks * poll); - Assert.assertFalse(daemon.isDone()); - - daemon.done(); - Assert.assertTrue(daemon.isDone()); - - // wait for the thread to actually finish - daemon.join(); - - Assert.assertTrue(meter.progressCalls.size() >= 1, - "Expected at least one progress update call from daemon thread, but only got " + meter.progressCalls.size() + " with exact calls " + meter.progressCalls); - - final int tolerance = (int)Math.ceil(0.8 * meter.progressCalls.size()); - Assert.assertTrue(Math.abs(meter.progressCalls.size() - ticks) <= tolerance, - "Expected " + ticks + " progress calls from daemon thread, but got " + meter.progressCalls.size() + " and a tolerance of only " + tolerance); - - Assert.assertTrue(meter.getRuntimeInNanosecondsUpdatedPeriodically() > 0, "Daemon should have updated our periodic runtime"); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java deleted file mode 100644 index e9af685a6..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java +++ /dev/null @@ -1,232 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without 
-* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.sam; - -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.List; - - -public class GATKSAMRecordUnitTest extends BaseTest { - GATKSAMRecord read, reducedRead; - final static String BASES = "ACTG"; - final static String QUALS = "!+5?"; - final private static int[] REDUCED_READ_COUNTS = new int[]{10, 20, 30, 40}; - - @BeforeClass - public void init() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length()); - read.setReadUnmappedFlag(true); - read.setReadBases(new String(BASES).getBytes()); - read.setBaseQualityString(new String(QUALS)); - - reducedRead = 
ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); - reducedRead.setReadBases(BASES.getBytes()); - reducedRead.setBaseQualityString(QUALS); - reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS); - } - - @Test - public void testReducedReads() { - reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS); - - Assert.assertFalse(read.isReducedRead(), "isReducedRead is false for normal read"); - Assert.assertEquals(read.getReducedReadCounts(), null, "No reduced read tag in normal read"); - - Assert.assertTrue(reducedRead.isReducedRead(), "isReducedRead is true for reduced read"); - for (int i = 0; i < reducedRead.getReadLength(); i++) { - Assert.assertEquals(reducedRead.getReducedCount(i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i); - } - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testGetReducedCountOnNormalRead() { - read.getReducedCount(0); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testSetReducedTagOnNormalRead() { - read.setReducedCount(0, 2); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testAdjustReducedCountToNegativeNumber() { - reducedRead.setReducedCount(0, 1); - reducedRead.adjustReducedCount(0, -2); - } - - @Test - public void testSetReducedCountOnReducedRead() { - for (int i = 0; i < reducedRead.getReadLength(); i++) { - final byte newCount = (byte)i; - reducedRead.setReducedCount(i, newCount); - Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i); - } - - for (int i = 0; i < reducedRead.getReadLength(); i++) { - final int newCount = reducedRead.getReducedCount(i) + i; - reducedRead.adjustReducedCount(i, i); - Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i); - } - } - - @Test - public void testReducedReadEncodeAndDecode() { - - 
// encode - byte[] encoded = GATKSAMRecord.encodeReduceReadCounts(REDUCED_READ_COUNTS); - - // decode - int[] decoded = GATKSAMRecord.decodeReduceReadCounts(encoded); - - // for the heck of it, let's encode and decode again! - encoded = GATKSAMRecord.encodeReduceReadCounts(decoded); - decoded = GATKSAMRecord.decodeReduceReadCounts(encoded); - - for (int i = 0; i < decoded.length; i++) - Assert.assertEquals(decoded[i], REDUCED_READ_COUNTS[i]); - } - - @Test - public void testByteBoundsOnReducedTag() { - reducedRead.setReducedCount(0, 1000); - reducedRead.setReducedReadCountsTag(); - reducedRead.adjustReducedCount(0, -255); - Assert.assertEquals(reducedRead.getReducedCount(0), 0); - } - - @Test - public void testReducedReadPileupElement() { - reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS); - - PileupElement readp = LocusIteratorByState.createPileupForReadAndOffset(read, 0); - PileupElement reducedreadp = LocusIteratorByState.createPileupForReadAndOffset(reducedRead, 0); - - Assert.assertFalse(readp.getRead().isReducedRead()); - - Assert.assertTrue(reducedreadp.getRead().isReducedRead()); - Assert.assertEquals(reducedreadp.getRepresentativeCount(), REDUCED_READ_COUNTS[0]); - Assert.assertEquals(reducedreadp.getQual(), readp.getQual()); - } - - @Test - public void testGetOriginalAlignments() { - final byte [] bases = {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'}; - final byte [] quals = {20 , 20 , 20 , 20 , 20 , 20 , 20 , 20 }; - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "6M"); - - // A regular read with all matches - Assert.assertEquals(read.getAlignmentStart(), read.getOriginalAlignmentStart()); - Assert.assertEquals(read.getAlignmentEnd(), read.getOriginalAlignmentEnd()); - - // Alignment start shifted - int alignmentShift = 2; - read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, alignmentShift); - Assert.assertEquals(read.getAlignmentStart() + alignmentShift, read.getOriginalAlignmentStart()); - 
Assert.assertEquals(read.getAlignmentEnd(), read.getOriginalAlignmentEnd()); - - // Both alignments shifted - read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, alignmentShift); - Assert.assertEquals(read.getAlignmentStart() + alignmentShift, read.getOriginalAlignmentStart()); - Assert.assertEquals(read.getAlignmentEnd() - alignmentShift, read.getOriginalAlignmentEnd()); - - // Alignment end shifted - read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, null); - Assert.assertEquals(read.getAlignmentStart(), read.getOriginalAlignmentStart()); - Assert.assertEquals(read.getAlignmentEnd() - alignmentShift, read.getOriginalAlignmentEnd()); - } - - @Test - public void testStrandlessReads() { - final byte [] bases = {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'}; - final byte [] quals = {20 , 20 , 20 , 20 , 20 , 20 , 20 , 20 }; - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "6M"); - Assert.assertEquals(read.isStrandless(), false); - - read.setReadNegativeStrandFlag(false); - Assert.assertEquals(read.isStrandless(), false); - Assert.assertEquals(read.getReadNegativeStrandFlag(), false); - - read.setReadNegativeStrandFlag(true); - Assert.assertEquals(read.isStrandless(), false); - Assert.assertEquals(read.getReadNegativeStrandFlag(), true); - - read.setReadNegativeStrandFlag(true); - read.setIsStrandless(true); - Assert.assertEquals(read.isStrandless(), true); - Assert.assertEquals(read.getReadNegativeStrandFlag(), false, "negative strand flag should return false even through its set for a strandless read"); - } - - @Test(expectedExceptions = IllegalStateException.class) - public void testStrandlessReadsFailSetStrand() { - final byte [] bases = {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'}; - final byte [] quals = {20 , 20 , 20 , 20 , 20 , 20 , 20 , 20 }; - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "6M"); - read.setIsStrandless(true); - read.setReadNegativeStrandFlag(true); 
- } - - @Test - public void testGetReducedCountsIsCorrect() { - reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS); - final int[] counts = reducedRead.getReducedReadCounts(); - Assert.assertNotSame(counts, reducedRead.getAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG)); - for ( int i = 0; i < counts.length; i++ ) - Assert.assertEquals(counts[i], reducedRead.getReducedCount(i), "Reduced counts vector not equal to getReducedCount(i) at " + i); - } - - @DataProvider(name = "ReducedReadCountConversionProvider") - public Object[][] ReducedReadCountConversionTestData() { - List tests = new ArrayList(); - - tests.add(new Object[]{new int[] {100, 100, 100, 101}, new byte[] {100, 0, 0, 1}}); - tests.add(new Object[]{new int[] {1, 100, 100, 0}, new byte[] {1, 99, 99, -1}}); - tests.add(new Object[]{new int[] {127, 100, 0, 1}, new byte[] {127, -27, -127, -126}}); - tests.add(new Object[]{new int[] {1, 127, 51, 126}, new byte[] {1, 126, 50, 125}}); - tests.add(new Object[]{new int[] {300, 127, 1, 255}, new byte[] {-1, -128, 2, 0}}); - tests.add(new Object[]{new int[] {1, 300, 51, 126}, new byte[] {1, -2, 50, 125}}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ReducedReadCountConversionProvider", enabled = true) - public void reducedReadCountConversionTest(final int[] counts, final byte[] expectedConversion) { - - reducedRead.setReducedReadCountsTag(counts); - final byte[] actualConversion = reducedRead.getByteArrayAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG); - for ( int i = 0; i < actualConversion.length; i++ ) - Assert.assertEquals(actualConversion[i], expectedConversion[i], "Conversion differs at position " + i + ": " + actualConversion[i] + " vs. 
" + expectedConversion[i]); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java deleted file mode 100644 index 7e085547f..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ /dev/null @@ -1,324 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils.sam; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; - - -public class ReadUtilsUnitTest extends BaseTest { - private interface GetAdaptorFunc { - public int getAdaptor(final GATKSAMRecord record); - } - - @DataProvider(name = "AdaptorGetter") - public Object[][] makeActiveRegionCutTests() { - final List tests = new LinkedList(); - - tests.add( new Object[]{ new GetAdaptorFunc() { - @Override public int getAdaptor(final GATKSAMRecord record) { return ReadUtils.getAdaptorBoundary(record); } - }}); - - tests.add( new Object[]{ new GetAdaptorFunc() { - @Override public int getAdaptor(final GATKSAMRecord record) { return record.getAdaptorBoundary(); } - }}); - - return tests.toArray(new Object[][]{}); - } - - private GATKSAMRecord makeRead(final int fragmentSize, final int mateStart) { - final byte[] bases = {'A', 'C', 'G', 'T', 'A', 'C', 'G', 'T'}; - final byte[] quals = {30, 30, 30, 30, 30, 30, 30, 30}; - final String cigar = "8M"; - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, cigar); - read.setProperPairFlag(true); - read.setReadPairedFlag(true); - read.setMateAlignmentStart(mateStart); - read.setInferredInsertSize(fragmentSize); - return read; - } - - @Test(dataProvider = "AdaptorGetter") - public void testGetAdaptorBoundary(final GetAdaptorFunc get) { - final int fragmentSize = 10; - final int mateStart = 1000; - final int BEFORE = mateStart - 2; - final int AFTER = mateStart + 
2; - int myStart, boundary; - GATKSAMRecord read; - - // Test case 1: positive strand, first read - read = makeRead(fragmentSize, mateStart); - myStart = BEFORE; - read.setAlignmentStart(myStart); - read.setReadNegativeStrandFlag(false); - read.setMateNegativeStrandFlag(true); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, myStart + fragmentSize + 1); - - // Test case 2: positive strand, second read - read = makeRead(fragmentSize, mateStart); - myStart = AFTER; - read.setAlignmentStart(myStart); - read.setReadNegativeStrandFlag(false); - read.setMateNegativeStrandFlag(true); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, myStart + fragmentSize + 1); - - // Test case 3: negative strand, second read - read = makeRead(fragmentSize, mateStart); - myStart = AFTER; - read.setAlignmentStart(myStart); - read.setReadNegativeStrandFlag(true); - read.setMateNegativeStrandFlag(false); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, mateStart - 1); - - // Test case 4: negative strand, first read - read = makeRead(fragmentSize, mateStart); - myStart = BEFORE; - read.setAlignmentStart(myStart); - read.setReadNegativeStrandFlag(true); - read.setMateNegativeStrandFlag(false); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, mateStart - 1); - - // Test case 5: mate is mapped to another chromosome (test both strands) - read = makeRead(fragmentSize, mateStart); - read.setInferredInsertSize(0); - read.setReadNegativeStrandFlag(true); - read.setMateNegativeStrandFlag(false); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); - read.setReadNegativeStrandFlag(false); - read.setMateNegativeStrandFlag(true); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); - read.setInferredInsertSize(10); - - // Test case 6: read is unmapped - read = makeRead(fragmentSize, mateStart); - 
read.setReadUnmappedFlag(true); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); - read.setReadUnmappedFlag(false); - - // Test case 7: reads don't overlap and look like this: - // <--------| - // |------> - // first read: - read = makeRead(fragmentSize, mateStart); - myStart = 980; - read.setAlignmentStart(myStart); - read.setInferredInsertSize(20); - read.setReadNegativeStrandFlag(true); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); - - // second read: - read = makeRead(fragmentSize, mateStart); - myStart = 1000; - read.setAlignmentStart(myStart); - read.setInferredInsertSize(20); - read.setMateAlignmentStart(980); - read.setReadNegativeStrandFlag(false); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); - - // Test case 8: read doesn't have proper pair flag set - read = makeRead(fragmentSize, mateStart); - read.setReadPairedFlag(true); - read.setProperPairFlag(false); - Assert.assertEquals(get.getAdaptor(read), ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); - - // Test case 9: read and mate have same negative flag setting - for ( final boolean negFlag: Arrays.asList(true, false) ) { - read = makeRead(fragmentSize, mateStart); - read.setAlignmentStart(BEFORE); - read.setReadPairedFlag(true); - read.setProperPairFlag(true); - read.setReadNegativeStrandFlag(negFlag); - read.setMateNegativeStrandFlag(!negFlag); - Assert.assertTrue(get.getAdaptor(read) != ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY, "Get adaptor should have succeeded"); - - read = makeRead(fragmentSize, mateStart); - read.setAlignmentStart(BEFORE); - read.setReadPairedFlag(true); - read.setProperPairFlag(true); - read.setReadNegativeStrandFlag(negFlag); - read.setMateNegativeStrandFlag(negFlag); - Assert.assertEquals(get.getAdaptor(read), ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY, "Get adaptor should have failed for 
reads with bad alignment orientation"); - } - } - - @Test (enabled = true) - public void testGetBasesReverseComplement() { - int iterations = 1000; - Random random = GenomeAnalysisEngine.getRandomGenerator(); - while(iterations-- > 0) { - final int l = random.nextInt(1000); - GATKSAMRecord read = GATKSAMRecord.createRandomRead(l); - byte [] original = read.getReadBases(); - byte [] reconverted = new byte[l]; - String revComp = ReadUtils.getBasesReverseComplement(read); - for (int i=0; i reads = new ArrayList(); - for( int readLength = minLength; readLength <= maxLength; readLength++ ) { - reads.add( ReadUtils.createRandomRead( readLength ) ); - } - Assert.assertEquals(ReadUtils.getMaxReadLength(reads), maxLength, "max length does not match"); - } - } - - final List reads = new LinkedList(); - Assert.assertEquals(ReadUtils.getMaxReadLength(reads), 0, "Empty list should have max length of zero"); - } - - @Test (enabled = true) - public void testReadWithNs() throws FileNotFoundException { - - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); - final int readLength = 76; - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 8975, readLength); - read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); - read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setCigarString("3M414N1D73M"); - - final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9392, ReadUtils.ClippingTail.LEFT_TAIL); - Assert.assertEquals(result, 3); - } - - @DataProvider(name = "HasWellDefinedFragmentSizeData") - public Object[][] makeHasWellDefinedFragmentSizeData() throws Exception { - final List tests = new LinkedList(); - - // setup a basic read that will work - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(); - final GATKSAMRecord 
read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, 10); - read.setReadPairedFlag(true); - read.setProperPairFlag(true); - read.setReadUnmappedFlag(false); - read.setMateUnmappedFlag(false); - read.setAlignmentStart(100); - read.setCigarString("50M"); - read.setMateAlignmentStart(130); - read.setInferredInsertSize(80); - read.setFirstOfPairFlag(true); - read.setReadNegativeStrandFlag(false); - read.setMateNegativeStrandFlag(true); - - tests.add( new Object[]{ "basic case", read.clone(), true }); - - { - final GATKSAMRecord bad1 = (GATKSAMRecord)read.clone(); - bad1.setReadPairedFlag(false); - tests.add( new Object[]{ "not paired", bad1, false }); - } - - { - final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); - bad.setProperPairFlag(false); - // we currently don't require the proper pair flag to be set - tests.add( new Object[]{ "not proper pair", bad, true }); -// tests.add( new Object[]{ "not proper pair", bad, false }); - } - - { - final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); - bad.setReadUnmappedFlag(true); - tests.add( new Object[]{ "read is unmapped", bad, false }); - } - - { - final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); - bad.setMateUnmappedFlag(true); - tests.add( new Object[]{ "mate is unmapped", bad, false }); - } - - { - final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); - bad.setMateNegativeStrandFlag(false); - tests.add( new Object[]{ "read and mate both on positive strand", bad, false }); - } - - { - final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); - bad.setReadNegativeStrandFlag(true); - tests.add( new Object[]{ "read and mate both on negative strand", bad, false }); - } - - { - final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); - bad.setInferredInsertSize(0); - tests.add( new Object[]{ "insert size is 0", bad, false }); - } - - { - final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); - bad.setAlignmentStart(1000); - tests.add( new Object[]{ "positve read starts after mate end", bad, false 
}); - } - - { - final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); - bad.setReadNegativeStrandFlag(true); - bad.setMateNegativeStrandFlag(false); - bad.setMateAlignmentStart(1000); - tests.add( new Object[]{ "negative strand read ends before mate starts", bad, false }); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "HasWellDefinedFragmentSizeData") - private void testHasWellDefinedFragmentSize(final String name, final GATKSAMRecord read, final boolean expected) { - Assert.assertEquals(ReadUtils.hasWellDefinedFragmentSize(read), expected); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java deleted file mode 100644 index 30f112241..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java +++ /dev/null @@ -1,1431 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.variant; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -public class GATKVariantContextUtilsUnitTest extends BaseTest { - private final static boolean DEBUG = false; - - Allele Aref, T, C, G, Cref, ATC, ATCATC; - - @BeforeSuite - public void setup() { - // alleles - Aref = Allele.create("A", true); - Cref = Allele.create("C", true); - T = Allele.create("T"); - C = Allele.create("C"); - G = Allele.create("G"); - ATC = Allele.create("ATC"); - ATCATC = Allele.create("ATCATC"); - } - - private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, int... pls) { - return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).log10PError(log10pError).PL(pls).make(); - } - - - private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError) { - return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).log10PError(log10pError).make(); - } - - private VariantContext makeVC(String source, List alleles) { - return makeVC(source, alleles, null, null); - } - - private VariantContext makeVC(String source, List alleles, Genotype... 
g1) { - return makeVC(source, alleles, Arrays.asList(g1)); - } - - private VariantContext makeVC(String source, List alleles, String filter) { - return makeVC(source, alleles, filter.equals(".") ? null : new HashSet(Arrays.asList(filter))); - } - - private VariantContext makeVC(String source, List alleles, Set filters) { - return makeVC(source, alleles, null, filters); - } - - private VariantContext makeVC(String source, List alleles, Collection genotypes) { - return makeVC(source, alleles, genotypes, null); - } - - private VariantContext makeVC(String source, List alleles, Collection genotypes, Set filters) { - int start = 10; - int stop = start; // alleles.contains(ATC) ? start + 3 : start; - return new VariantContextBuilder(source, "1", start, stop, alleles).genotypes(genotypes).filters(filters).make(); - } - - // -------------------------------------------------------------------------------- - // - // Test allele merging - // - // -------------------------------------------------------------------------------- - - private class MergeAllelesTest extends TestDataProvider { - List> inputs; - List expected; - - private MergeAllelesTest(List... 
arg) { - super(MergeAllelesTest.class); - LinkedList> all = new LinkedList<>(Arrays.asList(arg)); - expected = all.pollLast(); - inputs = all; - } - - public String toString() { - return String.format("MergeAllelesTest input=%s expected=%s", inputs, expected); - } - } - @DataProvider(name = "mergeAlleles") - public Object[][] mergeAllelesData() { - // first, do no harm - new MergeAllelesTest(Arrays.asList(Aref), - Arrays.asList(Aref)); - - new MergeAllelesTest(Arrays.asList(Aref), - Arrays.asList(Aref), - Arrays.asList(Aref)); - - new MergeAllelesTest(Arrays.asList(Aref), - Arrays.asList(Aref, T), - Arrays.asList(Aref, T)); - - new MergeAllelesTest(Arrays.asList(Aref, C), - Arrays.asList(Aref, T), - Arrays.asList(Aref, C, T)); - - new MergeAllelesTest(Arrays.asList(Aref, T), - Arrays.asList(Aref, C), - Arrays.asList(Aref, T, C)); // in order of appearence - - new MergeAllelesTest(Arrays.asList(Aref, C, T), - Arrays.asList(Aref, C), - Arrays.asList(Aref, C, T)); - - new MergeAllelesTest(Arrays.asList(Aref, C, T), Arrays.asList(Aref, C, T)); - new MergeAllelesTest(Arrays.asList(Aref, T, C), Arrays.asList(Aref, T, C)); - - new MergeAllelesTest(Arrays.asList(Aref, T, C), - Arrays.asList(Aref, C), - Arrays.asList(Aref, T, C)); // in order of appearence - - new MergeAllelesTest(Arrays.asList(Aref), - Arrays.asList(Aref, ATC), - Arrays.asList(Aref, ATC)); - - new MergeAllelesTest(Arrays.asList(Aref), - Arrays.asList(Aref, ATC, ATCATC), - Arrays.asList(Aref, ATC, ATCATC)); - - // alleles in the order we see them - new MergeAllelesTest(Arrays.asList(Aref, ATCATC), - Arrays.asList(Aref, ATC, ATCATC), - Arrays.asList(Aref, ATCATC, ATC)); - - // same - new MergeAllelesTest(Arrays.asList(Aref, ATC), - Arrays.asList(Aref, ATCATC), - Arrays.asList(Aref, ATC, ATCATC)); - - return MergeAllelesTest.getTests(MergeAllelesTest.class); - } - - @Test(enabled = !DEBUG, dataProvider = "mergeAlleles") - public void testMergeAlleles(MergeAllelesTest cfg) { - final List inputs = new 
ArrayList(); - - int i = 0; - for ( final List alleles : cfg.inputs ) { - final String name = "vcf" + ++i; - inputs.add(makeVC(name, alleles)); - } - - final List priority = vcs2priority(inputs); - - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - inputs, priority, - GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false, false); - - Assert.assertEquals(merged.getAlleles(), cfg.expected); - } - - // -------------------------------------------------------------------------------- - // - // Test rsID merging - // - // -------------------------------------------------------------------------------- - - private class SimpleMergeRSIDTest extends TestDataProvider { - List inputs; - String expected; - - private SimpleMergeRSIDTest(String... arg) { - super(SimpleMergeRSIDTest.class); - LinkedList allStrings = new LinkedList(Arrays.asList(arg)); - expected = allStrings.pollLast(); - inputs = allStrings; - } - - public String toString() { - return String.format("SimpleMergeRSIDTest vc=%s expected=%s", inputs, expected); - } - } - - @DataProvider(name = "simplemergersiddata") - public Object[][] createSimpleMergeRSIDData() { - new SimpleMergeRSIDTest(".", "."); - new SimpleMergeRSIDTest(".", ".", "."); - new SimpleMergeRSIDTest("rs1", "rs1"); - new SimpleMergeRSIDTest("rs1", "rs1", "rs1"); - new SimpleMergeRSIDTest(".", "rs1", "rs1"); - new SimpleMergeRSIDTest("rs1", ".", "rs1"); - new SimpleMergeRSIDTest("rs1", "rs2", "rs1,rs2"); - new SimpleMergeRSIDTest("rs1", "rs2", "rs1", "rs1,rs2"); // duplicates - new SimpleMergeRSIDTest("rs2", "rs1", "rs2,rs1"); - new SimpleMergeRSIDTest("rs2", "rs1", ".", "rs2,rs1"); - new SimpleMergeRSIDTest("rs2", ".", "rs1", "rs2,rs1"); - new SimpleMergeRSIDTest("rs1", ".", ".", "rs1"); - new SimpleMergeRSIDTest("rs1", "rs2", "rs3", "rs1,rs2,rs3"); - - return SimpleMergeRSIDTest.getTests(SimpleMergeRSIDTest.class); - } - 
- @Test(enabled = !DEBUG, dataProvider = "simplemergersiddata") - public void testRSIDMerge(SimpleMergeRSIDTest cfg) { - VariantContext snpVC1 = makeVC("snpvc1", Arrays.asList(Aref, T)); - final List inputs = new ArrayList(); - - for ( final String id : cfg.inputs ) { - inputs.add(new VariantContextBuilder(snpVC1).id(id).make()); - } - - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - inputs, null, - GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false, false); - Assert.assertEquals(merged.getID(), cfg.expected); - } - - // -------------------------------------------------------------------------------- - // - // Test filtered merging - // - // -------------------------------------------------------------------------------- - - private class MergeFilteredTest extends TestDataProvider { - List inputs; - VariantContext expected; - String setExpected; - GATKVariantContextUtils.FilteredRecordMergeType type; - - - private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, String setExpected) { - this(name, input1, input2, expected, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, setExpected); - } - - private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, GATKVariantContextUtils.FilteredRecordMergeType type, String setExpected) { - super(MergeFilteredTest.class, name); - LinkedList all = new LinkedList(Arrays.asList(input1, input2)); - this.expected = expected; - this.type = type; - inputs = all; - this.setExpected = setExpected; - } - - public String toString() { - return String.format("%s input=%s expected=%s", super.toString(), inputs, expected); - } - } - - @DataProvider(name = "mergeFiltered") - public Object[][] mergeFilteredData() { - new MergeFilteredTest("AllPass", - makeVC("1", Arrays.asList(Aref, T), 
VariantContext.PASSES_FILTERS), - makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - GATKVariantContextUtils.MERGE_INTERSECTION); - - new MergeFilteredTest("noFilters", - makeVC("1", Arrays.asList(Aref, T), "."), - makeVC("2", Arrays.asList(Aref, T), "."), - makeVC("3", Arrays.asList(Aref, T), "."), - GATKVariantContextUtils.MERGE_INTERSECTION); - - new MergeFilteredTest("oneFiltered", - makeVC("1", Arrays.asList(Aref, T), "."), - makeVC("2", Arrays.asList(Aref, T), "FAIL"), - makeVC("3", Arrays.asList(Aref, T), "."), - String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); - - new MergeFilteredTest("onePassOneFail", - makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - makeVC("2", Arrays.asList(Aref, T), "FAIL"), - makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); - - new MergeFilteredTest("AllFiltered", - makeVC("1", Arrays.asList(Aref, T), "FAIL"), - makeVC("2", Arrays.asList(Aref, T), "FAIL"), - makeVC("3", Arrays.asList(Aref, T), "FAIL"), - GATKVariantContextUtils.MERGE_FILTER_IN_ALL); - - // test ALL vs. 
ANY - new MergeFilteredTest("FailOneUnfiltered", - makeVC("1", Arrays.asList(Aref, T), "FAIL"), - makeVC("2", Arrays.asList(Aref, T), "."), - makeVC("3", Arrays.asList(Aref, T), "."), - GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - String.format("%s1-2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); - - new MergeFilteredTest("OneFailAllUnfilteredArg", - makeVC("1", Arrays.asList(Aref, T), "FAIL"), - makeVC("2", Arrays.asList(Aref, T), "."), - makeVC("3", Arrays.asList(Aref, T), "FAIL"), - GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ALL_UNFILTERED, - String.format("%s1-2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); - - // test excluding allele in filtered record - new MergeFilteredTest("DontIncludeAlleleOfFilteredRecords", - makeVC("1", Arrays.asList(Aref, T), "."), - makeVC("2", Arrays.asList(Aref, T), "FAIL"), - makeVC("3", Arrays.asList(Aref, T), "."), - String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); - - // promotion of site from unfiltered to PASSES - new MergeFilteredTest("UnfilteredPlusPassIsPass", - makeVC("1", Arrays.asList(Aref, T), "."), - makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - GATKVariantContextUtils.MERGE_INTERSECTION); - - new MergeFilteredTest("RefInAll", - makeVC("1", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), - makeVC("2", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), - makeVC("3", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), - GATKVariantContextUtils.MERGE_REF_IN_ALL); - - new MergeFilteredTest("RefInOne", - makeVC("1", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), - makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - "2"); - - return MergeFilteredTest.getTests(MergeFilteredTest.class); - } - - @Test(enabled = !DEBUG, dataProvider = "mergeFiltered") - public void 
testMergeFiltered(MergeFilteredTest cfg) { - final List priority = vcs2priority(cfg.inputs); - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false, false); - - // test alleles are equal - Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); - - // test set field - Assert.assertEquals(merged.getAttribute("set"), cfg.setExpected); - - // test filter field - Assert.assertEquals(merged.getFilters(), cfg.expected.getFilters()); - } - - // -------------------------------------------------------------------------------- - // - // Test genotype merging - // - // -------------------------------------------------------------------------------- - - private class MergeGenotypesTest extends TestDataProvider { - List inputs; - VariantContext expected; - List priority; - - private MergeGenotypesTest(String name, String priority, VariantContext... arg) { - super(MergeGenotypesTest.class, name); - LinkedList all = new LinkedList(Arrays.asList(arg)); - this.expected = all.pollLast(); - inputs = all; - this.priority = Arrays.asList(priority.split(",")); - } - - public String toString() { - return String.format("%s input=%s expected=%s", super.toString(), inputs, expected); - } - } - - @DataProvider(name = "mergeGenotypes") - public Object[][] mergeGenotypesData() { - new MergeGenotypesTest("TakeGenotypeByPriority-1,2", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1))); - - new MergeGenotypesTest("TakeGenotypeByPriority-1,2-nocall", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1))); - - new 
MergeGenotypesTest("TakeGenotypeByPriority-2,1", "2,1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2))); - - new MergeGenotypesTest("NonOverlappingGenotypes", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, -2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1), makeG("s2", Aref, T, -2))); - - new MergeGenotypesTest("PreserveNoCall", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, -2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1), makeG("s2", Aref, T, -2))); - - new MergeGenotypesTest("PerserveAlleles", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", Arrays.asList(Aref, C), makeG("s2", Aref, C, -2)), - makeVC("3", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1), makeG("s2", Aref, C, -2))); - - new MergeGenotypesTest("TakeGenotypePartialOverlap-1,2", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1), makeG("s3", Aref, T, -3))); - - new MergeGenotypesTest("TakeGenotypePartialOverlap-2,1", "2,1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3))); - - // - // merging genothpes with PLs - // - - // first, do no harm - new MergeGenotypesTest("OrderedPLs", "1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1, 1, 2, 3)), - makeVC("1", Arrays.asList(Aref, T), 
makeG("s1", Aref, T, -1, 1, 2, 3))); - - // first, do no harm - new MergeGenotypesTest("OrderedPLs-3Alleles", "1", - makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6))); - - // first, do no harm - new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6))); - - // first, do no harm - new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, T, C), makeG("s2", Aref, C, -1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6), makeG("s2", Aref, C, -1, 1, 2, 3, 4, 5, 6))); - - new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-2,1", "2,1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1,5,0,3)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2))); - - new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-1,2", "1,2", - makeVC("1", Arrays.asList(Aref,ATC), makeG("s1", Aref, ATC, -1,5,0,3)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2)), - // no likelihoods on result since type changes to mixed multiallelic - makeVC("3", Arrays.asList(Aref, ATC, T), makeG("s1", Aref, ATC, -1), makeG("s3", Aref, T, -3))); - - new MergeGenotypesTest("MultipleSamplePLsDifferentOrder", "1,2", - makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, -1, 1, 2, 3, 4, 5, 6)), - makeVC("2", Arrays.asList(Aref, T, C), makeG("s2", Aref, T, -2, 6, 5, 4, 3, 2, 1)), - // no likelihoods on result since type changes to mixed 
multiallelic - makeVC("3", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, -1), makeG("s2", Aref, T, -2))); - - return MergeGenotypesTest.getTests(MergeGenotypesTest.class); - } - - @Test(enabled = !DEBUG, dataProvider = "mergeGenotypes") - public void testMergeGenotypes(MergeGenotypesTest cfg) { - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - cfg.inputs, cfg.priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false, false); - - // test alleles are equal - Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); - - // test genotypes - assertGenotypesAreMostlyEqual(merged.getGenotypes(), cfg.expected.getGenotypes()); - } - - // necessary to not overload equals for genotypes - private void assertGenotypesAreMostlyEqual(GenotypesContext actual, GenotypesContext expected) { - if (actual == expected) { - return; - } - - if (actual == null || expected == null) { - Assert.fail("Maps not equal: expected: " + expected + " and actual: " + actual); - } - - if (actual.size() != expected.size()) { - Assert.fail("Maps do not have the same size:" + actual.size() + " != " + expected.size()); - } - - for (Genotype value : actual) { - Genotype expectedValue = expected.get(value.getSampleName()); - - Assert.assertEquals(value.getAlleles(), expectedValue.getAlleles(), "Alleles in Genotype aren't equal"); - Assert.assertEquals(value.getGQ(), expectedValue.getGQ(), "GQ values aren't equal"); - Assert.assertEquals(value.hasLikelihoods(), expectedValue.hasLikelihoods(), "Either both have likelihoods or both not"); - if ( value.hasLikelihoods() ) - Assert.assertEquals(value.getLikelihoods().getAsVector(), expectedValue.getLikelihoods().getAsVector(), "Genotype likelihoods aren't equal"); - } - } - - @Test(enabled = !DEBUG) - public void testMergeGenotypesUniquify() { - final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", 
Aref, T, -1)); - final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); - - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - Arrays.asList(vc1, vc2), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false, false); - - // test genotypes - Assert.assertEquals(merged.getSampleNames(), new HashSet<>(Arrays.asList("s1.1", "s1.2"))); - } - -// TODO: remove after testing -// @Test(expectedExceptions = IllegalStateException.class) -// public void testMergeGenotypesRequireUnique() { -// final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); -// final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); -// -// final VariantContext merged = VariantContextUtils.simpleMerge( -// Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, -// VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false, false); -// } - - // -------------------------------------------------------------------------------- - // - // Misc. 
tests - // - // -------------------------------------------------------------------------------- - - @Test(enabled = !DEBUG) - public void testAnnotationSet() { - for ( final boolean annotate : Arrays.asList(true, false)) { - for ( final String set : Arrays.asList("set", "combine", "x")) { - final List priority = Arrays.asList("1", "2"); - VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS); - VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS); - - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - Arrays.asList(vc1, vc2), priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false, false); - - if ( annotate ) - Assert.assertEquals(merged.getAttribute(set), GATKVariantContextUtils.MERGE_INTERSECTION); - else - Assert.assertFalse(merged.hasAttribute(set)); - } - } - } - - private static final List vcs2priority(final Collection vcs) { - final List priority = new ArrayList<>(); - - for ( final VariantContext vc : vcs ) { - priority.add(vc.getSource()); - } - - return priority; - } - - // -------------------------------------------------------------------------------- - // - // basic allele clipping test - // - // -------------------------------------------------------------------------------- - - private class ReverseClippingPositionTestProvider extends TestDataProvider { - final String ref; - final List alleles = new ArrayList(); - final int expectedClip; - - private ReverseClippingPositionTestProvider(final int expectedClip, final String ref, final String... 
alleles) { - super(ReverseClippingPositionTestProvider.class); - this.ref = ref; - for ( final String allele : alleles ) - this.alleles.add(Allele.create(allele)); - this.expectedClip = expectedClip; - } - - @Override - public String toString() { - return String.format("ref=%s allele=%s reverse clip %d", ref, alleles, expectedClip); - } - } - - @DataProvider(name = "ReverseClippingPositionTestProvider") - public Object[][] makeReverseClippingPositionTestProvider() { - // pair clipping - new ReverseClippingPositionTestProvider(0, "ATT", "CCG"); - new ReverseClippingPositionTestProvider(1, "ATT", "CCT"); - new ReverseClippingPositionTestProvider(2, "ATT", "CTT"); - new ReverseClippingPositionTestProvider(2, "ATT", "ATT"); // cannot completely clip allele - - // triplets - new ReverseClippingPositionTestProvider(0, "ATT", "CTT", "CGG"); - new ReverseClippingPositionTestProvider(1, "ATT", "CTT", "CGT"); // the T can go - new ReverseClippingPositionTestProvider(2, "ATT", "CTT", "CTT"); // both Ts can go - - return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class); - } - - @Test(enabled = !DEBUG, dataProvider = "ReverseClippingPositionTestProvider") - public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) { - int result = GATKVariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes()); - Assert.assertEquals(result, cfg.expectedClip); - } - - - // -------------------------------------------------------------------------------- - // - // test splitting into bi-allelics - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "SplitBiallelics") - public Object[][] makeSplitBiallelics() throws CloneNotSupportedException { - List tests = new ArrayList(); - - final VariantContextBuilder root = new VariantContextBuilder("x", "20", 10, 10, Arrays.asList(Aref, C)); - - // biallelic -> biallelic - tests.add(new Object[]{root.make(), 
Arrays.asList(root.make())}); - - // monos -> monos - root.alleles(Arrays.asList(Aref)); - tests.add(new Object[]{root.make(), Arrays.asList(root.make())}); - - root.alleles(Arrays.asList(Aref, C, T)); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(Aref, C)).make(), - root.alleles(Arrays.asList(Aref, T)).make())}); - - root.alleles(Arrays.asList(Aref, C, T, G)); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(Aref, C)).make(), - root.alleles(Arrays.asList(Aref, T)).make(), - root.alleles(Arrays.asList(Aref, G)).make())}); - - final Allele C = Allele.create("C"); - final Allele CA = Allele.create("CA"); - final Allele CAA = Allele.create("CAA"); - final Allele CAAAA = Allele.create("CAAAA"); - final Allele CAAAAA = Allele.create("CAAAAA"); - final Allele Cref = Allele.create("C", true); - final Allele CAref = Allele.create("CA", true); - final Allele CAAref = Allele.create("CAA", true); - final Allele CAAAref = Allele.create("CAAA", true); - - root.alleles(Arrays.asList(Cref, CA, CAA)); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(Cref, CA)).make(), - root.alleles(Arrays.asList(Cref, CAA)).make())}); - - root.alleles(Arrays.asList(CAAref, C, CA)).stop(12); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(CAAref, C)).make(), - root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); - - root.alleles(Arrays.asList(CAAAref, C, CA, CAA)).stop(13); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(CAAAref, C)).make(), - root.alleles(Arrays.asList(CAAref, C)).stop(12).make(), - root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); - - root.alleles(Arrays.asList(CAAAref, CAAAAA, CAAAA, CAA, C)).stop(13); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(Cref, CAA)).stop(10).make(), - root.alleles(Arrays.asList(Cref, CA)).stop(10).make(), - 
root.alleles(Arrays.asList(CAref, C)).stop(11).make(), - root.alleles(Arrays.asList(CAAAref, C)).stop(13).make())}); - - final Allele threeCopies = Allele.create("GTTTTATTTTATTTTA", true); - final Allele twoCopies = Allele.create("GTTTTATTTTA", true); - final Allele zeroCopies = Allele.create("G", false); - final Allele oneCopies = Allele.create("GTTTTA", false); - tests.add(new Object[]{root.alleles(Arrays.asList(threeCopies, zeroCopies, oneCopies)).stop(25).make(), - Arrays.asList( - root.alleles(Arrays.asList(threeCopies, zeroCopies)).stop(25).make(), - root.alleles(Arrays.asList(twoCopies, zeroCopies)).stop(20).make())}); - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "SplitBiallelics") - public void testSplitBiallelicsNoGenotypes(final VariantContext vc, final List expectedBiallelics) { - final List biallelics = GATKVariantContextUtils.splitVariantContextToBiallelics(vc); - Assert.assertEquals(biallelics.size(), expectedBiallelics.size()); - for ( int i = 0; i < biallelics.size(); i++ ) { - final VariantContext actual = biallelics.get(i); - final VariantContext expected = expectedBiallelics.get(i); - assertVariantContextsAreEqual(actual, expected); - } - } - - @Test(enabled = !DEBUG, dataProvider = "SplitBiallelics", dependsOnMethods = "testSplitBiallelicsNoGenotypes") - public void testSplitBiallelicsGenotypes(final VariantContext vc, final List expectedBiallelics) { - final List genotypes = new ArrayList(); - - int sampleI = 0; - for ( final List alleles : Utils.makePermutations(vc.getAlleles(), 2, true) ) { - genotypes.add(GenotypeBuilder.create("sample" + sampleI++, alleles)); - } - genotypes.add(GenotypeBuilder.createMissing("missing", 2)); - - final VariantContext vcWithGenotypes = new VariantContextBuilder(vc).genotypes(genotypes).make(); - - final List biallelics = GATKVariantContextUtils.splitVariantContextToBiallelics(vcWithGenotypes); - for ( int i = 0; i < biallelics.size(); i++ ) { - final VariantContext 
actual = biallelics.get(i); - Assert.assertEquals(actual.getNSamples(), vcWithGenotypes.getNSamples()); // not dropping any samples - - for ( final Genotype inputGenotype : genotypes ) { - final Genotype actualGenotype = actual.getGenotype(inputGenotype.getSampleName()); - Assert.assertNotNull(actualGenotype); - if ( ! vc.isVariant() || vc.isBiallelic() ) - Assert.assertEquals(actualGenotype, vcWithGenotypes.getGenotype(inputGenotype.getSampleName())); - else - Assert.assertTrue(actualGenotype.isNoCall()); - } - } - } - - // -------------------------------------------------------------------------------- - // - // Test repeats - // - // -------------------------------------------------------------------------------- - - private class RepeatDetectorTest extends TestDataProvider { - String ref; - boolean isTrueRepeat; - VariantContext vc; - - private RepeatDetectorTest(boolean isTrueRepeat, String ref, String refAlleleString, String ... altAlleleStrings) { - super(RepeatDetectorTest.class); - this.isTrueRepeat = isTrueRepeat; - this.ref = ref; - - List alleles = new LinkedList(); - final Allele refAllele = Allele.create(refAlleleString, true); - alleles.add(refAllele); - for ( final String altString: altAlleleStrings) { - final Allele alt = Allele.create(altString, false); - alleles.add(alt); - } - - VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, refAllele.length(), alleles); - this.vc = builder.make(); - } - - public String toString() { - return String.format("%s refBases=%s trueRepeat=%b vc=%s", super.toString(), ref, isTrueRepeat, vc); - } - } - - @DataProvider(name = "RepeatDetectorTest") - public Object[][] makeRepeatDetectorTest() { - new RepeatDetectorTest(true, "NAAC", "N", "NA"); - new RepeatDetectorTest(true, "NAAC", "NA", "N"); - new RepeatDetectorTest(false, "NAAC", "NAA", "N"); - new RepeatDetectorTest(false, "NAAC", "N", "NC"); - new RepeatDetectorTest(false, "AAC", "A", "C"); - - // running out of ref bases => false - new 
RepeatDetectorTest(false, "NAAC", "N", "NCAGTA"); - - // complex repeats - new RepeatDetectorTest(true, "NATATATC", "N", "NAT"); - new RepeatDetectorTest(true, "NATATATC", "N", "NATA"); - new RepeatDetectorTest(true, "NATATATC", "N", "NATAT"); - new RepeatDetectorTest(true, "NATATATC", "NAT", "N"); - new RepeatDetectorTest(false, "NATATATC", "NATA", "N"); - new RepeatDetectorTest(false, "NATATATC", "NATAT", "N"); - - // multi-allelic - new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATAT"); - new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATA"); - new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATAT"); - new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATA"); // two As - new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NATC"); // false - new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NCC"); // false - new RepeatDetectorTest(false, "NATATATC", "NAT", "NATAT", "NCC"); // false - - return RepeatDetectorTest.getTests(RepeatDetectorTest.class); - } - - @Test(enabled = !DEBUG, dataProvider = "RepeatDetectorTest") - public void testRepeatDetectorTest(RepeatDetectorTest cfg) { - - // test alleles are equal - Assert.assertEquals(GATKVariantContextUtils.isTandemRepeat(cfg.vc, cfg.ref.getBytes()), cfg.isTrueRepeat); - } - - @Test(enabled = !DEBUG) - public void testRepeatAllele() { - Allele nullR = Allele.create("A", true); - Allele nullA = Allele.create("A", false); - Allele atc = Allele.create("AATC", false); - Allele atcatc = Allele.create("AATCATC", false); - Allele ccccR = Allele.create("ACCCC", true); - Allele cc = Allele.create("ACC", false); - Allele cccccc = Allele.create("ACCCCCC", false); - Allele gagaR = Allele.create("AGAGA", true); - Allele gagagaga = Allele.create("AGAGAGAGA", false); - - // - / ATC [ref] from 20-22 - String delLoc = "chr1"; - int delLocStart = 20; - int delLocStop = 22; - - // - [ref] / ATC from 20-20 - String insLoc = "chr1"; - int insLocStart = 20; - int insLocStop = 20; - - Pair,byte[]> result; - 
byte[] refBytes = "TATCATCATCGGA".getBytes(); - - Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("ATG".getBytes(), "ATGATGATGATG".getBytes(), true),4); - Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("G".getBytes(), "ATGATGATGATG".getBytes(), true),0); - Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("T".getBytes(), "T".getBytes(), true),1); - Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("AT".getBytes(), "ATGATGATCATG".getBytes(), true),1); - Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("CCC".getBytes(), "CCCCCCCC".getBytes(), true),2); - - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("ATG".getBytes()),3); - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("AAA".getBytes()),1); - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CACACAC".getBytes()),7); - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CACACA".getBytes()),2); - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CATGCATG".getBytes()),4); - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("AATAATA".getBytes()),7); - - - // A*,ATC, context = ATC ATC ATC : (ATC)3 -> (ATC)4 - VariantContext vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(nullR,atc)).make(); - result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],3); - Assert.assertEquals(result.getFirst().toArray()[1],4); - Assert.assertEquals(result.getSecond().length,3); - - // ATC*,A,ATCATC - vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+3, Arrays.asList(Allele.create("AATC", true),nullA,atcatc)).make(); - result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],3); - Assert.assertEquals(result.getFirst().toArray()[1],2); - 
Assert.assertEquals(result.getFirst().toArray()[2],4); - Assert.assertEquals(result.getSecond().length,3); - - // simple non-tandem deletion: CCCC*, - - refBytes = "TCCCCCCCCATG".getBytes(); - vc = new VariantContextBuilder("foo", delLoc, 10, 14, Arrays.asList(ccccR,nullA)).make(); - result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],8); - Assert.assertEquals(result.getFirst().toArray()[1],4); - Assert.assertEquals(result.getSecond().length,1); - - // CCCC*,CC,-,CCCCCC, context = CCC: (C)7 -> (C)5,(C)3,(C)9 - refBytes = "TCCCCCCCAGAGAGAG".getBytes(); - vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+4, Arrays.asList(ccccR,cc, nullA,cccccc)).make(); - result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],7); - Assert.assertEquals(result.getFirst().toArray()[1],5); - Assert.assertEquals(result.getFirst().toArray()[2],3); - Assert.assertEquals(result.getFirst().toArray()[3],9); - Assert.assertEquals(result.getSecond().length,1); - - // GAGA*,-,GAGAGAGA - refBytes = "TGAGAGAGAGATTT".getBytes(); - vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+4, Arrays.asList(gagaR, nullA,gagagaga)).make(); - result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],5); - Assert.assertEquals(result.getFirst().toArray()[1],3); - Assert.assertEquals(result.getFirst().toArray()[2],7); - Assert.assertEquals(result.getSecond().length,2); - - } - - // -------------------------------------------------------------------------------- - // - // test forward clipping - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "ForwardClippingData") - public Object[][] makeForwardClippingData() { - List tests = new ArrayList(); - - // this functionality can be adapted to provide input 
data for whatever you might want in your data - tests.add(new Object[]{Arrays.asList("A"), -1}); - tests.add(new Object[]{Arrays.asList(""), -1}); - tests.add(new Object[]{Arrays.asList("A", "C"), -1}); - tests.add(new Object[]{Arrays.asList("AC", "C"), -1}); - tests.add(new Object[]{Arrays.asList("A", "G"), -1}); - tests.add(new Object[]{Arrays.asList("A", "T"), -1}); - tests.add(new Object[]{Arrays.asList("GT", "CA"), -1}); - tests.add(new Object[]{Arrays.asList("GT", "CT"), -1}); - tests.add(new Object[]{Arrays.asList("ACC", "AC"), 0}); - tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), 1}); - tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), 1}); - tests.add(new Object[]{Arrays.asList("ACGC", "ACGA"), 2}); - tests.add(new Object[]{Arrays.asList("ACGC", "AGC"), 0}); - tests.add(new Object[]{Arrays.asList("A", ""), -1}); - for ( int len = 0; len < 50; len++ ) - tests.add(new Object[]{Arrays.asList("A" + new String(Utils.dupBytes((byte)'C', len)), "C"), -1}); - - tests.add(new Object[]{Arrays.asList("A", "T", "C"), -1}); - tests.add(new Object[]{Arrays.asList("AT", "AC", "AG"), 0}); - tests.add(new Object[]{Arrays.asList("AT", "AC", "A"), -1}); - tests.add(new Object[]{Arrays.asList("AT", "AC", "ACG"), 0}); - tests.add(new Object[]{Arrays.asList("AC", "AC", "ACG"), 0}); - tests.add(new Object[]{Arrays.asList("AC", "ACT", "ACG"), 0}); - tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGTA"), 1}); - tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGCA"), 1}); - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "ForwardClippingData") - public void testForwardClipping(final List alleleStrings, final int expectedClip) { - final List alleles = new LinkedList(); - for ( final String alleleString : alleleStrings ) - alleles.add(Allele.create(alleleString)); - - for ( final List myAlleles : Utils.makePermutations(alleles, alleles.size(), false)) { - final int actual = 
GATKVariantContextUtils.computeForwardClipping(myAlleles); - Assert.assertEquals(actual, expectedClip); - } - } - - @DataProvider(name = "ClipAlleleTest") - public Object[][] makeClipAlleleTest() { - List tests = new ArrayList(); - - // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{Arrays.asList("ACC", "AC"), Arrays.asList("AC", "A"), 0}); - tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), Arrays.asList("GC", "G"), 2}); - tests.add(new Object[]{Arrays.asList("ACGC", "ACGA"), Arrays.asList("C", "A"), 3}); - tests.add(new Object[]{Arrays.asList("ACGC", "AGC"), Arrays.asList("AC", "A"), 0}); - tests.add(new Object[]{Arrays.asList("AT", "AC", "AG"), Arrays.asList("T", "C", "G"), 1}); - tests.add(new Object[]{Arrays.asList("AT", "AC", "ACG"), Arrays.asList("T", "C", "CG"), 1}); - tests.add(new Object[]{Arrays.asList("AC", "ACT", "ACG"), Arrays.asList("C", "CT", "CG"), 1}); - tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGTA"), Arrays.asList("G", "GT", "GTA"), 2}); - tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGCA"), Arrays.asList("G", "GT", "GCA"), 2}); - - // trims from left and right - tests.add(new Object[]{Arrays.asList("ACGTT", "ACCTT"), Arrays.asList("G", "C"), 2}); - tests.add(new Object[]{Arrays.asList("ACGTT", "ACCCTT"), Arrays.asList("G", "CC"), 2}); - tests.add(new Object[]{Arrays.asList("ACGTT", "ACGCTT"), Arrays.asList("G", "GC"), 2}); - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "ClipAlleleTest") - public void testClipAlleles(final List alleleStrings, final List expected, final int numLeftClipped) { - final int start = 10; - final VariantContext unclipped = GATKVariantContextUtils.makeFromAlleles("test", "20", start, alleleStrings); - final VariantContext clipped = GATKVariantContextUtils.trimAlleles(unclipped, true, true); - - Assert.assertEquals(clipped.getStart(), unclipped.getStart() + numLeftClipped); - for 
( int i = 0; i < unclipped.getAlleles().size(); i++ ) { - final Allele trimmed = clipped.getAlleles().get(i); - Assert.assertEquals(trimmed.getBaseString(), expected.get(i)); - } - } - - // -------------------------------------------------------------------------------- - // - // test primitive allele splitting - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "PrimitiveAlleleSplittingData") - public Object[][] makePrimitiveAlleleSplittingData() { - List tests = new ArrayList<>(); - - // no split - tests.add(new Object[]{"A", "C", 0, null}); - tests.add(new Object[]{"A", "AC", 0, null}); - tests.add(new Object[]{"AC", "A", 0, null}); - - // one split - tests.add(new Object[]{"ACA", "GCA", 1, Arrays.asList(0)}); - tests.add(new Object[]{"ACA", "AGA", 1, Arrays.asList(1)}); - tests.add(new Object[]{"ACA", "ACG", 1, Arrays.asList(2)}); - - // two splits - tests.add(new Object[]{"ACA", "GGA", 2, Arrays.asList(0, 1)}); - tests.add(new Object[]{"ACA", "GCG", 2, Arrays.asList(0, 2)}); - tests.add(new Object[]{"ACA", "AGG", 2, Arrays.asList(1, 2)}); - - // three splits - tests.add(new Object[]{"ACA", "GGG", 3, Arrays.asList(0, 1, 2)}); - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "PrimitiveAlleleSplittingData") - public void testPrimitiveAlleleSplitting(final String ref, final String alt, final int expectedSplit, final List variantPositions) { - - final int start = 10; - final VariantContext vc = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, alt)); - - final List result = GATKVariantContextUtils.splitIntoPrimitiveAlleles(vc); - - if ( expectedSplit > 0 ) { - Assert.assertEquals(result.size(), expectedSplit); - for ( int i = 0; i < variantPositions.size(); i++ ) { - Assert.assertEquals(result.get(i).getStart(), start + variantPositions.get(i)); - } - } else { - Assert.assertEquals(result.size(), 1); - Assert.assertEquals(vc, 
result.get(0)); - } - } - - @Test(enabled = !DEBUG) - public void testFillInNonRefSymbolicAlleles() { - final int start = 10; - final String ref = "A"; - final String alt = "C"; - final VariantContext vcAlt = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, alt)); - final VariantContext vcRef = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, "<"+GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME+">")); - - List VCs = Arrays.asList(vcAlt, vcRef); - VCs = GATKVariantContextUtils.fillInNonRefSymbolicAlleles(VCs, Collections.emptyList()); - - // make sure the non ref symbolic alleles have all been filled in with the appropriate alternate allele - for( final VariantContext vc : VCs ) { - Assert.assertTrue(vc.getAlternateAlleles().size() == 1); - Assert.assertTrue(vc.getAlternateAllele(0).isNonReference()); - Assert.assertTrue(!vc.getReference().isSymbolic()); - Assert.assertTrue(!vc.getAlternateAllele(0).isSymbolic()); - } - } - - // -------------------------------------------------------------------------------- - // - // test allele remapping - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "AlleleRemappingData") - public Object[][] makeAlleleRemappingData() { - List tests = new ArrayList<>(); - - final Allele originalBase1 = Allele.create((byte)'A'); - final Allele originalBase2 = Allele.create((byte)'T'); - - for ( final byte base1 : BaseUtils.BASES ) { - for ( final byte base2 : BaseUtils.BASES ) { - for ( final int numGenotypes : Arrays.asList(0, 1, 2, 5) ) { - Map map = new HashMap<>(2); - map.put(originalBase1, Allele.create(base1)); - map.put(originalBase2, Allele.create(base2)); - - tests.add(new Object[]{map, numGenotypes}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "AlleleRemappingData") - public void testAlleleRemapping(final Map alleleMap, final int numGenotypes) { - 
- final GATKVariantContextUtils.AlleleMapper alleleMapper = new GATKVariantContextUtils.AlleleMapper(alleleMap); - - final GenotypesContext originalGC = createGenotypesContext(numGenotypes, new ArrayList(alleleMap.keySet())); - - final GenotypesContext remappedGC = GATKVariantContextUtils.updateGenotypesWithMappedAlleles(originalGC, alleleMapper); - - for ( int i = 0; i < numGenotypes; i++ ) { - final Genotype originalG = originalGC.get(String.format("%d", i)); - final Genotype remappedG = remappedGC.get(String.format("%d", i)); - - Assert.assertEquals(originalG.getAlleles().size(), remappedG.getAlleles().size()); - for ( int j = 0; j < originalG.getAlleles().size(); j++ ) - Assert.assertEquals(remappedG.getAllele(j), alleleMap.get(originalG.getAllele(j))); - } - } - - private static GenotypesContext createGenotypesContext(final int numGenotypes, final List alleles) { - GenomeAnalysisEngine.resetRandomGenerator(); - final Random random = GenomeAnalysisEngine.getRandomGenerator(); - - final GenotypesContext gc = GenotypesContext.create(); - for ( int i = 0; i < numGenotypes; i++ ) { - // choose alleles at random - final List myAlleles = new ArrayList(); - myAlleles.add(alleles.get(random.nextInt(2))); - myAlleles.add(alleles.get(random.nextInt(2))); - - final Genotype g = new GenotypeBuilder(String.format("%d", i)).alleles(myAlleles).make(); - gc.add(g); - } - - return gc; - } - - // -------------------------------------------------------------------------------- - // - // Test subsetDiploidAlleles - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "subsetDiploidAllelesData") - public Object[][] makesubsetDiploidAllelesData() { - List tests = new ArrayList<>(); - - final Allele A = Allele.create("A", true); - final Allele C = Allele.create("C"); - final Allele G = Allele.create("G"); - - final List AA = Arrays.asList(A,A); - final List AC = Arrays.asList(A,C); - final List CC = Arrays.asList(C,C); - 
final List AG = Arrays.asList(A,G); - final List CG = Arrays.asList(C,G); - final List GG = Arrays.asList(G,G); - final List ACG = Arrays.asList(A,C,G); - - final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make(); - - final double[] homRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.09, 0.01}); - final double[] hetPL = MathUtils.normalizeFromRealSpace(new double[]{0.09, 0.9, 0.01}); - final double[] homVarPL = MathUtils.normalizeFromRealSpace(new double[]{0.01, 0.09, 0.9}); - final double[] uninformative = new double[]{0, 0, 0}; - - final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(50).make(); - - // make sure we don't screw up the simple case - final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); - final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); - final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make(); - - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), AC, Arrays.asList(new GenotypeBuilder(aaGT).noAD().make())}); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), AC, Arrays.asList(new GenotypeBuilder(acGT).noAD().make())}); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(ccGT).make(), AC, Arrays.asList(new GenotypeBuilder(ccGT).noAD().make())}); - - // uninformative test case - final Genotype uninformativeGT = new GenotypeBuilder(base).alleles(CC).noAD().PL(uninformative).GQ(0).make(); - final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noAD().noPL().noGQ().make(); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), AC, Arrays.asList(emptyGT)}); - - // actually subsetting down from multiple alt values - final double[] homRef3AllelesPL = new double[]{0, -10, -20, -30, -40, 
-50}; - final double[] hetRefC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50}; - final double[] homC3AllelesPL = new double[]{-20, -10, 0, -30, -40, -50}; - final double[] hetRefG3AllelesPL = new double[]{-20, -10, -30, 0, -40, -50}; - final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG - final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(homRef3AllelesPL).make()).make(), - AC, - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).noAD().GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(hetRefC3AllelesPL).make()).make(), - AC, - Arrays.asList(new GenotypeBuilder(base).alleles(AC).PL(new double[]{-10, 0, -20}).noAD().GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(homC3AllelesPL).make()).make(), - AC, - Arrays.asList(new GenotypeBuilder(base).alleles(CC).PL(new double[]{-20, -10, 0}).noAD().GQ(100).make())}); - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(hetRefG3AllelesPL).make()).make(), - AG, - Arrays.asList(new GenotypeBuilder(base).alleles(AG).PL(new double[]{-20, 0, -50}).noAD().GQ(200).make())}); - - // wow, scary -- bad output but discussed with Eric and we think this is the only thing that can be done - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(hetCG3AllelesPL).make()).make(), - AG, - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).noAD().GQ(200).make())}); - - tests.add(new Object[]{ 
- new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(homG3AllelesPL).make()).make(), - AG, - Arrays.asList(new GenotypeBuilder(base).alleles(GG).PL(new double[]{-20, -40, 0}).noAD().GQ(200).make())}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "subsetDiploidAllelesData") - public void testsubsetDiploidAllelesData(final VariantContext inputVC, - final List allelesToUse, - final List expectedGenotypes) { - final GenotypesContext actual = GATKVariantContextUtils.subsetDiploidAlleles(inputVC, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); - - Assert.assertEquals(actual.size(), expectedGenotypes.size()); - for ( final Genotype expected : expectedGenotypes ) { - final Genotype actualGT = actual.get(expected.getSampleName()); - Assert.assertNotNull(actualGT); - assertGenotypesAreEqual(actualGT, expected); - } - } - - @DataProvider(name = "UpdateGenotypeAfterSubsettingData") - public Object[][] makeUpdateGenotypeAfterSubsettingData() { - List tests = new ArrayList(); - - final Allele A = Allele.create("A", true); - final Allele C = Allele.create("C"); - final Allele G = Allele.create("G"); - - final List AA = Arrays.asList(A,A); - final List AC = Arrays.asList(A,C); - final List CC = Arrays.asList(C,C); - final List AG = Arrays.asList(A,G); - final List CG = Arrays.asList(C,G); - final List GG = Arrays.asList(G,G); - final List ACG = Arrays.asList(A,C,G); - final List> allSubsetAlleles = Arrays.asList(AC,AG,ACG); - - final double[] homRefPL = new double[]{0.9, 0.09, 0.01}; - final double[] hetPL = new double[]{0.09, 0.9, 0.01}; - final double[] homVarPL = new double[]{0.01, 0.09, 0.9}; - final double[] uninformative = new double[]{0.33, 0.33, 0.33}; - final List allPLs = Arrays.asList(homRefPL, hetPL, homVarPL, uninformative); - - for ( final List alleles : allSubsetAlleles ) { - for ( final double[] pls : allPLs ) { - tests.add(new 
Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL, pls, AA, alleles, GATKVariantContextUtils.NO_CALL_ALLELES}); - } - } - - for ( final List originalGT : Arrays.asList(AA, AC, CC, AG, CG, GG) ) { - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homRefPL, originalGT, AC, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, hetPL, originalGT, AC, AC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homVarPL, originalGT, AC, CC}); -// tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, uninformative, AA, AC, GATKVariantContextUtils.NO_CALL_ALLELES}); - } - - for ( final double[] pls : allPLs ) { - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, AC, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, AC, AC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, AC, CC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, AC, AC}); - - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, AG, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, AG, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, AG, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, AG, AG}); - - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, ACG, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, ACG, AC}); - tests.add(new 
Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, ACG, CC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AG, ACG, AG}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, ACG, CG}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, GG, ACG, GG}); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "UpdateGenotypeAfterSubsettingData") - public void testUpdateGenotypeAfterSubsetting(final GATKVariantContextUtils.GenotypeAssignmentMethod mode, - final double[] likelihoods, - final List originalGT, - final List allelesToUse, - final List expectedAlleles) { - final GenotypeBuilder gb = new GenotypeBuilder("test"); - final double[] log10Likelhoods = MathUtils.normalizeFromLog10(likelihoods, true, false); - GATKVariantContextUtils.updateGenotypeAfterSubsetting(originalGT, gb, mode, log10Likelhoods, allelesToUse); - final Genotype g = gb.make(); - Assert.assertEquals(new HashSet<>(g.getAlleles()), new HashSet<>(expectedAlleles)); - } - - @Test(enabled = !DEBUG) - public void testSubsetToRef() { - final Map tests = new LinkedHashMap<>(); - - for ( final List alleles : Arrays.asList(Arrays.asList(Aref), Arrays.asList(C), Arrays.asList(Aref, C), Arrays.asList(Aref, C, C) ) ) { - for ( final String name : Arrays.asList("test1", "test2") ) { - final GenotypeBuilder builder = new GenotypeBuilder(name, alleles); - builder.DP(10); - builder.GQ(30); - builder.AD(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? new int[]{1, 2} : new int[]{1, 2, 3})); - builder.PL(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? 
new int[]{1,2} : new int[]{1,2,3})); - final List refs = Collections.nCopies(alleles.size(), Aref); - tests.put(builder.make(), builder.alleles(refs).noAD().noPL().make()); - } - } - - for ( final int n : Arrays.asList(1, 2, 3) ) { - for ( final List genotypes : Utils.makePermutations(new ArrayList<>(tests.keySet()), n, false) ) { - final VariantContext vc = new VariantContextBuilder("test", "20", 1, 1, Arrays.asList(Aref, C)).genotypes(genotypes).make(); - final GenotypesContext gc = GATKVariantContextUtils.subsetToRefOnly(vc, 2); - - Assert.assertEquals(gc.size(), genotypes.size()); - for ( int i = 0; i < genotypes.size(); i++ ) { -// logger.warn("Testing " + genotypes.get(i) + " => " + gc.get(i) + " " + tests.get(genotypes.get(i))); - assertGenotypesAreEqual(gc.get(i), tests.get(genotypes.get(i))); - } - } - } - } - - // -------------------------------------------------------------------------------- - // - // Test updatePLsAndAD - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "updatePLsAndADData") - public Object[][] makeUpdatePLsAndADData() { - List tests = new ArrayList<>(); - - final Allele A = Allele.create("A", true); - final Allele C = Allele.create("C"); - final Allele G = Allele.create("G"); - - final List AA = Arrays.asList(A,A); - final List AC = Arrays.asList(A,C); - final List CC = Arrays.asList(C,C); - final List AG = Arrays.asList(A,G); - final List CG = Arrays.asList(C,G); - final List GG = Arrays.asList(G,G); - final List ACG = Arrays.asList(A,C,G); - - final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make(); - - final double[] homRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.09, 0.01}); - final double[] hetPL = MathUtils.normalizeFromRealSpace(new double[]{0.09, 0.9, 0.01}); - final double[] homVarPL = MathUtils.normalizeFromRealSpace(new double[]{0.01, 0.09, 0.9}); - final double[] uninformative = new double[]{0, 0, 0}; - - final 
Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(100).make(); - - // make sure we don't screw up the simple case where no selection happens - final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); - final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); - final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make(); - - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(aaGT).make())}); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(acGT).make())}); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(ccGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(ccGT).make())}); - - // uninformative test cases - final Genotype uninformativeGT = new GenotypeBuilder(base).alleles(CC).noAD().PL(uninformative).GQ(0).make(); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(uninformativeGT)}); - final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noAD().noPL().noGQ().make(); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(emptyGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(emptyGT)}); - - // actually subsetting down from multiple alt values - final double[] homRef3AllelesPL = new double[]{0, -10, -20, -30, -40, -50}; - final double[] hetRefC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50}; - final double[] homC3AllelesPL = new double[]{-20, -10, 0, -30, -40, -50}; - final double[] hetRefG3AllelesPL = new 
double[]{-20, -10, -30, 0, -40, -50}; - final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG - final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG - - final int[] homRef3AllelesAD = new int[]{20, 0, 1}; - final int[] hetRefC3AllelesAD = new int[]{10, 10, 1}; - final int[] homC3AllelesAD = new int[]{0, 20, 1}; - final int[] hetRefG3AllelesAD = new int[]{10, 0, 11}; - final int[] hetCG3AllelesAD = new int[]{0, 12, 11}; // AA, AC, CC, AG, CG, GG - final int[] homG3AllelesAD = new int[]{0, 1, 21}; // AA, AC, CC, AG, CG, GG - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homRef3AllelesAD).PL(homRef3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AC).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).AD(new int[]{20, 0}).GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefC3AllelesAD).PL(hetRefC3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AC).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-10, 0, -20}).AD(new int[]{10, 10}).GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homC3AllelesAD).PL(homC3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AC).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -10, 0}).AD(new int[]{0, 20}).GQ(100).make())}); - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefG3AllelesAD).PL(hetRefG3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AG).make(), - Arrays.asList(new 
GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, 0, -50}).AD(new int[]{10, 11}).GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetCG3AllelesAD).PL(hetCG3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AG).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).AD(new int[]{0, 11}).GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homG3AllelesAD).PL(homG3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AG).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -40, 0}).AD(new int[]{0, 21}).GQ(100).make())}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "updatePLsAndADData") - public void testUpdatePLsAndADData(final VariantContext originalVC, - final VariantContext selectedVC, - final List expectedGenotypes) { - final VariantContext selectedVCwithGTs = new VariantContextBuilder(selectedVC).genotypes(originalVC.getGenotypes()).make(); - final GenotypesContext actual = GATKVariantContextUtils.updatePLsAndAD(selectedVCwithGTs, originalVC); - - Assert.assertEquals(actual.size(), expectedGenotypes.size()); - for ( final Genotype expected : expectedGenotypes ) { - final Genotype actualGT = actual.get(expected.getSampleName()); - Assert.assertNotNull(actualGT); - assertGenotypesAreEqual(actualGT, expected); - } - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java deleted file mode 100644 index a1b75a3f1..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java +++ /dev/null @@ -1,377 +0,0 @@ -/* -* Copyright (c) 2012 The Broad 
Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils.variant; - -import com.google.caliper.Param; -import com.google.caliper.SimpleBenchmark; -import org.broad.tribble.Feature; -import org.broad.tribble.FeatureCodec; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFCodec; - -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -/** - * Caliper microbenchmark of parsing a VCF file - */ -public class VariantContextBenchmark extends SimpleBenchmark { - @Param({"/Users/depristo/Desktop/broadLocal/localData/ALL.chr20.merged_beagle_mach.20101123.snps_indels_svs.genotypes.vcf"}) - String vcfFile; - - @Param({"1000"}) - int linesToRead; // set automatically by framework - - @Param({"100"}) - int nSamplesToTake; // set automatically by framework - - @Param({"10"}) - int dupsToMerge; // set automatically by framework - - @Param - Operation operation; // set automatically by framework - - private String INPUT_STRING; - - public enum Operation { - READ, - SUBSET_TO_SAMPLES, - GET_TYPE, - GET_ID, - GET_GENOTYPES, - GET_ATTRIBUTE_STRING, - GET_ATTRIBUTE_INT, - GET_N_SAMPLES, - GET_GENOTYPES_FOR_SAMPLES, - GET_GENOTYPES_IN_ORDER_OF_NAME, - CALC_GENOTYPE_COUNTS, - MERGE - } - - @Override protected void setUp() { - // TODO -- update for new tribble interface -// try { -// ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.b37KGReference)); -// b37GenomeLocParser = new GenomeLocParser(seq); -// } catch ( FileNotFoundException e) { -// throw new RuntimeException(e); -// } -// -// // read it into a String so that we don't try to benchmark IO issues -// try { -// FileInputStream s = new FileInputStream(new File(vcfFile)); -// AsciiLineReader lineReader = new AsciiLineReader(s); -// int counter = 0; -// StringBuffer sb = new StringBuffer(); -// while (counter++ < linesToRead ) { -// String line = lineReader.readLine(); -// if ( line == null ) -// break; -// 
sb.append(line + "\n"); -// } -// s.close(); -// INPUT_STRING = sb.toString(); -// } catch (IOException e) { -// throw new RuntimeException(e); -// } - } - - private interface FunctionToBenchmark { - public void run(T vc); - } - - private void runBenchmark(FeatureCodec codec, FunctionToBenchmark func) { - // TODO -- update for new Tribble interface -// try { -// InputStream is = new ByteArrayInputStream(INPUT_STRING.getBytes()); -// AsciiLineReader lineReader = new AsciiLineReader(is); -// codec.readHeader(lineReader); -// -// int counter = 0; -// while (counter++ < linesToRead ) { -// String line = lineReader.readLine(); -// if ( line == null ) -// break; -// -// T vc = codec.decode(line); -// func.run(vc); -// } -// } catch (Exception e) { -// System.out.println("Benchmarking run failure because of " + e.getMessage()); -// } - } - - public void timeV14(int rep) { - for ( int i = 0; i < rep; i++ ) { - FunctionToBenchmark func = getV14FunctionToBenchmark(); - final VCFCodec codec = new VCFCodec(); - runBenchmark(codec, func); - } - } - - public FunctionToBenchmark getV14FunctionToBenchmark() { - switch ( operation ) { - case READ: - return new FunctionToBenchmark() { - public void run(final VariantContext vc) { - ; // empty operation - } - }; - case SUBSET_TO_SAMPLES: - return new FunctionToBenchmark() { - Set samples; - public void run(final VariantContext vc) { - if ( samples == null ) - samples = new HashSet<>(new ArrayList<>(vc.getSampleNames()).subList(0, nSamplesToTake)); - VariantContext sub = vc.subContextFromSamples(samples); - sub.getNSamples(); - } - }; - case GET_TYPE: - return new FunctionToBenchmark() { - public void run(final VariantContext vc) { - vc.getType(); - } - }; - case GET_ID: - return new FunctionToBenchmark() { - public void run(final VariantContext vc) { - vc.getID(); - } - }; - case GET_GENOTYPES: - return new FunctionToBenchmark() { - public void run(final VariantContext vc) { - vc.getGenotypes().size(); - } - }; - - case 
GET_GENOTYPES_FOR_SAMPLES: - return new FunctionToBenchmark() { - Set samples; - public void run(final VariantContext vc) { - if ( samples == null ) - samples = new HashSet<>(new ArrayList<>(vc.getSampleNames()).subList(0, nSamplesToTake)); - vc.getGenotypes(samples).size(); - } - }; - - case GET_ATTRIBUTE_STRING: - return new FunctionToBenchmark() { - public void run(final VariantContext vc) { - vc.getAttribute("AN", null); - } - }; - - case GET_ATTRIBUTE_INT: - return new FunctionToBenchmark() { - public void run(final VariantContext vc) { - vc.getAttributeAsInt("AC", 0); - } - }; - - case GET_N_SAMPLES: - return new FunctionToBenchmark() { - public void run(final VariantContext vc) { - vc.getNSamples(); - } - }; - - case GET_GENOTYPES_IN_ORDER_OF_NAME: - return new FunctionToBenchmark() { - public void run(final VariantContext vc) { - ; // TODO - TEST IS BROKEN -// int n = 0; -// for ( final Genotype g: vc.getGenotypesOrderedByName() ) n++; - } - }; - - case CALC_GENOTYPE_COUNTS: - return new FunctionToBenchmark() { - public void run(final VariantContext vc) { - vc.getHetCount(); - } - }; - - case MERGE: - return new FunctionToBenchmark() { - public void run(final VariantContext vc) { - List toMerge = new ArrayList<>(); - - for ( int i = 0; i < dupsToMerge; i++ ) { - GenotypesContext gc = GenotypesContext.create(vc.getNSamples()); - for ( final Genotype g : vc.getGenotypes() ) { - gc.add(new GenotypeBuilder(g).name(g.getSampleName()+"_"+i).make()); - } - toMerge.add(new VariantContextBuilder(vc).genotypes(gc).make()); - } - - GATKVariantContextUtils.simpleMerge(toMerge, null, - GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNSORTED, - true, false, "set", false, true, false); - } - }; - - default: throw new IllegalArgumentException("Unexpected operation " + operation); - } - } - - // -------------------------------------------------------------------------------- - // - // V13 - // - // In 
order to use this, you must move the v13 version from archive and uncomment - // - // git mv private/archive/java/src/org/broadinstitute/sting/utils/variantcontext/v13 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13 - // - // -------------------------------------------------------------------------------- - -// public void timeV13(int rep) { -// for ( int i = 0; i < rep; i++ ) { -// FunctionToBenchmark func = getV13FunctionToBenchmark(); -// FeatureCodec codec = new org.broadinstitute.variant.variantcontext.v13.VCFCodec(); -// runBenchmark(codec, func); -// } -// } -// -// public FunctionToBenchmark getV13FunctionToBenchmark() { -// switch ( operation ) { -// case READ: -// return new FunctionToBenchmark() { -// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { -// ; // empty operation -// } -// }; -// case SUBSET_TO_SAMPLES: -// return new FunctionToBenchmark() { -// List samples; -// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { -// if ( samples == null ) -// samples = new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake); -// org.broadinstitute.variant.variantcontext.v13.VariantContext sub = vc.subContextFromGenotypes(vc.getGenotypes(samples).values()); -// sub.getNSamples(); -// } -// }; -// -// case GET_TYPE: -// return new FunctionToBenchmark() { -// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { -// vc.getType(); -// } -// }; -// case GET_ID: -// return new FunctionToBenchmark() { -// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { -// vc.getID(); -// } -// }; -// case GET_GENOTYPES: -// return new FunctionToBenchmark() { -// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { -// vc.getGenotypes().size(); -// } -// }; -// -// case GET_GENOTYPES_FOR_SAMPLES: -// return new FunctionToBenchmark() { -// Set samples; -// public void 
run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { -// if ( samples == null ) -// samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); -// vc.getGenotypes(samples).size(); -// } -// }; -// -// case GET_ATTRIBUTE_STRING: -// return new FunctionToBenchmark() { -// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { -// vc.getExtendedAttribute("AN", null); -// } -// }; -// -// case GET_ATTRIBUTE_INT: -// return new FunctionToBenchmark() { -// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { -// vc.getAttributeAsInt("AC", 0); -// } -// }; -// -// case GET_N_SAMPLES: -// return new FunctionToBenchmark() { -// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { -// vc.getNSamples(); -// } -// }; -// -// case GET_GENOTYPES_IN_ORDER_OF_NAME: -// return new FunctionToBenchmark() { -// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { -// ; // TODO - TEST IS BROKEN -// //vc.getGenotypesOrderedByName(); -// } -// }; -// -// case CALC_GENOTYPE_COUNTS: -// return new FunctionToBenchmark() { -// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { -// vc.getHetCount(); -// } -// }; -// -// case MERGE: -// return new FunctionToBenchmark() { -// public void run(final org.broadinstitute.variant.variantcontext.v13.VariantContext vc) { -// List toMerge = new ArrayList(); -// -// for ( int i = 0; i < dupsToMerge; i++ ) { -// Map gc = new HashMap(); -// for ( final org.broadinstitute.variant.variantcontext.v13.Genotype g : vc.getGenotypes().values() ) { -// String name = g.getSampleName()+"_"+i; -// gc.put(name, new org.broadinstitute.variant.variantcontext.v13.Genotype(name, -// g.getAlleles(), g.getLog10PError(), g.getFilters(), g.getAttributes(), g.isPhased(), g.getLikelihoods().getAsVector())); -// 
toMerge.add(org.broadinstitute.variant.variantcontext.v13.VariantContext.modifyGenotypes(vc, gc)); -// } -// } -// -// org.broadinstitute.variant.variantcontext.v13.VariantContextUtils.simpleMerge(b37GenomeLocParser, -// toMerge, null, -// org.broadinstitute.variant.variantcontext.v13.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, -// org.broadinstitute.variant.variantcontext.v13.VariantContextUtils.GenotypeMergeType.UNSORTED, -// true, false, "set", false, true, false); -// } -// }; -// -// default: throw new IllegalArgumentException("Unexpected operation " + operation); -// } -// } - - public static void main(String[] args) { - com.google.caliper.Runner.main(VariantContextBenchmark.class, args); - } -} diff --git a/public/package-tests/pom.xml b/public/package-tests/pom.xml new file mode 100644 index 000000000..0eda2ae00 --- /dev/null +++ b/public/package-tests/pom.xml @@ -0,0 +1,201 @@ + + + 4.0.0 + + + + + org.broadinstitute.sting + sting-root + 3.0 + ../sting-root + + + sting-package-tests + pom + Sting Package Tests + + + ${project.basedir}/../.. 
+ true + true + true + true + true + + + + + + ${project.groupId} + ${sting.packagetests.artifactId} + ${project.version} + + + + + com.sun + tools + + + + com.google.code.cofoja + cofoja + + + + + ${project.groupId} + gatk-framework + ${project.version} + test-jar + test + + + + * + * + + + + + + + org.testng + testng + test + + + + com.google.caliper + caliper + test + + + + + ${sting.packagetests.basedir}/target + + + + + unittests + + false + + unittests.profile.enabled + true + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + ${sting.packagetests.basedir} + ${sting.packagetests.basedir} + ${project.build.outputDirectory}/ignored_by_package_test + ${sting.packagetests.testClasses} + + + + unit-tests + + test + + + ${sting.packageunittests.skipped} + + + + + + + + + + integrationtests + + false + + integrationtests.profile.enabled + true + + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + + ${sting.packagetests.basedir} + ${sting.packagetests.basedir} + ${project.build.outputDirectory}/ignored_by_package_test + ${sting.packagetests.testClasses} + + + + integration-tests + + verify + + + + ${sting.packageintegrationtests.skipped} + + + + pipeline-tests + + verify + + + + ${sting.packagepipelinetests.skipped} + + + + large-scale-tests + + verify + + + + ${sting.packagelargescaletests.skipped} + + + + knowledge-base-tests + + verify + + + + ${sting.packageknowledgebasetests.skipped} + + + + + + + + + + + diff --git a/public/packages/Aligner.xml b/public/packages/Aligner.xml deleted file mode 100644 index 031dfacfd..000000000 --- a/public/packages/Aligner.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - - diff --git a/public/packages/CreatePackager.xsl b/public/packages/CreatePackager.xsl deleted file mode 100644 index a89b6bb35..000000000 --- a/public/packages/CreatePackager.xsl +++ /dev/null @@ -1,220 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/public/packages/GATKEngine.xml b/public/packages/GATKEngine.xml deleted file mode 100644 index 08d2e1c2c..000000000 --- a/public/packages/GATKEngine.xml +++ /dev/null @@ -1,62 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/public/packages/GenomeAnalysisTK.xml b/public/packages/GenomeAnalysisTK.xml deleted file mode 100644 index e95c992b6..000000000 --- a/public/packages/GenomeAnalysisTK.xml +++ /dev/null @@ -1,40 +0,0 @@ - - - - - - - - - - - - - - - - - - diff --git a/public/packages/PicardPrivate.xml b/public/packages/PicardPrivate.xml deleted file mode 100644 index d898a5d07..000000000 --- a/public/packages/PicardPrivate.xml +++ /dev/null @@ -1,10 +0,0 @@ - - - - - - - - - - diff --git a/public/packages/Queue.xml b/public/packages/Queue.xml deleted file mode 100644 index 621a549d5..000000000 --- a/public/packages/Queue.xml +++ /dev/null @@ -1,41 +0,0 @@ - - - - - - - - - - - - - - - - - - - diff --git a/public/packages/QueueEngine.xml b/public/packages/QueueEngine.xml deleted file mode 100644 index af3e20219..000000000 --- a/public/packages/QueueEngine.xml +++ /dev/null @@ -1,78 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/public/pom.xml b/public/pom.xml new file mode 100644 index 000000000..40560cfbf --- /dev/null +++ b/public/pom.xml @@ -0,0 +1,47 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-root + 3.0 + sting-root + + + sting-public + pom + Sting Public + + + sting-root + gsalib + sting-utils + gatk-framework + gatk-package + + + + + + ${project.basedir}/.. 
+ + + + + + queue + + + !disable.queue + + + + gatk-queue-extgen + queue-framework + queue-package + + + + + diff --git a/public/queue-framework/pom.xml b/public/queue-framework/pom.xml new file mode 100644 index 000000000..2accd1175 --- /dev/null +++ b/public/queue-framework/pom.xml @@ -0,0 +1,270 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-aggregator + 3.0 + ../.. + + + queue-framework + jar + Queue Framework + + + ${project.basedir}/../.. + ${project.build.directory}/generated-sources/gatk-extensions + false + queue-package + + + + + ${project.groupId} + gatk-framework + ${project.version} + + + org.scala-lang + scala-compiler + + + log4j + log4j + + + net.sf.jgrapht + jgrapht + + + org.apache.commons + commons-email + + + javax.mail + mail + + + + ${project.groupId} + gatk-queue-extgen + ${project.version} + runtime + + + + + ${project.groupId} + gatk-framework + ${project.version} + test-jar + test + + + + org.testng + testng + test + + + + + + + + org.codehaus.mojo + exec-maven-plugin + + + generate-gatk-extensions + + exec + + generate-sources + + ${sting.generate-gatk-extensions.skipped} + java + + -classpath + + org.broadinstitute.sting.queue.extensions.gatk.GATKExtensionsGenerator + -l + WARN + -outDir + ${gatk.extensions.sources} + + + + + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-gatk-extensions + + add-source + + generate-sources + + + ${gatk.extensions.sources} + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + example-resources + ${sting.generate-resources.phase} + + + + + org.scala-tools + maven-scala-plugin + + + com.pyx4j + maven-junction-plugin + + + link-public-qscript + process-test-resources + + + unlink-public-qscript + clean + + + + + org.apache.maven.plugins + maven-resources-plugin + + + copy-resource-bundle-log4j + prepare-package + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + extract-resource-bundle + prepare-package + + + + + org.apache.maven.plugins + maven-invoker-plugin + 
+ + package-unittests + + + package-integrationtests + + + package-largescaletests + + + package-knowledgebasetests + + + package-pipelinetests + + + + + + + + + protected + + + ${basedir}/../../protected/gatk-protected/pom.xml + + + + + ${project.groupId} + gatk-protected + ${project.version} + true + + + + + private + + + ${basedir}/../../private/gatk-private/pom.xml + + + + + ${project.groupId} + gatk-private + ${project.version} + true + + + + + + + com.pyx4j + maven-junction-plugin + + + link-private-testdata + process-test-resources + + + unlink-private-testdata + clean + + + link-private-qscript + process-test-resources + + + unlink-private-qscript + clean + + + + + + + + + diff --git a/public/queue-framework/src/main/assembly/example-resources.xml b/public/queue-framework/src/main/assembly/example-resources.xml new file mode 100644 index 000000000..7d4ec43ef --- /dev/null +++ b/public/queue-framework/src/main/assembly/example-resources.xml @@ -0,0 +1,20 @@ + + example-resources + + tar.bz2 + + false + + + src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples + . 
+ + ExampleCountReads.scala + ExampleCountLoci.scala + ExampleUnifiedGenotyper.scala + ExampleReadFilter.scala + ExampleCustomWalker.scala + + + + diff --git a/public/java/src/org/broadinstitute/sting/queue/QueueVersion.java b/public/queue-framework/src/main/java/org/broadinstitute/sting/queue/QueueVersion.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/queue/QueueVersion.java rename to public/queue-framework/src/main/java/org/broadinstitute/sting/queue/QueueVersion.java diff --git a/public/java/src/org/broadinstitute/sting/queue/package-info.java b/public/queue-framework/src/main/java/org/broadinstitute/sting/queue/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/queue/package-info.java rename to public/queue-framework/src/main/java/org/broadinstitute/sting/queue/package-info.java diff --git a/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala new file mode 100644 index 000000000..9bb031c38 --- /dev/null +++ b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala @@ -0,0 +1,532 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.queue.qscripts.CNV + +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.util.VCF_BAM_utilities +import org.broadinstitute.sting.queue.util.DoC._ +import org.broadinstitute.sting.commandline.Hidden +import java.io.{PrintStream, PrintWriter} +import org.broadinstitute.sting.utils.text.XReadLines +import collection.JavaConversions._ +import org.broadinstitute.sting.gatk.walkers.coverage.CoverageUtils + +class xhmmCNVpipeline extends QScript { + qscript => + + @Input(doc = "bam input, as .bam or as a list of files", shortName = "I", required = true) + var bams: File = _ + + @Input(doc = "gatk jar file", shortName = "J", required = true) + var gatkJarFile: File = _ + + @Input(doc = "xhmm executable file", shortName = "xhmmExec", required = true) + var xhmmExec: File = _ + + @Input(doc = "Plink/Seq executable file", shortName = "pseqExec", required = true) + var pseqExec: File = _ + + @Argument(doc = "Plink/Seq SEQDB file (Reference genome sequence)", shortName = "SEQDB", required = true) + var pseqSeqDB: String = _ + + @Input(shortName = "R", doc = "ref", required = true) + var referenceFile: File = _ + + @Input(shortName = "L", doc = "Intervals", required = false) + var intervals: File = _ + + @Argument(doc = "level of parallelism for BAM DoC. 
By default is set to 0 [no scattering].", shortName = "scatter", required = false) + var scatterCountInput = 0 + + @Argument(doc = "Samples to run together for DoC. By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false) + var samplesPerJob = 1 + + @Output(doc = "Base name for files to output", shortName = "o", required = true) + var outputBase: File = _ + + @Hidden + @Argument(doc = "How should overlapping reads from the same fragment be handled?", shortName = "countType", required = false) + var countType = CoverageUtils.CountPileupType.COUNT_FRAGMENTS + + @Argument(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false) + var MAX_DEPTH = 20000 + + @Hidden + @Argument(doc = "Number of read-depth bins", shortName = "NUM_BINS", required = false) + var NUM_BINS = 200 + + @Hidden + @Argument(doc = "Starting value of read-depth bins", shortName = "START_BIN", required = false) + var START_BIN = 1 + + @Argument(doc = "Minimum read mapping quality", shortName = "MMQ", required = false) + var minMappingQuality = 0 + + @Argument(doc = "Minimum base quality to be counted in depth", shortName = "MBQ", required = false) + var minBaseQuality = 0 + + @Argument(doc = "Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false) + var wholeMatrixMemory = -1 + + @Argument(shortName = "minTargGC", doc = "Exclude all targets with GC content less than this value", required = false) + var minTargGC : Double = 0.1 + + @Argument(shortName = "maxTargGC", doc = "Exclude all targets with GC content greater than this value", required = false) + var maxTargGC : Double = 0.9 + + @Argument(shortName = "minTargRepeats", doc = "Exclude all targets with % of repeat-masked bases less than this value", required = false) + var minTargRepeats : Double = 0.0 + + @Argument(shortName = "maxTargRepeats", doc = "Exclude all targets with % of repeat-masked bases 
greater than this value", required = false) + var maxTargRepeats : Double = 0.1 + + @Argument(shortName = "sampleIDsMap", doc = "File mapping BAM sample IDs to desired sample IDs", required = false) + var sampleIDsMap: String = "" + + @Argument(shortName = "sampleIDsMapFromColumn", doc = "Column number of OLD sample IDs to map", required = false) + var sampleIDsMapFromColumn = 1 + + @Argument(shortName = "sampleIDsMapToColumn", doc = "Column number of NEW sample IDs to map", required = false) + var sampleIDsMapToColumn = 2 + + @Argument(shortName = "rawFilters", doc = "xhmm command-line parameters to filter targets and samples from raw data", required = false) + var targetSampleFiltersString: String = "" + + @Argument(shortName = "PCAnormalize", doc = "xhmm command-line parameters to Normalize data using PCA information", required = false) + var PCAnormalizeMethodString: String = "" + + @Argument(shortName = "normalizedFilters", doc = "xhmm command-line parameters to filter targets and samples from PCA-normalized data", required = false) + var targetSampleNormalizedFiltersString: String = "" + + @Argument(shortName = "xhmmParams", doc = "xhmm model parameters file", required = true) + var xhmmParamsArg: File = _ + + @Argument(shortName = "discoverParams", doc = "xhmm command-line parameters for discovery step", required = false) + var discoverCommandLineParams: String = "" + + @Argument(shortName = "genotypeParams", doc = "xhmm command-line parameters for genotyping step", required = false) + var genotypeCommandLineParams: String = "" + + @Argument(shortName = "genotypeSubsegments", doc = "Should we also genotype all subsegments of the discovered CNV?", required = false) + var genotypeSubsegments: Boolean = false + + @Argument(shortName = "maxTargetsInSubsegment", doc = "If genotypeSubsegments, then only consider sub-segments consisting of this number of targets or fewer", required = false) + var maxTargetsInSubsegment = 30 + + @Argument(shortName = 
"subsegmentGenotypeThreshold", doc = "If genotypeSubsegments, this is the default genotype quality threshold for the sub-segments", required = false) + var subsegmentGenotypeThreshold = 20.0 + + @Argument(shortName = "longJobQueue", doc = "Job queue to run the 'long-running' commands", required = false) + var longJobQueue: String = "" + + + val PREPARED_TARGS_SUFFIX: String = ".merged.interval_list" + + val RD_OUTPUT_SUFFIX: String = ".RD.txt" + + val TARGS_GC_SUFFIX = ".locus_GC.txt" + val EXTREME_GC_TARGS_SUFFIX = ".extreme_gc_targets.txt" + + val TARGS_REPEAT_COMPLEXITY_SUFFIX = ".locus_complexity.txt" + val EXTREME_REPEAT_COMPLEXITY_SUFFIX = ".extreme_complexity_targets.txt" + + val FILTERED_TARGS_SUFFIX: String = ".filtered_targets.txt" + val FILTERED_SAMPS_SUFFIX: String = ".filtered_samples.txt" + + + trait WholeMatrixMemoryLimit extends CommandLineFunction { + // Since loading ALL of the data can take significant memory: + if (wholeMatrixMemory < 0) { + this.memoryLimit = 24 + } + else { + this.memoryLimit = wholeMatrixMemory + } + } + + trait LongRunTime extends CommandLineFunction { + if (longJobQueue != "") + this.jobQueue = longJobQueue + } + + def script = { + val prepTargets = new PrepareTargets(List(qscript.intervals), outputBase.getPath + PREPARED_TARGS_SUFFIX, xhmmExec, referenceFile) + add(prepTargets) + + trait CommandLineGATKArgs extends CommandLineGATK { + this.intervals :+= prepTargets.out + this.jarFile = qscript.gatkJarFile + this.reference_sequence = qscript.referenceFile + this.logging_level = "INFO" + } + + val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = VCF_BAM_utilities.getMapOfBAMsForSample(VCF_BAM_utilities.parseBAMsInput(bams)) + val samples: List[String] = sampleToBams.keys.toList + Console.out.printf("Samples are %s%n", samples) + + val groups: List[Group] = buildDoCgroups(samples, sampleToBams, samplesPerJob, outputBase) + var docs: List[DoC] = List[DoC]() + for (group <- groups) { + 
Console.out.printf("Group is %s%n", group) + docs ::= new DoC(group.bams, group.DoC_output, countType, MAX_DEPTH, minMappingQuality, minBaseQuality, scatterCountInput, START_BIN, NUM_BINS, Nil) with CommandLineGATKArgs + } + addAll(docs) + + val mergeDepths = new MergeGATKdepths(docs.map(u => u.intervalSampleOut), outputBase.getPath + RD_OUTPUT_SUFFIX, "_mean_cvg", xhmmExec, sampleIDsMap, sampleIDsMapFromColumn, sampleIDsMapToColumn, None, false) with WholeMatrixMemoryLimit with LongRunTime + add(mergeDepths) + + var excludeTargets : List[File] = List[File]() + if (minTargGC > 0 || maxTargGC < 1) { + val calcGCcontents = new GCContentByInterval with CommandLineGATKArgs + calcGCcontents.out = outputBase.getPath + TARGS_GC_SUFFIX + add(calcGCcontents) + + val excludeTargetsBasedOnGC = new ExcludeTargetsBasedOnValue(calcGCcontents.out, EXTREME_GC_TARGS_SUFFIX, minTargGC, maxTargGC) + add(excludeTargetsBasedOnGC) + excludeTargets ::= excludeTargetsBasedOnGC.out + } + + class CalculateRepeatComplexity(outFile : String) extends CommandLineFunction { + @Input(doc="") + var intervals: File = prepTargets.out + + @Output(doc="") + var out : File = new File(outFile) + + val regFile : String = outputBase.getPath + ".targets.reg" + val locDB : String = outputBase.getPath + ".targets.LOCDB" + + val removeFiles = "rm -f " + regFile + " " + locDB + val createRegFile = "cat " + intervals + " | awk 'BEGIN{OFS=\"\\t\"; print \"#CHR\\tBP1\\tBP2\\tID\"} {split($1,a,\":\"); chr=a[1]; if (match(chr,\"chr\")==0) {chr=\"chr\"chr} split(a[2],b,\"-\"); bp1=b[1]; bp2=bp1; if (length(b) > 1) {bp2=b[2]} print chr,bp1,bp2,NR}' > " + regFile + val createLOCDB = pseqExec + " . loc-load --locdb " + locDB + " --file " + regFile + " --group targets --out " + locDB + ".loc-load" + val calcRepeatMaskedPercent = pseqExec + " . 
loc-stats --locdb " + locDB + " --group targets --seqdb " + pseqSeqDB + " --out " + locDB + ".loc-stats" + val extractRepeatMaskedPercent = "cat " + locDB + ".loc-stats.locstats | awk '{if (NR > 1) print $_}' | sort -k1 -g | awk '{print $10}' | paste " + intervals + " - | awk '{print $1\"\\t\"$2}' > " + out + + var command: String = + removeFiles + + " && " + createRegFile + + " && " + createLOCDB + + " && " + calcRepeatMaskedPercent + + " && " + extractRepeatMaskedPercent + + def commandLine = command + + override def description = "Calculate the percentage of each target that is repeat-masked in the reference sequence: " + command + } + + if (minTargRepeats > 0 || maxTargRepeats < 1) { + val calcRepeatComplexity = new CalculateRepeatComplexity(outputBase.getPath + TARGS_REPEAT_COMPLEXITY_SUFFIX) + add(calcRepeatComplexity) + + val excludeTargetsBasedOnRepeats = new ExcludeTargetsBasedOnValue(calcRepeatComplexity.out, EXTREME_REPEAT_COMPLEXITY_SUFFIX, minTargRepeats, maxTargRepeats) + add(excludeTargetsBasedOnRepeats) + excludeTargets ::= excludeTargetsBasedOnRepeats.out + } + + val filterCenterDepths = new FilterCenterRawMatrix(mergeDepths.mergedDoC, excludeTargets) + add(filterCenterDepths) + + val pca = new PCA(filterCenterDepths.filteredCentered) + add(pca) + + val normalize = new Normalize(pca) + add(normalize) + + val filterZscore = new FilterAndZscoreNormalized(normalize.normalized) + add(filterZscore) + + val filterOriginal = new FilterOriginalData(mergeDepths.mergedDoC, filterCenterDepths, filterZscore) + add(filterOriginal) + + val discover = new DiscoverCNVs(filterZscore.filteredZscored, filterOriginal.sameFiltered) + add(discover) + + val genotype = new GenotypeCNVs(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered) + add(genotype) + + if (genotypeSubsegments) { + val genotypeSegs = new GenotypeCNVandSubsegments(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered) + add(genotypeSegs) + } + } + + class 
ExcludeTargetsBasedOnValue(locus_valueIn : File, outSuffix : String, minVal : Double, maxVal : Double) extends InProcessFunction { + @Input(doc="") + var locus_value : File = locus_valueIn + + @Output(doc="") + var out : File = new File(outputBase.getPath + outSuffix) + + def run = { + var outWriter = new PrintWriter(new PrintStream(out)) + var elems = asScalaIterator(new XReadLines(locus_value)) + + while (elems.hasNext) { + val line = elems.next + val splitLine = line.split("\\s+") + val locus = splitLine(0) + val locValStr = splitLine(1) + try { + val locVal = locValStr.toDouble + if (locVal < minVal || locVal > maxVal) + outWriter.printf("%s%n", locus) + } + catch { + case nfe: NumberFormatException => println("Ignoring non-numeric value " + locValStr + " for locus " + locus) + case e: Exception => throw e + } + } + + outWriter.close + } + } + + class FilterCenterRawMatrix(inputParam: File, excludeTargetsIn : List[File]) extends CommandLineFunction with WholeMatrixMemoryLimit with LongRunTime { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val excludeTargets = excludeTargetsIn + + @Output + val filteredCentered: File = new File(outputBase.getPath + ".filtered_centered" + RD_OUTPUT_SUFFIX) + @Output + val filteredTargets: File = new File(filteredCentered.getPath + FILTERED_TARGS_SUFFIX) + @Output + val filteredSamples: File = new File(filteredCentered.getPath + FILTERED_SAMPS_SUFFIX) + + var command: String = + xhmmExec + " --matrix" + + " -r " + input + + " --centerData --centerType target" + + " -o " + filteredCentered + + " --outputExcludedTargets " + filteredTargets + + " --outputExcludedSamples " + filteredSamples + command += excludeTargets.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _) + if (targetSampleFiltersString != "") + command += " " + targetSampleFiltersString + + def commandLine = command + + override def description = "Filters samples and targets and then mean-centers the targets: " + command + } + + class 
PCA(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit with LongRunTime { + @Input(doc = "") + val input = inputParam + + val PCAbase: String = outputBase.getPath + ".RD_PCA" + + @Output + val outPC: File = new File(PCAbase + ".PC.txt") + @Output + val outPC_SD: File = new File(PCAbase + ".PC_SD.txt") + @Output + val outPC_LOADINGS: File = new File(PCAbase + ".PC_LOADINGS.txt") + + var command: String = + xhmmExec + " --PCA" + + " -r " + input + + " --PCAfiles " + PCAbase + + def commandLine = command + + override def description = "Runs PCA on mean-centered data: " + command + } + + class Normalize(pca: PCA) extends CommandLineFunction with LongRunTime { + @Input(doc = "") + val input = pca.input + + @Input(doc = "") + val inPC = pca.outPC + + @Input(doc = "") + val inPC_SD = pca.outPC_SD + + @Input(doc = "") + val inPC_LOADINGS = pca.outPC_LOADINGS + + @Output + val normalized: File = new File(outputBase.getPath + ".PCA_normalized.txt") + + var command: String = + xhmmExec + " --normalize" + + " -r " + input + + " --PCAfiles " + pca.PCAbase + + " --normalizeOutput " + normalized + if (PCAnormalizeMethodString != "") + command += " " + PCAnormalizeMethodString + + def commandLine = command + + override def description = "Normalizes mean-centered data using PCA information: " + command + } + + class FilterAndZscoreNormalized(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit with LongRunTime { + @Input(doc = "") + val input = inputParam + + @Output + val filteredZscored: File = new File(outputBase.getPath + ".PCA_normalized.filtered.sample_zscores" + RD_OUTPUT_SUFFIX) + @Output + val filteredTargets: File = new File(filteredZscored.getPath + FILTERED_TARGS_SUFFIX) + @Output + val filteredSamples: File = new File(filteredZscored.getPath + FILTERED_SAMPS_SUFFIX) + + var command: String = + xhmmExec + " --matrix" + + " -r " + input + + " --centerData --centerType sample --zScoreData" + + " -o " + filteredZscored + + " 
--outputExcludedTargets " + filteredTargets + + " --outputExcludedSamples " + filteredSamples + if (targetSampleNormalizedFiltersString != "") + command += " " + targetSampleNormalizedFiltersString + + def commandLine = command + + override def description = "Filters and z-score centers (by sample) the PCA-normalized data: " + command + } + + class FilterOriginalData(inputParam: File, filt1: FilterCenterRawMatrix, filt2: FilterAndZscoreNormalized) extends CommandLineFunction with WholeMatrixMemoryLimit with LongRunTime { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val targFilters: List[File] = List(filt1.filteredTargets, filt2.filteredTargets).map(u => new File(u)) + + @Input(doc = "") + val sampFilters: List[File] = List(filt1.filteredSamples, filt2.filteredSamples).map(u => new File(u)) + + @Output + val sameFiltered: File = new File(outputBase.getPath + ".same_filtered" + RD_OUTPUT_SUFFIX) + + var command: String = + xhmmExec + " --matrix" + + " -r " + input + + targFilters.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _) + + sampFilters.map(u => " --excludeSamples " + u).reduceLeft(_ + "" + _) + + " -o " + sameFiltered + + def commandLine = command + + override def description = "Filters original read-depth data to be the same as filtered, normalized data: " + command + } + + class DiscoverCNVs(inputParam: File, origRDParam: File) extends CommandLineFunction with LongRunTime { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val xhmmParams = xhmmParamsArg + + @Input(doc = "") + val origRD = origRDParam + + @Output + val xcnv: File = new File(outputBase.getPath + ".xcnv") + + @Output + val aux_xcnv: File = new File(outputBase.getPath + ".aux_xcnv") + + val posteriorsBase = outputBase.getPath + + @Output + val dipPosteriors: File = new File(posteriorsBase + ".posteriors.DIP.txt") + + @Output + val delPosteriors: File = new File(posteriorsBase + ".posteriors.DEL.txt") + + @Output + val dupPosteriors: File = new 
File(posteriorsBase + ".posteriors.DUP.txt") + + var command: String = + xhmmExec + " --discover" + + " -p " + xhmmParams + + " -r " + input + + " -R " + origRD + + " -c " + xcnv + + " -a " + aux_xcnv + + " -s " + posteriorsBase + + " " + discoverCommandLineParams + + def commandLine = command + + override def description = "Discovers CNVs in normalized data: " + command + } + + abstract class BaseGenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends CommandLineFunction with LongRunTime { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val xhmmParams = xhmmParamsArg + + @Input(doc = "") + val origRD = origRDParam + + @Input(doc = "") + val inXcnv = xcnv + + var command: String = + xhmmExec + " --genotype" + + " -p " + xhmmParams + + " -r " + input + + " -g " + inXcnv + + " -F " + referenceFile + + " -R " + origRD + + " " + genotypeCommandLineParams + } + + class GenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) { + @Output + val vcf: File = new File(outputBase.getPath + ".vcf") + + command += + " -v " + vcf + + def commandLine = command + + override def description = "Genotypes discovered CNVs in all samples: " + command + } + + class GenotypeCNVandSubsegments(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) { + @Output + val vcf: File = new File(outputBase.getPath + ".subsegments.vcf") + + command += + " -v " + vcf + + " --subsegments" + + " --maxTargetsInSubsegment " + maxTargetsInSubsegment + + " --genotypeQualThresholdWhenNoExact " + subsegmentGenotypeThreshold + + def commandLine = command + + override def description = "Genotypes discovered CNVs (and their sub-segments, of up to " + maxTargetsInSubsegment + " targets) in all samples: " + command + } +} diff --git a/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala 
b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala new file mode 100644 index 000000000..d0c917a9e --- /dev/null +++ b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -0,0 +1,426 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.queue.qscripts + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException +import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction +import org.broadinstitute.sting.queue.function.JavaCommandLineFunction + +class GATKResourcesBundle extends QScript { + // todo -- update to released version when things stabilize + @Argument(doc="gatkJarFile", required=false) + var gatkJarFile: File = new File("dist/GenomeAnalysisTK.jar") + + @Argument(doc="liftOverPerl", required=false) + var liftOverPerl: File = new File("./public/perl/liftOverVCF.pl") + + @Argument(shortName = "ver", doc="The GIT version of this release", required=true) + var BUNDLE_VERSION: String = _ + + @Argument(shortName = "bundleDir", doc="Path to root where resource files will be placed", required=false) + val BUNDLE_ROOT = new File("/humgen/gsa-hpprojects/GATK/bundle") + + @Argument(shortName = "downloadDir", doc="Path to root where resource files will be placed for users to download", required=false) + val DOWNLOAD_ROOT = new File("/humgen/gsa-scr1/pub/bundle") + + @Argument(shortName = "test", doc="", required=false) + val TEST = false + + @Argument(shortName = "phase2", doc="", required=false) + val DO_DOWNLOAD = false + + val SITES_EXT: String = "sites" + + def BUNDLE_DIR: File = BUNDLE_ROOT + "/" + BUNDLE_VERSION + def DOWNLOAD_DIR: File = DOWNLOAD_ROOT + "/" + BUNDLE_VERSION + + // REFERENCES + class Reference( val name: String, val file: File ) { } + var hg19: Reference = _ + var b37: Reference = _ + var hg18: Reference = _ + var b36: Reference = _ + var exampleFASTA: Reference = _ + var refs: List[Reference] = _ + + class Resource(val file: File, val name: String, val ref: Reference, val useName: Boolean = true, val makeSites: Boolean = true, 
val makeCallsIfBam: Boolean = true ) { + def destname(target: Reference): String = { + if ( useName ) + return name + "." + target.name + "." + getExtension(file) + else + return file.getName + } + } + + def liftover(in: File, inRef: Reference, out: File, outRef: Reference): CommandLineFunction = { + //Console.printf("liftover(%s => %s)%n", inRef.name, outRef.name) + (inRef.name, outRef.name) match { + case ("b37", "hg19") => + return new LiftOverPerl(in, out, new File("public/chainFiles/b37tohg19.chain"), inRef, outRef) + case ("b37", "hg18") => + return new LiftOverPerl(in, out, new File("public/chainFiles/b37tohg18.chain"), inRef, outRef) + case ("b37", "b36") => + return new LiftOverPerl(in, out, new File("public/chainFiles/b37tob36.chain"), inRef, outRef) + case _ => return null + } + } + + def isVCF(file: File) = file.getName.endsWith(".vcf") + def isBAM(file: File) = file.getName.endsWith(".bam") + def isOUT(file: File) = file.getName.endsWith(".out") + def isFASTA(file: File) = file.getName.endsWith(".fasta") + def isIntervalList(file: File) = file.getName.endsWith(".interval_list") + + var RESOURCES: List[Resource] = Nil + def addResource(comp: Resource) { RESOURCES = comp :: RESOURCES } + + trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { + this.logging_level = "INFO"; + this.jarFile = gatkJarFile; + this.memoryLimit = 2 + } + + def initializeTestDataFiles() = { + // + // Standard evaluation files for indel + // + b37 = new Reference("b37", new File("/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta")) + hg18 = new Reference("hg18", new File("/Users/depristo/Desktop/broadLocal/localData/Homo_sapiens_assembly18.fasta")) + exampleFASTA = new Reference("exampleFASTA", new File("public/testdata/exampleFASTA.fasta")) + refs = List(b37, hg18, exampleFASTA) + + val DATAROOT = "/Users/depristo/Desktop/broadLocal/localData/" + //addResource(new Resource(DATAROOT + "human_g1k_v37.fasta", "human_g1k_v37.fasta", b37, false)) + addResource(new 
Resource(DATAROOT + "1000G.snp.validation.b37.vcf", "1000G.snp.validation", b37)) + addResource(new Resource(DATAROOT + "dbsnp_132_b37.vcf", "dbsnp_132", b37, true, false)) + + addResource(new Resource(exampleFASTA.file, "exampleFASTA", exampleFASTA, false)) + addResource(new Resource("public/testdata/exampleBAM.bam", "exampleBAM", exampleFASTA, false, false, false)) + } + + def initializeStandardDataFiles() = { + // + // references + // + hg19 = new Reference("hg19", new File("/humgen/gsa-hpprojects/GATK/data/ucsc.hg19/ucsc.hg19.fasta")) + b37 = new Reference("b37", new File("/humgen/1kg/reference/human_g1k_v37.fasta")) + hg18 = new Reference("hg18", new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")) + b36 = new Reference("b36", new File("/humgen/1kg/reference/human_b36_both.fasta")) + exampleFASTA = new Reference("exampleFASTA", new File("public/testdata/exampleFASTA.fasta")) + refs = List(hg19, b37, hg18, b36, exampleFASTA) + + addResource(new Resource(b37.file, "", b37, false)) + addResource(new Resource(b36.file, "", b36, false)) + addResource(new Resource(hg19.file, "", hg19, false)) + addResource(new Resource(hg18.file, "", hg18, false)) + + // + // The b37_decoy reference + // + addResource(new Resource("/humgen/1kg/reference/human_g1k_v37_decoy.fasta", + "IGNORE", b37, false, false)) + + // + // standard VCF files. 
Will be lifted to each reference + // + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_138_b37.leftAligned.vcf", + "dbsnp_138", b37, true, false)) + + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_2141_samples.b37.vcf", + "1000G_omni2.5", b37, true, false)) + + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf", + "hapmap_3.3", b37, true, false)) + + addResource(new Resource("/humgen/1kg/DCC/ftp/technical/working/20120312_phase1_v2_indel_cleaned_sites_list/ALL.wgs.phase1_release_v2.20101123.official_indel_calls.20120312.sites.vcf", + "1000G_phase1.indels", b37, true, false)) + + addResource(new Resource("/humgen/1kg/processing/official_release/phase1/projectConsensus/phase1.wgs.projectConsensus.v2b.recal.highQuality.vcf", + "1000G_phase1.snps.high_confidence", b37, true, false)) + + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/GoldStandardIndel/gold.standard.indel.MillsAnd1000G.b37.vcf", + "Mills_and_1000G_gold_standard.indels", b37, true, false)) + + // + // CEU trio (NA12878,NA12891,NA12892) best practices results + // + + addResource(new Resource("/humgen/1kg/processing/production_wgs_final/trio/CEU/CEU.wgs.HaplotypeCaller.20131118.snps_indels.high_coverage_pcr_free.genotypes.vcf", + "CEUTrio.HiSeq.WGS.b37.bestPractices",b37,true,false)) + + // + // NA12878 knowledgebase snapshot + // + + addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/knowledgeBase/snapshots/NA12878.wgs.broad_truth_set.20131119.snps_and_indels.genotypes.vcf", + "NA12878.knowledgebase.snapshot.20131119",b37,true,false)) + + // + // example call set for documentation guide tutorial + // + addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.b37.subset.vcf", + "NA12878.HiSeq.WGS.bwa.cleaned.raw.subset", b37, 
true, true)) + + // + // Test BAM file, only for the b37 reference + // + addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37.NA12878.bam", + "IGNORE", b37, false, false)) + + // + // Exome targets file, only for the b37 reference + // + addResource(new Resource("/seq/references/HybSelOligos/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", + "Broad.human.exome", b37, true, false, false)) + + // + // refGene files specific to each reference + // + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/refGene_b37.sorted.txt", + "refGene", b37, true, false)) + + addResource(new Resource("public/chainFiles/hg18tob37.chain", "", hg18, false, false)) + addResource(new Resource("public/chainFiles/b36tob37.chain", "", b36, false, false)) + + // todo -- chain files? + // todo 1000G SNP and indel call sets? + + // + // exampleFASTA file + // + addResource(new Resource(exampleFASTA.file, "exampleFASTA", exampleFASTA, false)) + addResource(new Resource("public/testdata/exampleBAM.bam", "exampleBAM", exampleFASTA, false, false, false)) + } + + def createBundleDirectories(dir: File) = { + if ( ! dir.exists ) dir.mkdirs() + + for ( ref <- refs ) { + val refDir = new File(dir + "/" + ref.name) + if ( ! refDir.exists ) refDir.mkdirs() + } + } + + def createCurrentLink(bundleDir: File) = { + + val currentLink = new File(BUNDLE_ROOT + "/current") + + if ( currentLink.exists ) add(new deleteLink(currentLink)) + + add(new linkFile(bundleDir, currentLink)) + } + + def script = { + if ( TEST ) + initializeTestDataFiles(); + else + initializeStandardDataFiles(); + + if ( ! 
DO_DOWNLOAD ) { + // create destination directory structure + createBundleDirectories(BUNDLE_DIR) + + for ( resource: Resource <- RESOURCES ) { + if ( isFASTA(resource.file) ) { + copyBundleFasta(resource, resource.ref) + } else if ( isBAM(resource.file) ) { + val f = copyBundleFile(resource, resource.ref) + add(new IndexBAM(f)) + if ( resource.makeCallsIfBam ) { + @Output val outvcf: File = swapExt(f.getParent, f, ".bam", ".vcf") + add(new UG(resource.file, resource.ref.file, outvcf)) + } + } else if ( isVCF(resource.file) ) { + for ( destRef <- refs ) { + val out = destFile(BUNDLE_DIR, destRef, resource.destname(destRef)) + var continue = true + + // copy or lift over the original vcf + if ( resource.ref == destRef ) { + add(new cpFile(resource.file, out)) + } else { + val clf = liftover(resource.file, resource.ref, out, destRef) + if ( clf != null ) { + add(clf) + } else { + continue = false + } + } + + if ( continue ) { + add(new IndexVCF(out, destRef.file)) + + if ( resource.makeSites ) { + val sites: Resource = new Resource(swapExt(out.getParent, out, ".vcf", "." 
+ SITES_EXT + ".vcf"), "", destRef, false) + add(new JustSites(out, sites.file)) + add(new IndexVCF(sites.file, destRef.file)) + } + + if ( resource.name.contains("dbsnp") ) { + val dbsnp129: Resource = new Resource(swapExt(out.getParent, out, ".vcf", ".excluding_sites_after_129.vcf"), "", destRef, false) + add(new MakeDBSNP129(out, destRef.file, dbsnp129.file)) + add(new IndexVCF(dbsnp129.file, destRef.file)) + } + } + } + } else if ( isIntervalList(resource.file) ) { + val out = destFile(BUNDLE_DIR, resource.ref, resource.destname(resource.ref)) + add(new cpFile(resource.file, out)) + } else { + //throw new ReviewedStingException("Unknown file type: " + resource) + } + } + + } else { + createCurrentLink(BUNDLE_DIR) + createBundleDirectories(DOWNLOAD_DIR) + createDownloadsFromBundle(BUNDLE_DIR, DOWNLOAD_DIR) + } + } + + + def createDownloadsFromBundle(in: File, out: File) { + Console.printf("Visiting %s%n", in) + if (! in.getName.startsWith(".")) { + if ( in.isDirectory ) { + out.mkdirs + + for ( child: File <- in.listFiles ) { + createDownloadsFromBundle(child, out + "/" + child.getName) + } + } else { + if ( isBAM(in) ) { + add(new cpFile(in, out)) + add(new md5sum(out)) + } else if ( !isOUT(in) ) { + add(new GzipFile(in, out + ".gz")) + add(new md5sum(out + ".gz")) + } + + } + } + } + + def copyBundleFasta(res: Resource, ref: Reference) { + val out = destFile(BUNDLE_DIR, ref, res.destname(ref)) + add(new cpFile(res.file, out)) + + val oldRefDict = swapExt(res.file.getParent, res.file, ".fasta", ".dict") + val newRefDict = swapExt(out.getParent, out, ".fasta", ".dict") + + val oldRefFai = swapExt(res.file.getParent, res.file, ".fasta", ".fasta.fai") + val newRefFai = swapExt(out.getParent, out, ".fasta", ".fasta.fai") + + add(new cpFile(oldRefDict, newRefDict)) + add(new cpFile(oldRefFai, newRefFai)) + } + + def copyBundleFile(res: Resource, ref: Reference): File = { + val out = destFile(BUNDLE_DIR, ref, res.destname(ref)) + add(new cpFile(res.file, out)) + 
return out + } + + def destFile(dir: File, ref: Reference, f: File): File = { + return destFile(dir, ref, f.getName) + } + + def destFile(dir: File, ref: Reference, name: String): File = { + return new File(dir + "/" + ref.name + "/" + name) + } + + /** + * A command line (cut) that removes all genotyping information from a file + */ + class JustSites(@Input(doc="foo") in: File, @Output(doc="foo") out: File) extends CommandLineFunction { + def commandLine = "cut -f 1-8 %s > %s".format(in, out) + } + + class GzipFile(@Input val in: File, @Output val out: File) extends CommandLineFunction { + def commandLine = "gzip -c %s > %s".format(in.getAbsolutePath, out.getAbsolutePath) + } + + class cpFile(@Input val in: File, @Output val out: File) extends CommandLineFunction { + def commandLine = "cp %s %s".format(in.getAbsolutePath, out.getAbsolutePath) + } + + class deleteLink(@Input val in: File) extends CommandLineFunction { + def commandLine = "rm %s".format(in.getAbsolutePath) + } + + class linkFile(@Input val in: File, @Output val out: File) extends CommandLineFunction { + def commandLine = "ln -s %s %s".format(in.getAbsolutePath, out.getAbsolutePath) + } + + class md5sum(@Input val in: File) extends CommandLineFunction { + @Output val o: File = new File(in.getAbsolutePath + ".md5") + def commandLine = "md5sum %s > %s".format(in.getAbsolutePath, o) + } + + class IndexBAM(bamIn: File) extends SamtoolsIndexFunction { + bamFile = bamIn + } + + class IndexVCF(@Input vcf: File, @Input ref: File) extends CountRODs with UNIVERSAL_GATK_ARGS { + //@Output val vcfIndex: File = swapExt(vcf.getParent, vcf, ".vcf", ".vcf.idx") + this.rod :+= vcf + this.reference_sequence = ref + } + + class UG(@Input bam: File, @Input ref: File, @Input outVCF: File) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS { + this.input_file = List(bam) + this.reference_sequence = ref + this.intervalsString ++= List("20"); + this.out = outVCF + } + + class MakeDBSNP129(@Input dbsnp: File, @Input ref: 
File, @Output dbsnp129: File) extends SelectVariants with UNIVERSAL_GATK_ARGS { + this.variant = dbsnp + this.select ++= List("dbSNPBuildID <= 129") + this.reference_sequence = ref + this.out = dbsnp129 + } + + class LiftOverPerl(@Input val in: File, @Output val out: File, @Input val chain: File, oldRef: Reference, newRef: Reference) extends CommandLineFunction { + this.memoryLimit = 12 + def commandLine = ("%s -vcf %s -chain %s -out %s " + + "-gatk ./ -newRef %s -oldRef %s -tmp %s").format(liftOverPerl, in.getAbsolutePath, chain, + out.getAbsolutePath, newRef.file.replace(".fasta", ""), + oldRef.file.replace(".fasta", ""), jobTempDir) + } + + def getExtension(f: File): String = { + val i = f.getName.lastIndexOf('.'); + if (i > 0 && i < f.getName.length() - 1) + return f.getName.substring(i+1).toLowerCase(); + else + return ""; + } +} + diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala 
b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala diff --git a/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExamplePrintReads.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExamplePrintReads.scala new file mode 100644 index 000000000..18f2895b9 --- /dev/null +++ b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExamplePrintReads.scala @@ -0,0 +1,53 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.queue.qscripts.examples + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ + +/** + * Script used for testing output to /dev/null, deleting .bai files, etc. + */ +class ExamplePrintReads extends QScript { + @Input(doc="The reference file for the bam files.", shortName="R") + var referenceFile: File = _ + + @Input(doc="Bam file to genotype.", shortName="I") + var bamFile: File = _ + + @Output(doc="Bam output", shortName="out") + var outFile: File = _ + + def script() { + val printReads = new PrintReads + printReads.reference_sequence = referenceFile + printReads.memoryLimit = 2 + printReads.scatterCount = 3 + printReads.input_file :+= bamFile + printReads.out = outFile + add(printReads) + } +} diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala 
b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/Vcf2Table.q b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/lib/Vcf2Table.q similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/Vcf2Table.q rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/lib/Vcf2Table.q diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala rename to 
public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala diff --git a/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R b/public/queue-framework/src/main/resources/org/broadinstitute/sting/queue/util/queueJobReport.R similarity index 100% rename from public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R rename to public/queue-framework/src/main/resources/org/broadinstitute/sting/queue/util/queueJobReport.R diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QCommandLine.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QCommandLine.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QCommandPlugin.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QCommandPlugin.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/QException.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QException.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/QException.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QException.scala diff --git a/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QScript.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QScript.scala new file mode 100644 index 000000000..afd1bbc19 --- /dev/null +++ b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QScript.scala @@ -0,0 +1,184 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* 
Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.queue + +import engine.JobRunInfo +import org.broadinstitute.sting.queue.function.QFunction +import annotation.target.field +import util._ +import org.broadinstitute.sting.commandline.ArgumentSource + +/** + * Defines a Queue pipeline as a collection of CommandLineFunctions. 
+ */ +trait QScript extends Logging with PrimitiveOptionConversions with StringFileConversions { + + // Type aliases so users don't have to import + type File = java.io.File + type CommandLineFunction = org.broadinstitute.sting.queue.function.CommandLineFunction + type InProcessFunction = org.broadinstitute.sting.queue.function.InProcessFunction + type ScatterGatherableFunction = org.broadinstitute.sting.queue.function.scattergather.ScatterGatherableFunction + type SimpleTextGatherFunction = org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction + + // Make sure annotations can be used in class constructors but target the fields + // ex: class MyClass(@Input var myVar: File) {} + // This was implicitly enabled in 2.8.0-RC2 and then updated to this new syntax: + // http://lampsvn.epfl.ch/trac/scala/ticket/3596 + // http://lampsvn.epfl.ch/trac/scala/ticket/3421 + type Input = org.broadinstitute.sting.commandline.Input @field + type Output = org.broadinstitute.sting.commandline.Output @field + type Argument = org.broadinstitute.sting.commandline.Argument @field + type ArgumentCollection = org.broadinstitute.sting.commandline.ArgumentCollection @field + type Gather = org.broadinstitute.sting.commandline.Gather @field + + /** + * Default settings for QFunctions + */ + var qSettings: QSettings = _ + + /** + * Builds the CommandLineFunctions that will be used to run this script and adds them to this.functions directly or using the add() utility method. + */ + def script() + + /** + * A default handler for the onExecutionDone() function. By default this doesn't do anything + */ + def onExecutionDone(jobs: Map[QFunction, JobRunInfo], success: Boolean) { + } + + /** + * The command line functions that will be executed for this QScript. + */ + var functions = Seq.empty[QFunction] + + /** + * Exchanges the extension on a file. + * @param file File to look for the extension. + * @param oldExtension Old extension to strip off, if present. 
+ * @param newExtension New extension to append. + * @return new File with the new extension in the current directory. + */ + protected def swapExt(file: File, oldExtension: String, newExtension: String) = + new File(file.getName.stripSuffix(oldExtension) + newExtension) + + /** + * Exchanges the extension on a file. + * @param dir New directory for the file. + * @param file File to look for the extension. + * @param oldExtension Old extension to strip off, if present. + * @param newExtension New extension to append. + * @return new File with the new extension in dir. + */ + protected def swapExt(dir: File, file: File, oldExtension: String, newExtension: String) = + new File(dir, file.getName.stripSuffix(oldExtension) + newExtension) + + /** + * Adds one or more command line functions to be run. + * @param functions Functions to add. + */ + def add(functions: QFunction*) { + functions.foreach(function => function.addOrder = QScript.nextAddOrder) + this.functions ++= functions + } + + def addAll(functions: Traversable[QFunction]) { + functions.foreach( f => add(f) ) + } + + /** + * Convert all @Output files to remote output files. + * @param remoteFileConverter Converter for files to remote files. 
+ */ + def mkRemoteOutputs(remoteFileConverter: RemoteFileConverter) { + for (field <- outputFields) { + val fieldFile = ClassFieldCache.getFieldFile(this, field) + if (fieldFile != null && !fieldFile.isInstanceOf[RemoteFile]) { + val fieldName = ClassFieldCache.fullName(field) + val remoteFile = remoteFileConverter.convertToRemote(fieldFile, fieldName) + ClassFieldCache.setFieldValue(this, field, remoteFile) + } + } + } + + /** + * Pull all remote files to the local disk + */ + def pullInputs() { + val inputs = ClassFieldCache.getFieldFiles(this, inputFields) + for (remoteFile <- filterRemoteFiles(inputs)) { + logger.info("Pulling %s from %s".format(remoteFile.getAbsolutePath, remoteFile.remoteDescription)) + remoteFile.pullToLocal() + } + } + + /** + * Push all remote files from the local disk + */ + def pushOutputs() { + val outputs = ClassFieldCache.getFieldFiles(this, outputFields) + for (remoteFile <- filterRemoteFiles(outputs)) { + logger.info("Pushing %s to %s".format(remoteFile.getAbsolutePath, remoteFile.remoteDescription)) + remoteFile.pushToRemote() + } + } + + private def filterRemoteFiles(fields: Seq[File]): Seq[RemoteFile] = + fields.filter(field => field != null && field.isInstanceOf[RemoteFile]).map(_.asInstanceOf[RemoteFile]) + /** + * @return the inputs or null if there are no inputs + */ + def remoteInputs: AnyRef = null + + /** + * @return the outputs or null if there are no outputs + */ + def remoteOutputs: AnyRef = null + + /** The complete list of fields. */ + def functionFields: Seq[ArgumentSource] = ClassFieldCache.classFunctionFields(this.getClass) + /** The @Input fields. */ + def inputFields: Seq[ArgumentSource] = ClassFieldCache.classInputFields(this.getClass) + /** The @Output fields. */ + def outputFields: Seq[ArgumentSource] = ClassFieldCache.classOutputFields(this.getClass) + /** The @Argument fields. 
*/ + def argumentFields: Seq[ArgumentSource] = ClassFieldCache.classArgumentFields(this.getClass) +} + +object QScript { + private var addOrder = 0 + private def nextAddOrder = { + addOrder += 1 + Seq(addOrder) + } + + /** + * Resets the add order back to zero. Useful for testing purposes. + */ + def resetAddOrder() { + addOrder = 0 + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QScriptManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QScriptManager.scala diff --git a/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QSettings.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QSettings.scala new file mode 100644 index 000000000..197d45e0a --- /dev/null +++ b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QSettings.scala @@ -0,0 +1,101 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.queue + +import java.io.File +import org.broadinstitute.sting.commandline.{ClassType, Argument} + +/** + * Default settings settable on the command line and passed to CommandLineFunctions. + */ +class QSettings { + @Argument(fullName="run_name", shortName="runName", doc="A name for this run used for various status messages.", required=false) + var runName: String = _ + + @Argument(fullName="job_project", shortName="jobProject", doc="Default project for compute farm jobs.", required=false) + var jobProject: String = _ + + @Argument(fullName="job_queue", shortName="jobQueue", doc="Default queue for compute farm jobs.", required=false) + var jobQueue: String = _ + + @Argument(fullName="job_priority", shortName="jobPriority", doc="Default priority for jobs. Min = 0, Max = 100", required=false) + @ClassType(classOf[Int]) + var jobPriority: Option[Int] = None + + @Argument(fullName="job_native_arg", shortName="jobNative", doc="Native arguments to pass to the job runner.", required=false) + var jobNativeArgs: Seq[String] = Nil + + @Argument(fullName="job_resource_request", shortName="jobResReq", doc="Resource requests to pass to the job runner.", required=false) + var jobResourceRequests: Seq[String] = Nil + + @Argument(fullName="job_environment_name", shortName="jobEnv", doc="Environment names for the job runner.", required=false) + var jobEnvironmentNames: Seq[String] = Nil + + @Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes. 
If not set defaults to 2GB.", required=false) + @ClassType(classOf[Double]) + var memoryLimit: Option[Double] = Some(2) + + @Argument(fullName="memory_limit_threshold", shortName="memLimitThresh", doc="After passing this threshold stop increasing memory limit for jobs, in gigabytes.", required=false) + @ClassType(classOf[Double]) + var memoryLimitThreshold: Option[Double] = None + + @Argument(fullName="resident_memory_limit", shortName="resMemLimit", doc="Default resident memory limit for jobs, in gigabytes.", required=false) + @ClassType(classOf[Double]) + var residentLimit: Option[Double] = None + + @Argument(fullName="resident_memory_request", shortName="resMemReq", doc="Default resident memory request for jobs, in gigabytes.", required=false) + @ClassType(classOf[Double]) + var residentRequest: Option[Double] = None + + @Argument(fullName="resident_memory_request_parameter", shortName="resMemReqParam", doc="Parameter for resident memory requests. By default not requested.", required=false) + var residentRequestParameter: String = _ + + @Argument(fullName="job_walltime", shortName="wallTime", doc="Setting the required DRMAA walltime or LSF run limit.", required=false) + @ClassType(classOf[Long]) + var jobWalltime: Option[Long] = None + + /** The name of the parallel environment (required for SGE, for example) */ + @Argument(fullName="job_parallel_env", shortName="jobParaEnv", doc="An SGE style parallel environment to use for jobs requesting more than 1 core. Equivalent to submitting jobs with -pe ARG nt for jobs with nt > 1", required=false) + var parallelEnvironmentName: String = "smp_pe" // Broad default + + @Argument(fullName="dontRequestMultipleCores", shortName="multiCoreJerk", doc="If provided, Queue will not request multiple processors for jobs using multiple processors. 
Sometimes you eat the bear, sometimes the bear eats you.", required=false) + var dontRequestMultipleCores: Boolean = false + + @Argument(fullName="disableDefaultJavaGCOptimizations", shortName="noGCOpt", doc="If provided, Queue will not ensure that java GC threads are limited and that the a minimum amount of time is spent in GC.") + var disableDefaultJavaGCOptimizations = false + + @Argument(fullName="run_directory", shortName="runDir", doc="Root directory to run functions from.", required=false) + var runDirectory = new File(".") + + @Argument(fullName="temp_directory", shortName="tempDir", doc="Temp directory to pass to functions.", required=false) + var tempDirectory = new File(System.getProperty("java.io.tmpdir")) + + @Argument(fullName="job_scatter_gather_directory", shortName="jobSGDir", doc="Default directory to place scatter gather output for compute farm jobs.", required=false) + var jobScatterGatherDirectory: File = _ + + @Argument(fullName="log_directory", shortName="logDir", doc="Directory to write log files into.", required=false) + var logDirectory: File = _ +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/CommandLineJobManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/CommandLineJobManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala diff --git 
a/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLinePluginManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/CommandLinePluginManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/CommandLinePluginManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/CommandLinePluginManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/FunctionEdge.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/FunctionEdge.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/InProcessJobManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/InProcessJobManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/InProcessJobManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/InProcessJobManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/InProcessRunner.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/InProcessRunner.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/InProcessRunner.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/InProcessRunner.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/JobManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala rename to 
public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/JobManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunInfo.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/JobRunInfo.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/JobRunInfo.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/JobRunInfo.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunner.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/JobRunner.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/JobRunner.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/JobRunner.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/MappingEdge.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/MappingEdge.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/MappingEdge.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/MappingEdge.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QEdge.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QEdge.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/QEdge.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QEdge.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QGraph.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala rename to 
public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QGraph.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraphSettings.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QGraphSettings.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/QGraphSettings.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QGraphSettings.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QNode.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QNode.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/QNode.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QNode.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/RunnerStatus.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/RunnerStatus.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/RunnerStatus.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/RunnerStatus.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobManager.scala rename 
to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobManager.scala diff --git 
a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/pbsengine/PbsEngineJobManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/pbsengine/PbsEngineJobManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/pbsengine/PbsEngineJobManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/pbsengine/PbsEngineJobManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/pbsengine/PbsEngineJobRunner.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/pbsengine/PbsEngineJobRunner.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/pbsengine/PbsEngineJobRunner.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/pbsengine/PbsEngineJobRunner.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/shell/ShellJobManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/shell/ShellJobManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala similarity index 100% 
rename from public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala diff --git a/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala new file mode 100644 index 000000000..f580ba116 --- /dev/null +++ b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala @@ -0,0 +1,68 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.queue.extensions.gatk + +import org.broadinstitute.sting.queue.function.scattergather.GatherFunction +import org.broadinstitute.sting.queue.extensions.picard.MergeSamFiles +import org.broadinstitute.sting.queue.function.RetryMemoryLimit +import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor +import org.broadinstitute.sting.queue.util.ClassFieldCache +import java.io.File + +/** + * Merges BAM files using net.sf.picard.sam.MergeSamFiles. + */ +class BamGatherFunction extends MergeSamFiles with GatherFunction with RetryMemoryLimit { + this.assumeSorted = Some(true) + + override def freezeFieldValues() { + this.input = this.gatherParts + this.output = this.originalOutput + //Left to its own devices (ie, MergeSamFiles.freezeFieldValues), outputIndex + //will be in the gather directory. Ensure that it actually matches this.output + if (output != null) + outputIndex = new File(output.getParentFile, output.getName.stripSuffix(".bam") + ".bai") + + val originalGATK = originalFunction.asInstanceOf[CommandLineGATK] + + // Whatever the original function can handle, merging *should* do less. 
+ this.memoryLimit = originalFunction.memoryLimit + + // bam_compression and index_output_bam_on_the_fly from SAMFileWriterArgumentTypeDescriptor + // are added by the GATKExtensionsGenerator to the subclass of CommandLineGATK + + val compression = ClassFieldCache.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.COMPRESSION_FULLNAME) + this.compressionLevel = originalGATK.getFieldValue(compression).asInstanceOf[Option[Int]] + + val disableIndex = ClassFieldCache.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.DISABLE_INDEXING_FULLNAME) + this.createIndex = Some(!originalGATK.getFieldValue(disableIndex).asInstanceOf[Boolean]) + + val enableMD5 = ClassFieldCache.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.ENABLE_MD5_FULLNAME) + this.createMD5 = Some(originalGATK.getFieldValue(enableMD5).asInstanceOf[Boolean]) + + super.freezeFieldValues() + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/CatVariantsGatherer.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/CatVariantsGatherer.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/CatVariantsGatherer.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/CatVariantsGatherer.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/DistributedScatterFunction.scala 
b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/DistributedScatterFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/DistributedScatterFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/DistributedScatterFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala similarity index 100% 
rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ReadScatterFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/ReadScatterFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ReadScatterFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/ReadScatterFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala rename to 
public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala rename to 
public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/FastqToSam.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/FastqToSam.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/FastqToSam.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/FastqToSam.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala diff --git 
a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardMetricsFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/PicardMetricsFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardMetricsFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/PicardMetricsFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala similarity index 100% rename from 
public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsCommandLineFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsCommandLineFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsCommandLineFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsCommandLineFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala 
rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/CommandLineFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/CommandLineFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/InProcessFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/InProcessFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/InProcessFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/InProcessFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/ListWriterFunction.scala 
b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/ListWriterFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/ListWriterFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/ListWriterFunction.scala diff --git a/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/QFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/QFunction.scala new file mode 100644 index 000000000..9208c04f7 --- /dev/null +++ b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/QFunction.scala @@ -0,0 +1,517 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.queue.function + +import java.io.File +import java.lang.annotation.Annotation +import org.broadinstitute.sting.commandline._ +import org.broadinstitute.sting.queue.{QException, QSettings} +import java.lang.IllegalStateException +import org.broadinstitute.sting.queue.util._ +import org.broadinstitute.sting.utils.io.IOUtils +import scala.language.reflectiveCalls + +/** + * The base interface for all functions in Queue. + * Inputs and outputs are specified as Sets of values. + * Inputs are matched to other outputs by using .equals() + */ +trait QFunction extends Logging with QJobReport { + /** + * A short description of what this class of function does. + * By default does not include the output specific to this function. + * See shortDescription for a description of what this instance of the function outputs. + */ + var analysisName: String = "" + + /** + * The name name of the job, must be file system safe and unique to the graph. + * Defaults to "runName-". + * Use shortDescription for an alternative that is display friendly. + */ + var jobName: String = _ + + /** Default settings */ + var qSettings: QSettings = _ + + /** Directory to run the command in. */ + var commandDirectory: File = new File(".") + + /** Temporary directory to write any files. Must be network accessible. */ + var jobTempDir: File = null + + /** + * Local path available on all machines to store LOCAL temporary files. Not an @Input, + * nor an @Output. Currently only used for local intermediate files for composite jobs. + * Needs to be an annotated field so that it's mutated during cloning. + */ + @Argument(doc="Local path available on all machines to store LOCAL temporary files.") + var jobLocalDir: File = _ + + /** Order the function was added to the graph. 
*/ + var addOrder: Seq[Int] = Nil + + /** Job priority */ + var jobPriority: Option[Int] = None + + /** Whether a job is restartable */ + var jobRestartable = true + + /** + * A callback for modifying the run. + * NOTE: This function is for ADVANCED use only and is unsupported. + */ + var updateJobRun: PartialFunction[Any,Unit] = null + + /** + * If true, unless another unfinished function is dependent on this function, + * this function will NOT be run even if the outputs have not been created. + */ + var isIntermediate = false + + // ------------------------------------------------------- + // + // job run information + // + // ------------------------------------------------------- + + /** + * Copies settings from this function to another function. + * @param function QFunction to copy values to. + */ + override def copySettingsTo(function: QFunction) { + function.qSettings = this.qSettings + function.commandDirectory = this.commandDirectory + function.jobTempDir = this.jobTempDir + function.jobLocalDir = this.jobLocalDir + function.addOrder = this.addOrder + function.jobPriority = this.jobPriority + function.jobRestartable = this.jobRestartable + function.updateJobRun = this.updateJobRun + function.isIntermediate = this.isIntermediate + function.reportGroup = this.reportGroup + function.reportFeatures = this.reportFeatures + } + + /** File to redirect any output. Defaults to .out */ + var jobOutputFile: File = _ + + /** File to redirect any errors. Defaults to .out */ + var jobErrorFile: File = _ + + /** Errors (if any) from the last failed run of jobErrorFiles. */ + @Argument(doc="Job error lines", required=false) + var jobErrorLines: Seq[String] = Nil + + /** + * The number of times this function has previously been run. + */ + @Argument(doc="Job retries", required=false) + var retries = 0 + + /** Change settings for the next run. Retries will be set to the number of times the function was run and jobErrorLines may contain the error text. 
*/ + def setupRetry() { + } + + /** + * Description of this command line function. + */ + def description: String = "%s: %s > %s".format(analysisName, inputs, outputs) + + /** + * A short description of the function. + */ + def shortDescription = { + firstOutput match { + case file: File => analysisName + ": " + file.getName + case _ => analysisName + } + } + + /** + * The name of the job as submitted to the job runner + */ + def jobRunnerJobName = shortDescription + + /** + * Returns true if the function is done. + */ + def isDone: Boolean = { + val files = doneOutputs + if (files.size == 0) + throw new IllegalStateException("Function should have at least one output: " + analysisName) + files.forall(_.exists) + } + + /** + * Returns true if the function has failed. + */ + def isFail: Boolean = { + val files = failOutputs + if (files.size == 0) + throw new IllegalStateException("Function should have at least one output: " + analysisName) + files.exists(_.exists) + } + + /** + * Returns files to track for hidden done/fail files. + * @return Seq[String] files. + */ + protected def statusPaths = { + var paths = outputs + paths :+= jobOutputFile + if (jobErrorFile != null) + paths :+= jobErrorFile + paths + } + + /** + * Returns prefixes for hidden done/fail files. + * @return prefixes. + */ + private def statusPrefixes = statusPaths. + filter(file => !IOUtils.isSpecialFile(file)). + map(file => file.getParentFile + "/." + file.getName) + + /** + * Returns the output files for this function. + * @return outputs for this function. + */ + def doneOutputs: Seq[File] = statusPrefixes.map(path => new File(path + ".done")) + + /** + * Returns the output files for this function. + * @return outputs for this function. + */ + def failOutputs: Seq[File] = statusPrefixes.map(path => new File(path + ".fail")) + + /** The complete list of fields on this CommandLineFunction. 
*/ + def functionFields: Seq[ArgumentSource] = ClassFieldCache.classFunctionFields(this.functionFieldClass) + /** The @Input fields on this CommandLineFunction. */ + def inputFields: Seq[ArgumentSource] = ClassFieldCache.classInputFields(this.functionFieldClass) + /** The @Output fields on this CommandLineFunction. */ + def outputFields: Seq[ArgumentSource] = ClassFieldCache.classOutputFields(this.functionFieldClass) + /** The @Argument fields on this CommandLineFunction. */ + def argumentFields: Seq[ArgumentSource] = ClassFieldCache.classArgumentFields(this.functionFieldClass) + + /** + * Returns the class that should be used for looking up fields. + */ + protected def functionFieldClass = this.getClass + + /** + * Returns the input files for this function. + * @return inputs for this function. + */ + def inputs: Seq[File] = getFieldFiles(inputFields) + + /** + * Returns the output files for this function. + * @return outputs for this function. + */ + def outputs: Seq[File] = getFieldFiles(outputFields) + + /** + * Returns the first output file. + * @return first output for this function. + */ + def firstOutput: File = outputs.headOption.getOrElse(null) + + /** + * Returns the set of directories where files may be written. + */ + def outputDirectories = { + var dirs = Set.empty[File] + dirs += commandDirectory + dirs += jobTempDir + dirs += jobLocalDir + dirs += jobOutputFile.getParentFile + if (jobErrorFile != null) + dirs += jobErrorFile.getParentFile + dirs ++= outputs.map(_.getParentFile) + dirs + } + + /** + * Deletes the log files for this function. + */ + def deleteLogs() = { + IOUtils.tryDelete(jobOutputFile) + if (jobErrorFile != null) + IOUtils.tryDelete(jobErrorFile) + } + + /** + * Deletes the output files and all the status files for this function. 
+ */ + def deleteOutputs() { + outputs.filter(file => !IOUtils.isSpecialFile(file)).foreach(file => IOUtils.tryDelete(file)) + doneOutputs.foreach(file => IOUtils.tryDelete(file)) + failOutputs.foreach(file => IOUtils.tryDelete(file)) + } + + /** + * Creates the output directories for this function if it doesn't exist. + */ + def mkOutputDirectories() { + outputDirectories.foreach(dir => { + if (!dir.exists && !dir.mkdirs) + throw new QException("Unable to create directory: " + dir) + }) + } + + /** + * Returns fields that do not have values which are required. + * @return Seq[String] names of fields missing values. + */ + def missingFields: Seq[String] = { + val missingInputs = missingFields(inputFields, classOf[Input]) + val missingOutputs = missingFields(outputFields, classOf[Output]) + val missingArguments = missingFields(argumentFields, classOf[Argument]) + (missingInputs ++ missingOutputs ++ missingArguments).distinct.sorted + } + + /** + * Returns fields that do not have values which are required. + * @param sources Fields to check. + * @param annotation Annotation. + * @return names of fields missing values. + */ + private def missingFields(sources: Seq[ArgumentSource], annotation: Class[_ <: Annotation]): Seq[String] = { + var missing: Seq[String] = Nil + for (source <- sources) { + if (isRequired(source, annotation)) + if (!hasFieldValue(source)) + if (!exclusiveOf(source, annotation).exists(otherSource => hasFieldValue(otherSource))) + missing :+= "@%s: %s - %s".format(annotation.getSimpleName, source.field.getName, doc(source, annotation)) + } + missing + } + + /** + * Gets the files from the fields. The fields must be a File, a FileExtension, or a Seq or Set of either. + * @param fields Fields to get files. + * @return for the fields. 
+ */ + private def getFieldFiles(fields: Seq[ArgumentSource]): Seq[File] = { + var files: Seq[File] = Nil + for (field <- fields) + files ++= getFieldFiles(field) + files.distinct + } + + /** + * Gets the files from the field. The field must be a File, a FileExtension, or a Seq or Set of either. + * @param field Field to get files. + * @return for the field. + */ + def getFieldFiles(field: ArgumentSource): Seq[File] = { + var files: Seq[File] = Nil + CollectionUtils.foreach(getFieldValue(field), (fieldValue) => { + val file = fieldValueToFile(field, fieldValue) + if (file != null) + files :+= file + }) + files.distinct + } + + /** + * Gets the file from the field. The field must be a File or a FileExtension and not a Seq or Set. + * @param field Field to get the file. + * @return for the field. + */ + def getFieldFile(field: ArgumentSource): File = + fieldValueToFile(field, getFieldValue(field)) + + /** + * Converts the field value to a file. The field must be a File or a FileExtension. + * @param field Field to get the file. + * @param value Value of the File or FileExtension or null. + * @return Null if value is null, otherwise the File. + * @throws QException if the value is not a File or FileExtension. + */ + private def fieldValueToFile(field: ArgumentSource, value: Any): File = value match { + case file: File => file + case null => null + case unknown => throw new QException("Non-file found. Try removing the annotation, change the annotation to @Argument, or extend File with FileExtension: %s: %s".format(field.field, unknown)) + } + + /** + * After a function is frozen no more updates are allowed by the user. + * The function is allow to make necessary updates internally to make sure + * the inputs and outputs will be equal to other inputs and outputs. + */ + final def freeze() { + freezeFieldValues() + canonFieldValues() + } + + /** + * Sets all field values. 
+ */ + def freezeFieldValues() { + if (jobName == null) + jobName = qSettings.runName + "-" + this.addOrder.mkString("-") + + if (jobOutputFile == null) { + /*If the outputFile has been set to an absolute path, respect that. + Otherwise, place it in (possibly a subdirectory of) the log directory + The relative case is first as it's arguably the most common condition + */ + jobOutputFile = firstOutput match { + case file: File if !IOUtils.isSpecialFile(file) && !file.isAbsolute => + val logDir : File = if (file.getParentFile == null) qSettings.logDirectory else new File(qSettings.logDirectory, file.getParent) + new File(logDir, file.getName + ".out") + + case file: File if !IOUtils.isSpecialFile(file) && file.isAbsolute => + new File(file.getParentFile, file.getName + ".out") + + case _ => + new File(qSettings.logDirectory, jobName + ".out") + } + } + + if (jobTempDir == null) + jobTempDir = qSettings.tempDirectory + + if (jobLocalDir == null) + jobLocalDir = jobTempDir + + if (jobPriority.isEmpty) + jobPriority = qSettings.jobPriority + + // Do not set the temp and local dir relative to the command directory + jobTempDir = IOUtils.absolute(jobTempDir) + jobLocalDir = IOUtils.absolute(jobLocalDir) + + absoluteCommandDirectory() + } + + /** + * If the command directory is relative, insert the run directory ahead of it. + */ + def absoluteCommandDirectory() { + commandDirectory = IOUtils.absolute(qSettings.runDirectory, commandDirectory) + } + + /** + * Makes all field values canonical so that the graph can match the + * inputs of one function to the output of another using equals(). 
+ */ + def canonFieldValues() { + for (field <- this.functionFields) { + var fieldValue = this.getFieldValue(field) + fieldValue = CollectionUtils.updated(fieldValue, canon).asInstanceOf[AnyRef] + this.setFieldValue(field, fieldValue) + } + + this.jobOutputFile = canon(this.jobOutputFile).asInstanceOf[File] + if (this.jobErrorFile != null) + this.jobErrorFile = canon(this.jobErrorFile).asInstanceOf[File] + } + + /** + * Set value to a uniform value across functions. + * Base implementation changes any relative path to an absolute path. + * @param value to be updated + * @return the modified value, or a copy if the value is immutable + */ + protected def canon(value: Any) = { + value match { + case file: File => IOUtils.absolute(commandDirectory, file) + case x => x + } + } + + /** + * Scala sugar type for checking annotation required and exclusiveOf. + */ + private type ArgumentAnnotation = { + def required(): Boolean + def exclusiveOf(): String + def doc(): String + } + + /** + * Returns the isRequired value from the field. + * @param field Field to check. + * @param annotation Annotation. + * @return the isRequired value from the field annotation. + */ + private def isRequired(field: ArgumentSource, annotation: Class[_ <: Annotation]) = + ReflectionUtils.getAnnotation(field.field, annotation).asInstanceOf[ArgumentAnnotation].required() + + /** + * Returns an array of ArgumentSources from functionFields listed in the exclusiveOf of the original field + * @param field Field to check. + * @param annotation Annotation. + * @return the Array[ArgumentSource] that may be set instead of the field. 
+ */ + private def exclusiveOf(field: ArgumentSource, annotation: Class[_ <: Annotation]) = + ReflectionUtils.getAnnotation(field.field, annotation).asInstanceOf[ArgumentAnnotation].exclusiveOf() + .split(",").map(_.trim).filter(_.length > 0) + .map(fieldName => functionFields.find(fieldName == _.field.getName) match { + case Some(x) => x + case None => throw new QException("Unable to find exclusion field %s on %s".format(fieldName, this.getClass.getSimpleName)) + }) + + /** + * Returns the doc value from the field. + * @param field Field to check. + * @param annotation Annotation. + * @return the doc value from the field annotation. + */ + private def doc(field: ArgumentSource, annotation: Class[_ <: Annotation]) = + ReflectionUtils.getAnnotation(field.field, annotation).asInstanceOf[ArgumentAnnotation].doc() + + /** + * Returns true if the field has a value. + * @param source Field to check for a value. + * @return true if the field has a value. + */ + protected def hasFieldValue(source: ArgumentSource) = this.hasValue(this.getFieldValue(source)) + + /** + * Returns false if the value is null or an empty collection. + * @param param Value to test for null, or a collection to test if it is empty. + * @return false if the value is null, or false if the collection is empty, otherwise true. + */ + protected def hasValue(param: Any) = CollectionUtils.isNotNullOrNotEmpty(param) + + /** + * Gets the value of a field. + * @param source Field to get the value for. + * @return value of the field. + */ + def getFieldValue(source: ArgumentSource) = ClassFieldCache.getFieldValue(this, source) + + /** + * Gets the value of a field. + * @param source Field to set the value for. + * @return value of the field. 
+ */ + def setFieldValue(source: ArgumentSource, value: Any) = ClassFieldCache.setFieldValue(this, source, value) +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ConcatenateLogsFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/ConcatenateLogsFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ConcatenateLogsFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/ConcatenateLogsFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GatherFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/GatherFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GatherFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/GatherFunction.scala diff --git 
a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GathererFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/GathererFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GathererFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/GathererFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/ScatterFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/ScatterFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/SimpleTextGatherFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/SimpleTextGatherFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/scattergather/SimpleTextGatherFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/SimpleTextGatherFunction.scala diff --git 
a/public/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractSamples.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractSamples.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractSamples.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractSamples.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/SortByRef.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/SortByRef.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/library/ipf/SortByRef.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/SortByRef.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala similarity index 
100% rename from public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/ClassFieldCache.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/ClassFieldCache.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/CollectionUtils.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/CollectionUtils.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/CollectionUtils.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/CollectionUtils.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala 
b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/DoC/package.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/DoC/package.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/EmailMessage.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/EmailMessage.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/EmailSettings.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/EmailSettings.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/EmailSettings.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/EmailSettings.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/Logging.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/Logging.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/Logging.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/Logging.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala 
b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/QJobReport.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/QJobReport.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/QJobsReporter.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/QJobsReporter.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/QScriptUtils.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/QScriptUtils.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/ReflectionUtils.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/ReflectionUtils.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/RemoteFile.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/RemoteFile.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala 
b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/Retry.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/Retry.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/Retry.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/Retry.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/RetryException.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/RetryException.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/RetryException.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/RetryException.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ShellUtils.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/ShellUtils.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/ShellUtils.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/ShellUtils.scala diff --git 
a/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/StringFileConversions.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/StringFileConversions.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/SystemUtils.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/SystemUtils.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/SystemUtils.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/SystemUtils.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/TextFormatUtils.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/TextFormatUtils.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/TextFormatUtils.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/TextFormatUtils.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala rename to 
public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/function/CommandLineFunctionUnitTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/function/CommandLineFunctionUnitTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/function/CommandLineFunctionUnitTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/function/CommandLineFunctionUnitTest.scala diff --git a/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala new file mode 100644 index 000000000..2800ba2da --- /dev/null +++ b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala @@ -0,0 +1,259 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.queue.pipeline + +import org.broadinstitute.sting.utils.Utils +import org.testng.Assert +import org.broadinstitute.sting.commandline.CommandLineProgram +import java.util.Date +import java.text.SimpleDateFormat +import org.broadinstitute.sting.BaseTest +import org.broadinstitute.sting.MD5DB +import org.broadinstitute.sting.queue.{QScript, QCommandLine} +import org.broadinstitute.sting.queue.util.Logging +import java.io.{FilenameFilter, File} +import org.broadinstitute.sting.gatk.report.GATKReport +import org.apache.commons.io.FileUtils +import org.apache.commons.io.filefilter.WildcardFileFilter + +object PipelineTest extends BaseTest with Logging { + + private val validationReportsDataLocation = "/humgen/gsa-hpprojects/GATK/validationreports/submitted/" + private val md5DB = new MD5DB() + + /** + * All the job runners configured to run PipelineTests at The Broad. + */ + final val allJobRunners = Seq("Lsf706", "GridEngine", "Shell") + + /** + * The default job runners to run. + */ + final val defaultJobRunners = Seq("Lsf706", "GridEngine") + + /** + * Returns the top level output path to this test. + * @param testName The name of the test passed to PipelineTest.executeTest() + * @param jobRunner The name of the job manager to run the jobs. + * @return the top level output path to this test. + */ + def testDir(testName: String, jobRunner: String) = "pipelinetests/%s/%s/".format(testName, jobRunner) + + /** + * Returns the directory where relative output files will be written for this test. + * @param testName The name of the test passed to PipelineTest.executeTest() + * @param jobRunner The name of the job manager to run the jobs. 
+ * @return the directory where relative output files will be written for this test. + */ + private def runDir(testName: String, jobRunner: String) = testDir(testName, jobRunner) + "run/" + + /** + * Returns the directory where temp files will be written for this test. + * @param testName The name of the test passed to PipelineTest.executeTest() + * @param jobRunner The name of the job manager to run the jobs. + * @return the directory where temp files will be written for this test. + */ + private def tempDir(testName: String, jobRunner: String) = testDir(testName, jobRunner) + "temp/" + + /** + * Runs the pipelineTest. + * @param pipelineTest test to run. + */ + def executeTest(pipelineTest: PipelineTestSpec) { + var jobRunners = pipelineTest.jobRunners + if (jobRunners == null) + jobRunners = defaultJobRunners + jobRunners.foreach(executeTest(pipelineTest, _)) + } + + /** + * Runs the pipelineTest. + * @param pipelineTest test to run. + * @param jobRunner The name of the job manager to run the jobs. + */ + def executeTest(pipelineTest: PipelineTestSpec, jobRunner: String) { + // Reset the order of functions added to the graph. 
+ QScript.resetAddOrder() + + val name = pipelineTest.name + if (name == null) + Assert.fail("PipelineTestSpec.name is null") + println(Utils.dupString('-', 80)) + executeTest(name, pipelineTest.args, pipelineTest.jobQueue, pipelineTest.expectedException, jobRunner) + if (BaseTest.pipelineTestRunModeIsSet) { + assertMatchingMD5s(name, pipelineTest.fileMD5s.map{case (file, md5) => new File(runDir(name, jobRunner), file) -> md5}, pipelineTest.parameterize) + if (pipelineTest.evalSpec != null) + validateEval(name, pipelineTest.evalSpec, jobRunner) + for (path <- pipelineTest.expectedFilePaths) + assertPathExists(runDir(name, jobRunner), path) + for (path <- pipelineTest.unexpectedFilePaths) + assertPathDoesNotExist(runDir(name, jobRunner), path) + println(" => %s PASSED (%s)".format(name, jobRunner)) + } + else + println(" => %s PASSED DRY RUN (%s)".format(name, jobRunner)) + } + + private def assertMatchingMD5s(name: String, fileMD5s: Traversable[(File, String)], parameterize: Boolean) { + var failed = 0 + for ((file, expectedMD5) <- fileMD5s) { + val calculatedMD5 = md5DB.testFileMD5(name, "", file, expectedMD5, parameterize).actualMD5 + if (!parameterize && expectedMD5 != "" && expectedMD5 != calculatedMD5) + failed += 1 + } + if (failed > 0) + Assert.fail("%d of %d MD5s did not match".format(failed, fileMD5s.size)) + } + + private def validateEval(name: String, evalSpec: PipelineTestEvalSpec, jobRunner: String) { + // write the report to the shared validation data location + val formatter = new SimpleDateFormat("yyyy.MM.dd.HH.mm.ss") + val reportLocation = "%s%s/%s/validation.%s.eval".format(validationReportsDataLocation, jobRunner, name, formatter.format(new Date)) + val reportFile = new File(reportLocation) + + FileUtils.copyFile(new File(runDir(name, jobRunner) + evalSpec.evalReport), reportFile) + + val report = new GATKReport(reportFile) + + var allInRange = true + + println() + println(name + " validation values:") + println(" value (min,target,max) table 
key metric") + for (validation <- evalSpec.validations) { + val table = report.getTable(validation.table) + val key = table.findRowByData(validation.table +: validation.key.split('.') : _*) + val value = String.valueOf(table.get(key, validation.metric)) + val inRange = if (value == null) false else validation.inRange(value) + val flag = if (!inRange) "*" else " " + println(" %s %s (%s,%s,%s) %s %s %s".format(flag, value, validation.min, validation.target, validation.max, validation.table, validation.key, validation.metric)) + allInRange &= inRange + } + + if (!allInRange) + Assert.fail("Eval outside of expected range") + } + + /** + * execute the test + * @param name the name of the test + * @param args the argument list + * @param jobQueue the queue to run the job on. Defaults to hour if jobQueue is null. + * @param expectedException the expected exception or null if no exception is expected. + * @param jobRunner The name of the job manager to run the jobs. + */ + private def executeTest(name: String, args: String, jobQueue: String, expectedException: Class[_], jobRunner: String) { + var command = Utils.escapeExpressions(args) + + // add the logging level to each of the integration test commands + + command = Utils.appendArray(command, "-jobRunner", jobRunner, + "-tempDir", tempDir(name, jobRunner), "-runDir", runDir(name, jobRunner)) + + if (jobQueue != null) + command = Utils.appendArray(command, "-jobQueue", jobQueue) + + if (BaseTest.pipelineTestRunModeIsSet) + command = Utils.appendArray(command, "-run") + + // run the executable + var gotAnException = false + + val instance = new QCommandLine + runningCommandLines += instance + try { + println("Executing test %s with Queue arguments: %s".format(name, Utils.join(" ",command))) + CommandLineProgram.start(instance, command) + } catch { + case e: Exception => + gotAnException = true + if (expectedException != null) { + // we expect an exception + println("Wanted exception %s, saw %s".format(expectedException, 
e.getClass)) + if (expectedException.isInstance(e)) { + // it's the type we expected + println(String.format(" => %s PASSED (%s)", name, jobRunner)) + } else { + e.printStackTrace() + Assert.fail("Test %s expected exception %s but got %s instead (%s)".format( + name, expectedException, e.getClass, jobRunner)) + } + } else { + // we didn't expect an exception but we got one :-( + throw new RuntimeException(e) + } + } finally { + instance.shutdown() + runningCommandLines -= instance + } + + // catch failures from the integration test + if (expectedException != null) { + if (!gotAnException) + // we expected an exception but didn't see it + Assert.fail("Test %s expected exception %s but none was thrown (%s)".format(name, expectedException.toString, jobRunner)) + } else { + if (CommandLineProgram.result != 0) + throw new RuntimeException("Error running Queue with arguments: " + args) + } + } + + private def assertPathExists(runDir: String, path: String) { + val orig = new File(runDir, path) + var dir = orig.getParentFile + if (dir == null) + dir = new File(".") + Assert.assertTrue(dir.exists, "Missing directory: " + dir.getAbsolutePath) + val filter: FilenameFilter = new WildcardFileFilter(orig.getName) + Assert.assertNotEquals(dir.listFiles(filter).length, 0, "Missing file: " + orig.getAbsolutePath) + } + + private def assertPathDoesNotExist(runDir: String, path: String) { + val orig = new File(runDir, path) + var dir = orig.getParentFile + if (dir == null) + dir = new File(".") + if (dir.exists) { + val filter: FilenameFilter = new WildcardFileFilter(orig.getName) + Assert.assertEquals(dir.listFiles(filter).length, 0, + "Found unexpected file(s): " + dir.listFiles().map(_.getAbsolutePath).mkString(", ")) + } + } + + private var runningCommandLines = Set.empty[QCommandLine] + + Runtime.getRuntime.addShutdownHook(new Thread { + /** Cleanup as the JVM shuts down. 
*/ + override def run() { + runningCommandLines.foreach(commandLine => + try { + commandLine.shutdown() + } catch { + case _: Throwable => /* ignore */ + }) + } + }) +} diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestEvalSpec.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/PipelineTestEvalSpec.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestEvalSpec.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/PipelineTestEvalSpec.scala diff --git a/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala new file mode 100644 index 000000000..3dc761382 --- /dev/null +++ b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala @@ -0,0 +1,68 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.queue.pipeline + +class PipelineTestSpec(var name: String = null) { + + /** The arguments to pass to the Queue test, ex: "-S scala/qscript/examples/HelloWorld.scala" */ + var args: String = _ + + /** Job Queue to run the test. Default is null which means use hour. */ + var jobQueue: String = _ + + /** Job runners to run the test. Default is null which means use the default. */ + var jobRunners: Seq[String] = _ + + /** Expected MD5 results for each file path. */ + var fileMD5s = Map.empty[String, String] + + /** VariantEval validations to run on a VCF after the pipeline has completed. */ + var evalSpec: PipelineTestEvalSpec = _ + + /** Expected exception from the test. */ + var expectedException: Class[_ <: Exception] = null + + /** Expected files. The file name may contain wildcards acceptable by the WildcardFileFilter. */ + var expectedFilePaths: Seq[String] = Seq.empty + + /** Unexpected files. The file name may contain wildcards acceptable by the WildcardFileFilter. */ + var unexpectedFilePaths: Seq[String] = Seq.empty + + /** If true will check the MD5s without failing. 
*/ + var parameterize = false + + def this(args: String, fileMD5s: Traversable[(String, String)]) = { + this() + this.args = args + this.fileMD5s = fileMD5s.toMap + } + + def this(args: String, expectedException: Class[_ <: Exception]) = { + this() + this.args = args + this.expectedException = expectedException + } +} diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala diff --git a/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExamplePrintReadsPipelineTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExamplePrintReadsPipelineTest.scala new file mode 100644 index 000000000..b9964d187 --- /dev/null +++ b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExamplePrintReadsPipelineTest.scala @@ -0,0 +1,83 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the 
"Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.queue.pipeline.examples + +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +import org.testng.annotations.Test +import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} +import org.broadinstitute.sting.BaseTest + +class ExamplePrintReadsPipelineTest { + @Test(timeOut=36000000) + def testDevNullOutput() { + val spec = new PipelineTestSpec + spec.name = "devnulloutput" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExamplePrintReads.scala", + " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta", + " -I " + BaseTest.publicTestDir + "exampleBAM.bam", + " -out /dev/null").mkString + spec.jobRunners = PipelineTest.allJobRunners + PipelineTest.executeTest(spec) + } + + @Test(timeOut=36000000) + def testCleanupBai() { + val spec = new PipelineTestSpec + spec.name = "cleanupbai" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExamplePrintReads.scala", + " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta", + " -I " + BaseTest.publicTestDir + "exampleBAM.bam", + " -out exampleOut.bam").mkString + spec.jobRunners = PipelineTest.allJobRunners + spec.unexpectedFilePaths :+= ".queue/scatterGather/ExamplePrintReads-1-sg/temp_1_of_1/exampleOut.bai" + PipelineTest.executeTest(spec) + } +} diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala rename to 
public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleRetryMemoryLimitPipelineTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleRetryMemoryLimitPipelineTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleRetryMemoryLimitPipelineTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleRetryMemoryLimitPipelineTest.scala diff --git a/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala new file mode 100644 index 000000000..b054164a1 --- /dev/null +++ b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala @@ -0,0 +1,111 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.queue.pipeline.examples + +import org.testng.annotations.{DataProvider, Test} +import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} +import org.broadinstitute.sting.BaseTest + +class ExampleUnifiedGenotyperPipelineTest { + @Test(timeOut=36000000) + def testUnifiedGenotyper() { + val spec = new PipelineTestSpec + spec.name = "unifiedgenotyper" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", + " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta", + " -I " + BaseTest.publicTestDir + "exampleBAM.bam", + " -filter QD", + " -filterExpression 'QD < 2.0'").mkString + spec.jobRunners = PipelineTest.allJobRunners + PipelineTest.executeTest(spec) + } + + @DataProvider(name = "ugIntervals") + def getUnifiedGenotyperIntervals = + Array( + Array("gatk_intervals", BaseTest.validationDataLocation + "intervalTest.intervals"), + Array("bed_intervals", BaseTest.validationDataLocation + "intervalTest.bed"), + Array("vcf_intervals", BaseTest.validationDataLocation + "intervalTest.1.vcf") + ).asInstanceOf[Array[Array[Object]]] + + @Test(dataProvider = "ugIntervals", timeOut=36000000) + def testUnifiedGenotyperWithIntervals(intervalsName: String, intervalsPath: String) { + val spec = new PipelineTestSpec + spec.name = "unifiedgenotyper_with_" + intervalsName + spec.args = Array( + " -S 
public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", + " -I " + BaseTest.validationDataLocation + "OV-0930.normal.chunk.bam", + " -R " + BaseTest.hg18Reference, + " -L " + intervalsPath).mkString + spec.jobRunners = Seq("Lsf706") + PipelineTest.executeTest(spec) + } + + @Test(timeOut=36000000) + def testUnifiedGenotyperNoGCOpt() { + val spec = new PipelineTestSpec + spec.name = "unifiedgenotyper_no_gc_opt" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", + " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta", + " -I " + BaseTest.publicTestDir + "exampleBAM.bam", + " -noGCOpt").mkString + spec.jobRunners = PipelineTest.allJobRunners + PipelineTest.executeTest(spec) + } + + @DataProvider(name="resMemReqParams") + def getResMemReqParam = Array(Array("mem_free"), Array("virtual_free")).asInstanceOf[Array[Array[Object]]] + + @Test(dataProvider = "resMemReqParams", timeOut=36000000) + def testUnifiedGenotyperResMemReqParam(reqParam: String) { + val spec = new PipelineTestSpec + spec.name = "unifiedgenotyper_" + reqParam + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", + " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta", + " -I " + BaseTest.publicTestDir + "exampleBAM.bam", + " -resMemReqParam " + reqParam).mkString + spec.jobRunners = Seq("GridEngine") + PipelineTest.executeTest(spec) + } + + @Test(timeOut=36000000) + def testUnifiedGenotyperLogDirectory() { + val spec = new PipelineTestSpec + spec.name = "unifiedgenotyper_with_log_directory" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", + " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta", + " -I " + BaseTest.publicTestDir + "exampleBAM.bam", + " -logDir exampleUGLogDir").mkString + spec.jobRunners = 
PipelineTest.allJobRunners + spec.expectedFilePaths :+= "exampleUGLogDir/exampleBAM.unfiltered.vcf.out" + spec.expectedFilePaths :+= "exampleUGLogDir/exampleBAM.unfiltered.eval.out" + PipelineTest.executeTest(spec) + } +} diff --git a/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala new file mode 100644 index 000000000..0f645cb2a --- /dev/null +++ b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala @@ -0,0 +1,152 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.queue.pipeline.examples + +import org.testng.annotations.Test +import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} + +class HelloWorldPipelineTest { + @Test(timeOut=36000000) + def testHelloWorld() { + val spec = new PipelineTestSpec + spec.name = "HelloWorld" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + spec.jobRunners = PipelineTest.allJobRunners + PipelineTest.executeTest(spec) + } + + @Test(timeOut=36000000) + def testHelloWorldWithRunName() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithRunName" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -runName HelloWorld" + spec.jobRunners = PipelineTest.allJobRunners + PipelineTest.executeTest(spec) + } + + @Test(timeOut=36000000) + def testHelloWorldWithMemoryLimit() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldMemoryLimit" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -memLimit 1.25" + spec.jobRunners = PipelineTest.allJobRunners + PipelineTest.executeTest(spec) + } + + @Test(timeOut=36000000) + def testHelloWorldWithPriority() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithPriority" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobPriority 100" + spec.jobRunners = PipelineTest.allJobRunners + PipelineTest.executeTest(spec) + } + + @Test(timeOut=36000000) + def testHelloWorldWithLsfResource() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithLsfResource" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobResReq rusage[iodine_io=1] -jobResReq select[swp>0] -jobResReq order[swp]" + spec.jobRunners = Seq("Lsf706") + PipelineTest.executeTest(spec) + } + + 
@Test(timeOut=36000000) + def testHelloWorldWithLsfResourceAndMemoryLimit() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithLsfResourceAndMemoryLimit" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -memLimit 1.25 -jobResReq rusage[iodine_io=1] -jobResReq select[swp>0] -jobResReq order[swp]" + spec.jobRunners = Seq("Lsf706") + PipelineTest.executeTest(spec) + } + + @Test(timeOut=36000000) + def testHelloWorldWithLsfEnvironment() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithLsfEnvironment" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobEnv tv" + spec.jobRunners = Seq("Lsf706") + PipelineTest.executeTest(spec) + } + + @Test(timeOut=36000000) + def testHelloWorldWithGridEngineResource() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithGridEngineResource" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobResReq s_core=1000M" + spec.jobRunners = Seq("GridEngine") + PipelineTest.executeTest(spec) + } + + @Test(timeOut=36000000) + def testHelloWorldWithGridEngineResourceAndMemoryLimit() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithGridEngineResourceAndMemoryLimit" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -memLimit 1.25 -jobResReq s_core=1000M" + spec.jobRunners = Seq("GridEngine") + PipelineTest.executeTest(spec) + } + + @Test(timeOut=36000000) + def testHelloWorldWithGridEngineEnvironment() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithGridEngineEnvironment" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobEnv \"make 1\"" + spec.jobRunners = Seq("GridEngine") + PipelineTest.executeTest(spec) + } + + // disabled because our DRMAA implementation 
doesn't support wallTime + @Test(enabled=false, timeOut=36000000) + def testHelloWorldWithWalltime() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithWalltime" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -wallTime 100" + spec.jobRunners = PipelineTest.allJobRunners + PipelineTest.executeTest(spec) + } + + @Test(timeOut=36000000) + def testHelloWorldWithLogDirectory() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithLogDirectory" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -logDir pipelineLogDir" + spec.jobRunners = PipelineTest.allJobRunners + spec.expectedFilePaths = Seq("pipelineLogDir/HelloWorld-1.out") + PipelineTest.executeTest(spec) + } +} diff --git a/public/scala/test/org/broadinstitute/sting/queue/util/ShellUtilsUnitTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/util/ShellUtilsUnitTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/util/ShellUtilsUnitTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/util/ShellUtilsUnitTest.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/util/SystemUtilsUnitTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/util/SystemUtilsUnitTest.scala similarity index 100% rename from 
public/scala/test/org/broadinstitute/sting/queue/util/SystemUtilsUnitTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/util/SystemUtilsUnitTest.scala diff --git a/public/queue-package/pom.xml b/public/queue-package/pom.xml new file mode 100644 index 000000000..591f2c5bd --- /dev/null +++ b/public/queue-package/pom.xml @@ -0,0 +1,309 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-aggregator + 3.0 + ../.. + + + queue-package + jar + Queue Package + + + ${project.basedir}/../.. + prepare-package + package + Queue + org.broadinstitute.sting.queue.QCommandLine + + + + + + ${project.groupId} + queue-framework + ${project.version} + + + ${project.groupId} + gatk-package + ${project.version} + + + + org.scala-lang + scala-library + + + org.scala-lang + scala-compiler + + + + net.sf + picard + + + + javax.mail + mail + + + + net.java.dev.jna + jna + + + + com.google.code.cofoja + cofoja + + + + net.sf.snpeff + snpeff + + + + ${project.groupId} + gatk-framework + ${project.version} + example-resources + tar.bz2 + + + ${project.groupId} + queue-framework + ${project.version} + example-resources + tar.bz2 + + + + ${project.groupId} + queue-framework + ${project.version} + test-jar + test + + + + ${project.groupId} + gatk-framework + ${project.version} + test-jar + test + + + + org.testng + testng + test + + + + com.google.caliper + caliper + test + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + + unit-tests + + ${sting.serialunittests.skipped} + + org.broadinstitute.sting:.* + + + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + + + integration-tests + + ${sting.serialintegrationtests.skipped} + + org.broadinstitute.sting:.* + + + + + pipeline-tests + + ${sting.serialpipelinetests.skipped} + + org.broadinstitute.sting:.* + + + + + large-scale-tests + + ${sting.seriallargescaletests.skipped} + + org.broadinstitute.sting:.* + + + + + knowledge-base-tests + + ${sting.serialknowledgebasetests.skipped} + + 
org.broadinstitute.sting:.* + + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + unpack-direct-dependencies + ${sting.unpack.phase} + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + sting-executable + ${sting.shade.phase} + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + binary-dist + ${sting.shade.phase} + + + + + + com.pyx4j + maven-junction-plugin + + + link-binary-jar + ${sting.shade.phase} + + + link-git-release + ${sting.shade.phase} + + + + + + org.apache.maven.plugins + maven-install-plugin + + + default-install + none + + + install-package + install + + + + + + + + + + private + + + ${basedir}/../../private/queue-private/pom.xml + + + + + ${project.groupId} + queue-private + ${project.version} + true + + + ${project.groupId} + queue-private + ${project.version} + test-jar + test + true + + + + + + + com.pyx4j + maven-junction-plugin + + + link-private-testdata + process-test-resources + + + unlink-private-testdata + clean + + + link-private-qscript + process-test-resources + + + unlink-private-qscript + clean + + + + + + + + packagetests-enabled + + + sting.packagetests.enabled + true + + + + none + none + + + + + diff --git a/public/queue-package/src/main/assembly/binary-dist.xml b/public/queue-package/src/main/assembly/binary-dist.xml new file mode 100644 index 000000000..6de236a56 --- /dev/null +++ b/public/queue-package/src/main/assembly/binary-dist.xml @@ -0,0 +1,23 @@ + + binary-dist + + tar.bz2 + + false + + + + org.broadinstitute.sting:queue-package + + ${sting.binary-dist.name}.${artifact.extension} + + + resources + true + + org.broadinstitute.sting:gatk-framework:tar.bz2:example-resources + org.broadinstitute.sting:queue-framework:tar.bz2:example-resources + + + + diff --git a/settings/repository/com.google.code.cofoja/cofoja-1.0-r139.jar b/public/repo/com/google/code/cofoja/cofoja/1.0-r139/cofoja-1.0-r139.jar similarity index 100% rename from 
settings/repository/com.google.code.cofoja/cofoja-1.0-r139.jar rename to public/repo/com/google/code/cofoja/cofoja/1.0-r139/cofoja-1.0-r139.jar diff --git a/public/repo/com/google/code/cofoja/cofoja/1.0-r139/cofoja-1.0-r139.pom b/public/repo/com/google/code/cofoja/cofoja/1.0-r139/cofoja-1.0-r139.pom new file mode 100644 index 000000000..5a6fb69b9 --- /dev/null +++ b/public/repo/com/google/code/cofoja/cofoja/1.0-r139/cofoja-1.0-r139.pom @@ -0,0 +1,9 @@ + + 4.0.0 + com.google.code.cofoja + cofoja + cofoja + 1.0-r139 + diff --git a/public/repo/net/sf/picard/1.107.1683/picard-1.107.1683.jar b/public/repo/net/sf/picard/1.107.1683/picard-1.107.1683.jar new file mode 100644 index 000000000..089b71385 Binary files /dev/null and b/public/repo/net/sf/picard/1.107.1683/picard-1.107.1683.jar differ diff --git a/public/repo/net/sf/picard/1.107.1683/picard-1.107.1683.pom b/public/repo/net/sf/picard/1.107.1683/picard-1.107.1683.pom new file mode 100644 index 000000000..fd8a61917 --- /dev/null +++ b/public/repo/net/sf/picard/1.107.1683/picard-1.107.1683.pom @@ -0,0 +1,44 @@ + + + 4.0.0 + net.sf + picard + 1.107.1683 + picard + + + net.sf + sam + 1.107.1683 + + + org.broadinstitute + variant + 1.107.1683 + + + org.broad + tribble + 1.107.1683 + + + + org.apache.ant + ant + 1.8.2 + + + org.apache.ant + ant-launcher + + + + + com.sun + tools.jar + 1.5 + system + ${java.home}/../lib/tools.jar + + + diff --git a/public/repo/net/sf/sam/1.107.1683/sam-1.107.1683.jar b/public/repo/net/sf/sam/1.107.1683/sam-1.107.1683.jar new file mode 100644 index 000000000..928838707 Binary files /dev/null and b/public/repo/net/sf/sam/1.107.1683/sam-1.107.1683.jar differ diff --git a/public/repo/net/sf/sam/1.107.1683/sam-1.107.1683.pom b/public/repo/net/sf/sam/1.107.1683/sam-1.107.1683.pom new file mode 100644 index 000000000..89114f546 --- /dev/null +++ b/public/repo/net/sf/sam/1.107.1683/sam-1.107.1683.pom @@ -0,0 +1,21 @@ + + + 4.0.0 + net.sf + sam + 1.107.1683 + sam-jdk + + + org.testng + testng + 
5.5 + jdk15 + + + org.xerial.snappy + snappy-java + 1.0.3-rc3 + + + diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.5.jar b/public/repo/net/sf/snpeff/snpeff/2.0.5/snpeff-2.0.5.jar similarity index 100% rename from settings/repository/net.sf.snpeff/snpeff-2.0.5.jar rename to public/repo/net/sf/snpeff/snpeff/2.0.5/snpeff-2.0.5.jar diff --git a/public/repo/net/sf/snpeff/snpeff/2.0.5/snpeff-2.0.5.pom b/public/repo/net/sf/snpeff/snpeff/2.0.5/snpeff-2.0.5.pom new file mode 100644 index 000000000..d316e2055 --- /dev/null +++ b/public/repo/net/sf/snpeff/snpeff/2.0.5/snpeff-2.0.5.pom @@ -0,0 +1,9 @@ + + 4.0.0 + net.sf + snpeff + snpeff + 2.0.5 + diff --git a/public/repo/org/broad/tribble/1.107.1683/tribble-1.107.1683.jar b/public/repo/org/broad/tribble/1.107.1683/tribble-1.107.1683.jar new file mode 100644 index 000000000..efa04ad2c Binary files /dev/null and b/public/repo/org/broad/tribble/1.107.1683/tribble-1.107.1683.jar differ diff --git a/public/repo/org/broad/tribble/1.107.1683/tribble-1.107.1683.pom b/public/repo/org/broad/tribble/1.107.1683/tribble-1.107.1683.pom new file mode 100644 index 000000000..7bf169bd4 --- /dev/null +++ b/public/repo/org/broad/tribble/1.107.1683/tribble-1.107.1683.pom @@ -0,0 +1,15 @@ + + + 4.0.0 + org.broad + tribble + 1.107.1683 + tribble + + + net.sf + sam + 1.107.1683 + + + diff --git a/public/repo/org/broadinstitute/variant/1.107.1683/variant-1.107.1683.jar b/public/repo/org/broadinstitute/variant/1.107.1683/variant-1.107.1683.jar new file mode 100644 index 000000000..ea4ebe35e Binary files /dev/null and b/public/repo/org/broadinstitute/variant/1.107.1683/variant-1.107.1683.jar differ diff --git a/public/repo/org/broadinstitute/variant/1.107.1683/variant-1.107.1683.pom b/public/repo/org/broadinstitute/variant/1.107.1683/variant-1.107.1683.pom new file mode 100644 index 000000000..256963812 --- /dev/null +++ b/public/repo/org/broadinstitute/variant/1.107.1683/variant-1.107.1683.pom @@ -0,0 +1,31 @@ + + + 4.0.0 + 
org.broadinstitute + variant + 1.107.1683 + variant + + + org.broad + tribble + 1.107.1683 + + + net.sf + sam + 1.107.1683 + + + org.apache.commons + commons-jexl + 2.1.1 + + + + com.google.code.cofoja + cofoja + 1.0-r139 + + + diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala deleted file mode 100644 index 7dd771873..000000000 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala +++ /dev/null @@ -1,532 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.queue.qscripts.CNV - -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.util.VCF_BAM_utilities -import org.broadinstitute.sting.queue.util.DoC._ -import org.broadinstitute.sting.commandline.Hidden -import java.io.{PrintStream, PrintWriter} -import org.broadinstitute.sting.utils.text.XReadLines -import collection.JavaConversions._ -import org.broadinstitute.sting.gatk.walkers.coverage.CoverageUtils - -class xhmmCNVpipeline extends QScript { - qscript => - - @Input(doc = "bam input, as .bam or as a list of files", shortName = "I", required = true) - var bams: File = _ - - @Input(doc = "gatk jar file", shortName = "J", required = true) - var gatkJarFile: File = _ - - @Input(doc = "xhmm executable file", shortName = "xhmmExec", required = true) - var xhmmExec: File = _ - - @Input(doc = "Plink/Seq executable file", shortName = "pseqExec", required = true) - var pseqExec: File = _ - - @Argument(doc = "Plink/Seq SEQDB file (Reference genome sequence)", shortName = "SEQDB", required = true) - var pseqSeqDB: String = _ - - @Input(shortName = "R", doc = "ref", required = true) - var referenceFile: File = _ - - @Input(shortName = "L", doc = "Intervals", required = false) - var intervals: File = _ - - @Argument(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false) - var scatterCountInput = 0 - - @Argument(doc = "Samples to run together for DoC. 
By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false) - var samplesPerJob = 1 - - @Output(doc = "Base name for files to output", shortName = "o", required = true) - var outputBase: File = _ - - @Hidden - @Argument(doc = "How should overlapping reads from the same fragment be handled?", shortName = "countType", required = false) - var countType = CoverageUtils.CountPileupType.COUNT_FRAGMENTS - - @Argument(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false) - var MAX_DEPTH = 20000 - - @Hidden - @Argument(doc = "Number of read-depth bins", shortName = "NUM_BINS", required = false) - var NUM_BINS = 200 - - @Hidden - @Argument(doc = "Starting value of read-depth bins", shortName = "START_BIN", required = false) - var START_BIN = 1 - - @Argument(doc = "Minimum read mapping quality", shortName = "MMQ", required = false) - var minMappingQuality = 0 - - @Argument(doc = "Minimum base quality to be counted in depth", shortName = "MBQ", required = false) - var minBaseQuality = 0 - - @Argument(doc = "Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false) - var wholeMatrixMemory = -1 - - @Argument(shortName = "minTargGC", doc = "Exclude all targets with GC content less than this value", required = false) - var minTargGC : Double = 0.1 - - @Argument(shortName = "maxTargGC", doc = "Exclude all targets with GC content greater than this value", required = false) - var maxTargGC : Double = 0.9 - - @Argument(shortName = "minTargRepeats", doc = "Exclude all targets with % of repeat-masked bases less than this value", required = false) - var minTargRepeats : Double = 0.0 - - @Argument(shortName = "maxTargRepeats", doc = "Exclude all targets with % of repeat-masked bases greater than this value", required = false) - var maxTargRepeats : Double = 0.1 - - @Argument(shortName = "sampleIDsMap", doc = "File mapping BAM sample IDs to desired 
sample IDs", required = false) - var sampleIDsMap: String = "" - - @Argument(shortName = "sampleIDsMapFromColumn", doc = "Column number of OLD sample IDs to map", required = false) - var sampleIDsMapFromColumn = 1 - - @Argument(shortName = "sampleIDsMapToColumn", doc = "Column number of NEW sample IDs to map", required = false) - var sampleIDsMapToColumn = 2 - - @Argument(shortName = "rawFilters", doc = "xhmm command-line parameters to filter targets and samples from raw data", required = false) - var targetSampleFiltersString: String = "" - - @Argument(shortName = "PCAnormalize", doc = "xhmm command-line parameters to Normalize data using PCA information", required = false) - var PCAnormalizeMethodString: String = "" - - @Argument(shortName = "normalizedFilters", doc = "xhmm command-line parameters to filter targets and samples from PCA-normalized data", required = false) - var targetSampleNormalizedFiltersString: String = "" - - @Argument(shortName = "xhmmParams", doc = "xhmm model parameters file", required = true) - var xhmmParamsArg: File = _ - - @Argument(shortName = "discoverParams", doc = "xhmm command-line parameters for discovery step", required = false) - var discoverCommandLineParams: String = "" - - @Argument(shortName = "genotypeParams", doc = "xhmm command-line parameters for genotyping step", required = false) - var genotypeCommandLineParams: String = "" - - @Argument(shortName = "genotypeSubsegments", doc = "Should we also genotype all subsegments of the discovered CNV?", required = false) - var genotypeSubsegments: Boolean = false - - @Argument(shortName = "maxTargetsInSubsegment", doc = "If genotypeSubsegments, then only consider sub-segments consisting of this number of targets or fewer", required = false) - var maxTargetsInSubsegment = 30 - - @Argument(shortName = "subsegmentGenotypeThreshold", doc = "If genotypeSubsegments, this is the default genotype quality threshold for the sub-segments", required = false) - var subsegmentGenotypeThreshold 
= 20.0 - - @Argument(shortName = "longJobQueue", doc = "Job queue to run the 'long-running' commands", required = false) - var longJobQueue: String = "" - - - val PREPARED_TARGS_SUFFIX: String = ".merged.interval_list" - - val RD_OUTPUT_SUFFIX: String = ".RD.txt" - - val TARGS_GC_SUFFIX = ".locus_GC.txt" - val EXTREME_GC_TARGS_SUFFIX = ".extreme_gc_targets.txt" - - val TARGS_REPEAT_COMPLEXITY_SUFFIX = ".locus_complexity.txt" - val EXTREME_REPEAT_COMPLEXITY_SUFFIX = ".extreme_complexity_targets.txt" - - val FILTERED_TARGS_SUFFIX: String = ".filtered_targets.txt" - val FILTERED_SAMPS_SUFFIX: String = ".filtered_samples.txt" - - - trait WholeMatrixMemoryLimit extends CommandLineFunction { - // Since loading ALL of the data can take significant memory: - if (wholeMatrixMemory < 0) { - this.memoryLimit = 24 - } - else { - this.memoryLimit = wholeMatrixMemory - } - } - - trait LongRunTime extends CommandLineFunction { - if (longJobQueue != "") - this.jobQueue = longJobQueue - } - - def script = { - val prepTargets = new PrepareTargets(List(qscript.intervals), outputBase.getPath + PREPARED_TARGS_SUFFIX, xhmmExec, referenceFile) - add(prepTargets) - - trait CommandLineGATKArgs extends CommandLineGATK { - this.intervals :+= prepTargets.out - this.jarFile = qscript.gatkJarFile - this.reference_sequence = qscript.referenceFile - this.logging_level = "INFO" - } - - val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = VCF_BAM_utilities.getMapOfBAMsForSample(VCF_BAM_utilities.parseBAMsInput(bams)) - val samples: List[String] = sampleToBams.keys.toList - Console.out.printf("Samples are %s%n", samples) - - val groups: List[Group] = buildDoCgroups(samples, sampleToBams, samplesPerJob, outputBase) - var docs: List[DoC] = List[DoC]() - for (group <- groups) { - Console.out.printf("Group is %s%n", group) - docs ::= new DoC(group.bams, group.DoC_output, countType, MAX_DEPTH, minMappingQuality, minBaseQuality, scatterCountInput, START_BIN, 
NUM_BINS, Nil) with CommandLineGATKArgs - } - addAll(docs) - - val mergeDepths = new MergeGATKdepths(docs.map(u => u.intervalSampleOut), outputBase.getPath + RD_OUTPUT_SUFFIX, "_mean_cvg", xhmmExec, sampleIDsMap, sampleIDsMapFromColumn, sampleIDsMapToColumn, None, false) with WholeMatrixMemoryLimit - add(mergeDepths) - - var excludeTargets : List[File] = List[File]() - if (minTargGC > 0 || maxTargGC < 1) { - val calcGCcontents = new GCContentByInterval with CommandLineGATKArgs - calcGCcontents.out = outputBase.getPath + TARGS_GC_SUFFIX - add(calcGCcontents) - - val excludeTargetsBasedOnGC = new ExcludeTargetsBasedOnValue(calcGCcontents.out, EXTREME_GC_TARGS_SUFFIX, minTargGC, maxTargGC) - add(excludeTargetsBasedOnGC) - excludeTargets ::= excludeTargetsBasedOnGC.out - } - - class CalculateRepeatComplexity(outFile : String) extends CommandLineFunction { - @Input(doc="") - var intervals: File = prepTargets.out - - @Output(doc="") - var out : File = new File(outFile) - - val regFile : String = outputBase.getPath + ".targets.reg" - val locDB : String = outputBase.getPath + ".targets.LOCDB" - - val removeFiles = "rm -f " + regFile + " " + locDB - val createRegFile = "cat " + intervals + " | awk 'BEGIN{OFS=\"\\t\"; print \"#CHR\\tBP1\\tBP2\\tID\"} {split($1,a,\":\"); chr=a[1]; if (match(chr,\"chr\")==0) {chr=\"chr\"chr} split(a[2],b,\"-\"); bp1=b[1]; bp2=bp1; if (length(b) > 1) {bp2=b[2]} print chr,bp1,bp2,NR}' > " + regFile - val createLOCDB = pseqExec + " . loc-load --locdb " + locDB + " --file " + regFile + " --group targets --out " + locDB + ".loc-load" - val calcRepeatMaskedPercent = pseqExec + " . 
loc-stats --locdb " + locDB + " --group targets --seqdb " + pseqSeqDB + " --out " + locDB + ".loc-stats" - val extractRepeatMaskedPercent = "cat " + locDB + ".loc-stats.locstats | awk '{if (NR > 1) print $_}' | sort -k1 -g | awk '{print $10}' | paste " + intervals + " - | awk '{print $1\"\\t\"$2}' > " + out - - var command: String = - removeFiles + - " && " + createRegFile + - " && " + createLOCDB + - " && " + calcRepeatMaskedPercent + - " && " + extractRepeatMaskedPercent - - def commandLine = command - - override def description = "Calculate the percentage of each target that is repeat-masked in the reference sequence: " + command - } - - if (minTargRepeats > 0 || maxTargRepeats < 1) { - val calcRepeatComplexity = new CalculateRepeatComplexity(outputBase.getPath + TARGS_REPEAT_COMPLEXITY_SUFFIX) - add(calcRepeatComplexity) - - val excludeTargetsBasedOnRepeats = new ExcludeTargetsBasedOnValue(calcRepeatComplexity.out, EXTREME_REPEAT_COMPLEXITY_SUFFIX, minTargRepeats, maxTargRepeats) - add(excludeTargetsBasedOnRepeats) - excludeTargets ::= excludeTargetsBasedOnRepeats.out - } - - val filterCenterDepths = new FilterCenterRawMatrix(mergeDepths.mergedDoC, excludeTargets) - add(filterCenterDepths) - - val pca = new PCA(filterCenterDepths.filteredCentered) - add(pca) - - val normalize = new Normalize(pca) - add(normalize) - - val filterZscore = new FilterAndZscoreNormalized(normalize.normalized) - add(filterZscore) - - val filterOriginal = new FilterOriginalData(mergeDepths.mergedDoC, filterCenterDepths, filterZscore) - add(filterOriginal) - - val discover = new DiscoverCNVs(filterZscore.filteredZscored, filterOriginal.sameFiltered) - add(discover) - - val genotype = new GenotypeCNVs(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered) - add(genotype) - - if (genotypeSubsegments) { - val genotypeSegs = new GenotypeCNVandSubsegments(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered) - add(genotypeSegs) - } - } - - class 
ExcludeTargetsBasedOnValue(locus_valueIn : File, outSuffix : String, minVal : Double, maxVal : Double) extends InProcessFunction { - @Input(doc="") - var locus_value : File = locus_valueIn - - @Output(doc="") - var out : File = new File(outputBase.getPath + outSuffix) - - def run = { - var outWriter = new PrintWriter(new PrintStream(out)) - var elems = asScalaIterator(new XReadLines(locus_value)) - - while (elems.hasNext) { - val line = elems.next - val splitLine = line.split("\\s+") - val locus = splitLine(0) - val locValStr = splitLine(1) - try { - val locVal = locValStr.toDouble - if (locVal < minVal || locVal > maxVal) - outWriter.printf("%s%n", locus) - } - catch { - case nfe: NumberFormatException => println("Ignoring non-numeric value " + locValStr + " for locus " + locus) - case e: Exception => throw e - } - } - - outWriter.close - } - } - - class FilterCenterRawMatrix(inputParam: File, excludeTargetsIn : List[File]) extends CommandLineFunction with WholeMatrixMemoryLimit { - @Input(doc = "") - val input = inputParam - - @Input(doc = "") - val excludeTargets = excludeTargetsIn - - @Output - val filteredCentered: File = new File(outputBase.getPath + ".filtered_centered" + RD_OUTPUT_SUFFIX) - @Output - val filteredTargets: File = new File(filteredCentered.getPath + FILTERED_TARGS_SUFFIX) - @Output - val filteredSamples: File = new File(filteredCentered.getPath + FILTERED_SAMPS_SUFFIX) - - var command: String = - xhmmExec + " --matrix" + - " -r " + input + - " --centerData --centerType target" + - " -o " + filteredCentered + - " --outputExcludedTargets " + filteredTargets + - " --outputExcludedSamples " + filteredSamples - command += excludeTargets.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _) - if (targetSampleFiltersString != "") - command += " " + targetSampleFiltersString - - def commandLine = command - - override def description = "Filters samples and targets and then mean-centers the targets: " + command - } - - class PCA(inputParam: File) 
extends CommandLineFunction with WholeMatrixMemoryLimit { - @Input(doc = "") - val input = inputParam - - val PCAbase: String = outputBase.getPath + ".RD_PCA" - - @Output - val outPC: File = new File(PCAbase + ".PC.txt") - @Output - val outPC_SD: File = new File(PCAbase + ".PC_SD.txt") - @Output - val outPC_LOADINGS: File = new File(PCAbase + ".PC_LOADINGS.txt") - - var command: String = - xhmmExec + " --PCA" + - " -r " + input + - " --PCAfiles " + PCAbase - - def commandLine = command - - override def description = "Runs PCA on mean-centered data: " + command - } - - class Normalize(pca: PCA) extends CommandLineFunction { - @Input(doc = "") - val input = pca.input - - @Input(doc = "") - val inPC = pca.outPC - - @Input(doc = "") - val inPC_SD = pca.outPC_SD - - @Input(doc = "") - val inPC_LOADINGS = pca.outPC_LOADINGS - - @Output - val normalized: File = new File(outputBase.getPath + ".PCA_normalized.txt") - - var command: String = - xhmmExec + " --normalize" + - " -r " + input + - " --PCAfiles " + pca.PCAbase + - " --normalizeOutput " + normalized - if (PCAnormalizeMethodString != "") - command += " " + PCAnormalizeMethodString - - def commandLine = command - - override def description = "Normalizes mean-centered data using PCA information: " + command - } - - class FilterAndZscoreNormalized(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit { - @Input(doc = "") - val input = inputParam - - @Output - val filteredZscored: File = new File(outputBase.getPath + ".PCA_normalized.filtered.sample_zscores" + RD_OUTPUT_SUFFIX) - @Output - val filteredTargets: File = new File(filteredZscored.getPath + FILTERED_TARGS_SUFFIX) - @Output - val filteredSamples: File = new File(filteredZscored.getPath + FILTERED_SAMPS_SUFFIX) - - var command: String = - xhmmExec + " --matrix" + - " -r " + input + - " --centerData --centerType sample --zScoreData" + - " -o " + filteredZscored + - " --outputExcludedTargets " + filteredTargets + - " --outputExcludedSamples " + 
filteredSamples - if (targetSampleNormalizedFiltersString != "") - command += " " + targetSampleNormalizedFiltersString - - def commandLine = command - - override def description = "Filters and z-score centers (by sample) the PCA-normalized data: " + command - } - - class FilterOriginalData(inputParam: File, filt1: FilterCenterRawMatrix, filt2: FilterAndZscoreNormalized) extends CommandLineFunction with WholeMatrixMemoryLimit { - @Input(doc = "") - val input = inputParam - - @Input(doc = "") - val targFilters: List[File] = List(filt1.filteredTargets, filt2.filteredTargets).map(u => new File(u)) - - @Input(doc = "") - val sampFilters: List[File] = List(filt1.filteredSamples, filt2.filteredSamples).map(u => new File(u)) - - @Output - val sameFiltered: File = new File(outputBase.getPath + ".same_filtered" + RD_OUTPUT_SUFFIX) - - var command: String = - xhmmExec + " --matrix" + - " -r " + input + - targFilters.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _) + - sampFilters.map(u => " --excludeSamples " + u).reduceLeft(_ + "" + _) + - " -o " + sameFiltered - - def commandLine = command - - override def description = "Filters original read-depth data to be the same as filtered, normalized data: " + command - } - - class DiscoverCNVs(inputParam: File, origRDParam: File) extends CommandLineFunction with LongRunTime { - @Input(doc = "") - val input = inputParam - - @Input(doc = "") - val xhmmParams = xhmmParamsArg - - @Input(doc = "") - val origRD = origRDParam - - @Output - val xcnv: File = new File(outputBase.getPath + ".xcnv") - - @Output - val aux_xcnv: File = new File(outputBase.getPath + ".aux_xcnv") - - val posteriorsBase = outputBase.getPath - - @Output - val dipPosteriors: File = new File(posteriorsBase + ".posteriors.DIP.txt") - - @Output - val delPosteriors: File = new File(posteriorsBase + ".posteriors.DEL.txt") - - @Output - val dupPosteriors: File = new File(posteriorsBase + ".posteriors.DUP.txt") - - var command: String = - xhmmExec + " --discover" 
+ - " -p " + xhmmParams + - " -r " + input + - " -R " + origRD + - " -c " + xcnv + - " -a " + aux_xcnv + - " -s " + posteriorsBase + - " " + discoverCommandLineParams - - def commandLine = command - - override def description = "Discovers CNVs in normalized data: " + command - } - - abstract class BaseGenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends CommandLineFunction with LongRunTime { - @Input(doc = "") - val input = inputParam - - @Input(doc = "") - val xhmmParams = xhmmParamsArg - - @Input(doc = "") - val origRD = origRDParam - - @Input(doc = "") - val inXcnv = xcnv - - var command: String = - xhmmExec + " --genotype" + - " -p " + xhmmParams + - " -r " + input + - " -g " + inXcnv + - " -F " + referenceFile + - " -R " + origRD + - " " + genotypeCommandLineParams - } - - class GenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) { - @Output - val vcf: File = new File(outputBase.getPath + ".vcf") - - command += - " -v " + vcf - - def commandLine = command - - override def description = "Genotypes discovered CNVs in all samples: " + command - } - - class GenotypeCNVandSubsegments(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) { - @Output - val vcf: File = new File(outputBase.getPath + ".subsegments.vcf") - - command += - " -v " + vcf + - " --subsegments" + - " --maxTargetsInSubsegment " + maxTargetsInSubsegment + - " --genotypeQualThresholdWhenNoExact " + subsegmentGenotypeThreshold - - def commandLine = command - - override def description = "Genotypes discovered CNVs (and their sub-segments, of up to " + maxTargetsInSubsegment + " targets) in all samples: " + command - } -} diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala deleted file mode 100644 index 307ce171f..000000000 --- 
a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ /dev/null @@ -1,420 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.queue.qscripts - -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException -import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction -import org.broadinstitute.sting.queue.function.JavaCommandLineFunction - -class GATKResourcesBundle extends QScript { - // todo -- update to released version when things stabilize - @Argument(doc="gatkJarFile", required=false) - var gatkJarFile: File = new File("dist/GenomeAnalysisTK.jar") - - @Argument(doc="liftOverPerl", required=false) - var liftOverPerl: File = new File("./public/perl/liftOverVCF.pl") - - @Argument(shortName = "ver", doc="The GIT version of this release", required=true) - var BUNDLE_VERSION: String = _ - - @Argument(shortName = "bundleDir", doc="Path to root where resource files will be placed", required=false) - val BUNDLE_ROOT = new File("/humgen/gsa-hpprojects/GATK/bundle") - - @Argument(shortName = "downloadDir", doc="Path to root where resource files will be placed for users to download", required=false) - val DOWNLOAD_ROOT = new File("/humgen/gsa-scr1/pub/bundle") - - @Argument(shortName = "test", doc="", required=false) - val TEST = false - - @Argument(shortName = "phase2", doc="", required=false) - val DO_DOWNLOAD = false - - val SITES_EXT: String = "sites" - - def BUNDLE_DIR: File = BUNDLE_ROOT + "/" + BUNDLE_VERSION - def DOWNLOAD_DIR: File = DOWNLOAD_ROOT + "/" + BUNDLE_VERSION - - // REFERENCES - class Reference( val name: String, val file: File ) { } - var hg19: Reference = _ - var b37: Reference = _ - var hg18: Reference = _ - var b36: Reference = _ - var exampleFASTA: Reference = _ - var refs: List[Reference] = _ - - class Resource(val file: File, val name: String, val ref: Reference, val useName: Boolean = true, val makeSites: Boolean = true, 
val makeCallsIfBam: Boolean = true ) { - def destname(target: Reference): String = { - if ( useName ) - return name + "." + target.name + "." + getExtension(file) - else - return file.getName - } - } - - def liftover(in: File, inRef: Reference, out: File, outRef: Reference): CommandLineFunction = { - //Console.printf("liftover(%s => %s)%n", inRef.name, outRef.name) - (inRef.name, outRef.name) match { - case ("b37", "hg19") => - return new LiftOverPerl(in, out, new File("public/chainFiles/b37tohg19.chain"), inRef, outRef) - case ("b37", "hg18") => - return new LiftOverPerl(in, out, new File("public/chainFiles/b37tohg18.chain"), inRef, outRef) - case ("b37", "b36") => - return new LiftOverPerl(in, out, new File("public/chainFiles/b37tob36.chain"), inRef, outRef) - case _ => return null - } - } - - def isVCF(file: File) = file.getName.endsWith(".vcf") - def isBAM(file: File) = file.getName.endsWith(".bam") - def isOUT(file: File) = file.getName.endsWith(".out") - def isFASTA(file: File) = file.getName.endsWith(".fasta") - def isIntervalList(file: File) = file.getName.endsWith(".interval_list") - - var RESOURCES: List[Resource] = Nil - def addResource(comp: Resource) { RESOURCES = comp :: RESOURCES } - - trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { - this.logging_level = "INFO"; - this.jarFile = gatkJarFile; - this.memoryLimit = 2 - } - - def initializeTestDataFiles() = { - // - // Standard evaluation files for indel - // - b37 = new Reference("b37", new File("/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta")) - hg18 = new Reference("hg18", new File("/Users/depristo/Desktop/broadLocal/localData/Homo_sapiens_assembly18.fasta")) - exampleFASTA = new Reference("exampleFASTA", new File("public/testdata/exampleFASTA.fasta")) - refs = List(b37, hg18, exampleFASTA) - - val DATAROOT = "/Users/depristo/Desktop/broadLocal/localData/" - //addResource(new Resource(DATAROOT + "human_g1k_v37.fasta", "human_g1k_v37.fasta", b37, false)) - addResource(new 
Resource(DATAROOT + "1000G.snp.validation.b37.vcf", "1000G.snp.validation", b37)) - addResource(new Resource(DATAROOT + "dbsnp_132_b37.vcf", "dbsnp_132", b37, true, false)) - - addResource(new Resource(exampleFASTA.file, "exampleFASTA", exampleFASTA, false)) - addResource(new Resource("public/testdata/exampleBAM.bam", "exampleBAM", exampleFASTA, false, false, false)) - } - - def initializeStandardDataFiles() = { - // - // references - // - hg19 = new Reference("hg19", new File("/humgen/gsa-hpprojects/GATK/data/ucsc.hg19/ucsc.hg19.fasta")) - b37 = new Reference("b37", new File("/humgen/1kg/reference/human_g1k_v37.fasta")) - hg18 = new Reference("hg18", new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")) - b36 = new Reference("b36", new File("/humgen/1kg/reference/human_b36_both.fasta")) - exampleFASTA = new Reference("exampleFASTA", new File("public/testdata/exampleFASTA.fasta")) - refs = List(hg19, b37, hg18, b36, exampleFASTA) - - addResource(new Resource(b37.file, "", b37, false)) - addResource(new Resource(b36.file, "", b36, false)) - addResource(new Resource(hg19.file, "", hg19, false)) - addResource(new Resource(hg18.file, "", hg18, false)) - - // - // The b37_decoy reference - // - addResource(new Resource("/humgen/1kg/reference/human_g1k_v37_decoy.fasta", - "IGNORE", b37, false, false)) - - // - // standard VCF files. 
Will be lifted to each reference - // - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_138_b37.leftAligned.vcf", - "dbsnp_138", b37, true, false)) - - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_2141_samples.b37.vcf", - "1000G_omni2.5", b37, true, false)) - - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf", - "hapmap_3.3", b37, true, false)) - - addResource(new Resource("/humgen/1kg/DCC/ftp/technical/working/20120312_phase1_v2_indel_cleaned_sites_list/ALL.wgs.phase1_release_v2.20101123.official_indel_calls.20120312.sites.vcf", - "1000G_phase1.indels", b37, true, false)) - - addResource(new Resource("/humgen/1kg/processing/official_release/phase1/projectConsensus/phase1.wgs.projectConsensus.v2b.recal.highQuality.vcf", - "1000G_phase1.snps.high_confidence", b37, true, false)) - - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/GoldStandardIndel/gold.standard.indel.MillsAnd1000G.b37.vcf", - "Mills_and_1000G_gold_standard.indels", b37, true, false)) - - // - // CEU trio (NA12878,NA12891,NA12892) best practices results (including PBT) - // - - addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/callsets/CEUtrio_BestPractices/CEUTrio.HiSeq.WGS.b37.snps_and_indels.recalibrated.filtered.phased.CURRENT.vcf", - "CEUTrio.HiSeq.WGS.b37.bestPractices.phased",b37,true,false)) - - // - // example call set for documentation guide tutorial - // - addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.b37.subset.vcf", - "NA12878.HiSeq.WGS.bwa.cleaned.raw.subset", b37, true, true)) - - // - // Test BAM file, only for the b37 reference - // - addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37.NA12878.bam", - "IGNORE", b37, false, false)) - - // - // Exome 
targets file, only for the b37 reference - // - addResource(new Resource("/seq/references/HybSelOligos/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", - "Broad.human.exome", b37, true, false, false)) - - // - // refGene files specific to each reference - // - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/refGene_b37.sorted.txt", - "refGene", b37, true, false)) - - addResource(new Resource("public/chainFiles/hg18tob37.chain", "", hg18, false, false)) - addResource(new Resource("public/chainFiles/b36tob37.chain", "", b36, false, false)) - - // todo -- chain files? - // todo 1000G SNP and indel call sets? - - // - // exampleFASTA file - // - addResource(new Resource(exampleFASTA.file, "exampleFASTA", exampleFASTA, false)) - addResource(new Resource("public/testdata/exampleBAM.bam", "exampleBAM", exampleFASTA, false, false, false)) - } - - def createBundleDirectories(dir: File) = { - if ( ! dir.exists ) dir.mkdirs() - - for ( ref <- refs ) { - val refDir = new File(dir + "/" + ref.name) - if ( ! refDir.exists ) refDir.mkdirs() - } - } - - def createCurrentLink(bundleDir: File) = { - - val currentLink = new File(BUNDLE_ROOT + "/current") - - if ( currentLink.exists ) add(new deleteLink(currentLink)) - - add(new linkFile(bundleDir, currentLink)) - } - - def script = { - if ( TEST ) - initializeTestDataFiles(); - else - initializeStandardDataFiles(); - - if ( ! 
DO_DOWNLOAD ) { - // create destination directory structure - createBundleDirectories(BUNDLE_DIR) - - for ( resource: Resource <- RESOURCES ) { - if ( isFASTA(resource.file) ) { - copyBundleFasta(resource, resource.ref) - } else if ( isBAM(resource.file) ) { - val f = copyBundleFile(resource, resource.ref) - add(new IndexBAM(f)) - if ( resource.makeCallsIfBam ) { - @Output val outvcf: File = swapExt(f.getParent, f, ".bam", ".vcf") - add(new UG(resource.file, resource.ref.file, outvcf)) - } - } else if ( isVCF(resource.file) ) { - for ( destRef <- refs ) { - val out = destFile(BUNDLE_DIR, destRef, resource.destname(destRef)) - var continue = true - - // copy or lift over the original vcf - if ( resource.ref == destRef ) { - add(new cpFile(resource.file, out)) - } else { - val clf = liftover(resource.file, resource.ref, out, destRef) - if ( clf != null ) { - add(clf) - } else { - continue = false - } - } - - if ( continue ) { - add(new IndexVCF(out, destRef.file)) - - if ( resource.makeSites ) { - val sites: Resource = new Resource(swapExt(out.getParent, out, ".vcf", "." 
+ SITES_EXT + ".vcf"), "", destRef, false) - add(new JustSites(out, sites.file)) - add(new IndexVCF(sites.file, destRef.file)) - } - - if ( resource.name.contains("dbsnp") ) { - val dbsnp129: Resource = new Resource(swapExt(out.getParent, out, ".vcf", ".excluding_sites_after_129.vcf"), "", destRef, false) - add(new MakeDBSNP129(out, destRef.file, dbsnp129.file)) - add(new IndexVCF(dbsnp129.file, destRef.file)) - } - } - } - } else if ( isIntervalList(resource.file) ) { - val out = destFile(BUNDLE_DIR, resource.ref, resource.destname(resource.ref)) - add(new cpFile(resource.file, out)) - } else { - //throw new ReviewedStingException("Unknown file type: " + resource) - } - } - - createCurrentLink(BUNDLE_DIR) - - } else { - createBundleDirectories(DOWNLOAD_DIR) - createDownloadsFromBundle(BUNDLE_DIR, DOWNLOAD_DIR) - } - } - - - def createDownloadsFromBundle(in: File, out: File) { - Console.printf("Visiting %s%n", in) - if (! in.getName.startsWith(".")) { - if ( in.isDirectory ) { - out.mkdirs - - for ( child: File <- in.listFiles ) { - createDownloadsFromBundle(child, out + "/" + child.getName) - } - } else { - if ( isBAM(in) ) { - add(new cpFile(in, out)) - add(new md5sum(out)) - } else if ( !isOUT(in) ) { - add(new GzipFile(in, out + ".gz")) - add(new md5sum(out + ".gz")) - } - - } - } - } - - def copyBundleFasta(res: Resource, ref: Reference) { - val out = destFile(BUNDLE_DIR, ref, res.destname(ref)) - add(new cpFile(res.file, out)) - - val oldRefDict = swapExt(res.file.getParent, res.file, ".fasta", ".dict") - val newRefDict = swapExt(out.getParent, out, ".fasta", ".dict") - - val oldRefFai = swapExt(res.file.getParent, res.file, ".fasta", ".fasta.fai") - val newRefFai = swapExt(out.getParent, out, ".fasta", ".fasta.fai") - - add(new cpFile(oldRefDict, newRefDict)) - add(new cpFile(oldRefFai, newRefFai)) - } - - def copyBundleFile(res: Resource, ref: Reference): File = { - val out = destFile(BUNDLE_DIR, ref, res.destname(ref)) - add(new cpFile(res.file, out)) - 
return out - } - - def destFile(dir: File, ref: Reference, f: File): File = { - return destFile(dir, ref, f.getName) - } - - def destFile(dir: File, ref: Reference, name: String): File = { - return new File(dir + "/" + ref.name + "/" + name) - } - - /** - * A command line (cut) that removes all genotyping information from a file - */ - class JustSites(@Input(doc="foo") in: File, @Output(doc="foo") out: File) extends CommandLineFunction { - def commandLine = "cut -f 1-8 %s > %s".format(in, out) - } - - class GzipFile(@Input val in: File, @Output val out: File) extends CommandLineFunction { - def commandLine = "gzip -c %s > %s".format(in.getAbsolutePath, out.getAbsolutePath) - } - - class cpFile(@Input val in: File, @Output val out: File) extends CommandLineFunction { - def commandLine = "cp %s %s".format(in.getAbsolutePath, out.getAbsolutePath) - } - - class deleteLink(@Input val in: File) extends CommandLineFunction { - def commandLine = "rm %s".format(in.getAbsolutePath) - } - - class linkFile(@Input val in: File, @Output val out: File) extends CommandLineFunction { - def commandLine = "ln -s %s %s".format(in.getAbsolutePath, out.getAbsolutePath) - } - - class md5sum(@Input val in: File) extends CommandLineFunction { - @Output val o: File = new File(in.getAbsolutePath + ".md5") - def commandLine = "md5sum %s > %s".format(in.getAbsolutePath, o) - } - - class IndexBAM(bamIn: File) extends SamtoolsIndexFunction { - bamFile = bamIn - } - - class IndexVCF(@Input vcf: File, @Input ref: File) extends CountRODs with UNIVERSAL_GATK_ARGS { - //@Output val vcfIndex: File = swapExt(vcf.getParent, vcf, ".vcf", ".vcf.idx") - this.rod :+= vcf - this.reference_sequence = ref - } - - class UG(@Input bam: File, @Input ref: File, @Input outVCF: File) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS { - this.input_file = List(bam) - this.reference_sequence = ref - this.intervalsString ++= List("20"); - this.out = outVCF - } - - class MakeDBSNP129(@Input dbsnp: File, @Input ref: 
File, @Output dbsnp129: File) extends SelectVariants with UNIVERSAL_GATK_ARGS { - this.variant = dbsnp - this.select ++= List("dbSNPBuildID <= 129") - this.reference_sequence = ref - this.out = dbsnp129 - } - - class LiftOverPerl(@Input val in: File, @Output val out: File, @Input val chain: File, oldRef: Reference, newRef: Reference) extends CommandLineFunction { - this.memoryLimit = 12 - def commandLine = ("%s -vcf %s -chain %s -out %s " + - "-gatk ./ -newRef %s -oldRef %s -tmp %s").format(liftOverPerl, in.getAbsolutePath, chain, - out.getAbsolutePath, newRef.file.replace(".fasta", ""), - oldRef.file.replace(".fasta", ""), jobTempDir) - } - - def getExtension(f: File): String = { - val i = f.getName.lastIndexOf('.'); - if (i > 0 && i < f.getName.length() - 1) - return f.getName.substring(i+1).toLowerCase(); - else - return ""; - } -} - diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala deleted file mode 100644 index 0021f5ae5..000000000 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala +++ /dev/null @@ -1,50 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.queue.qscripts.examples - -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.gatk._ - -/** - * Script used for testing output to /dev/null - */ -class DevNullOutput extends QScript { - @Input(doc="The reference file for the bam files.", shortName="R") - var referenceFile: File = _ - - @Input(doc="Bam file to genotype.", shortName="I") - var bamFile: File = _ - - def script() { - val printReads = new PrintReads - printReads.reference_sequence = referenceFile - printReads.memoryLimit = 2 - printReads.scatterCount = 3 - printReads.input_file :+= bamFile - printReads.out = "/dev/null" - add(printReads) - } -} diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala deleted file mode 100644 index fc1d4599e..000000000 --- a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala +++ /dev/null @@ -1,177 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above 
copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.queue - -import engine.JobRunInfo -import org.broadinstitute.sting.queue.function.QFunction -import annotation.target.field -import util._ -import org.broadinstitute.sting.commandline.ArgumentSource - -/** - * Defines a Queue pipeline as a collection of CommandLineFunctions. - */ -trait QScript extends Logging with PrimitiveOptionConversions with StringFileConversions { - - // Type aliases so users don't have to import - type File = java.io.File - type CommandLineFunction = org.broadinstitute.sting.queue.function.CommandLineFunction - type InProcessFunction = org.broadinstitute.sting.queue.function.InProcessFunction - type ScatterGatherableFunction = org.broadinstitute.sting.queue.function.scattergather.ScatterGatherableFunction - type SimpleTextGatherFunction = org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction - - // Make sure annotations can be used in class constructors but target the fields - // ex: class MyClass(@Input var myVar: File) {} - // This was implicitly enabled in 2.8.0-RC2 and then updated to this new syntax: - // http://lampsvn.epfl.ch/trac/scala/ticket/3596 - // http://lampsvn.epfl.ch/trac/scala/ticket/3421 - type Input = org.broadinstitute.sting.commandline.Input @field - type Output = org.broadinstitute.sting.commandline.Output @field - type Argument = 
org.broadinstitute.sting.commandline.Argument @field - type ArgumentCollection = org.broadinstitute.sting.commandline.ArgumentCollection @field - type Gather = org.broadinstitute.sting.commandline.Gather @field - - /** - * Default settings for QFunctions - */ - var qSettings: QSettings = _ - - /** - * Builds the CommandLineFunctions that will be used to run this script and adds them to this.functions directly or using the add() utility method. - */ - def script() - - /** - * A default handler for the onExecutionDone() function. By default this doesn't do anything - */ - def onExecutionDone(jobs: Map[QFunction, JobRunInfo], success: Boolean) { - } - - /** - * The command line functions that will be executed for this QScript. - */ - var functions = Seq.empty[QFunction] - - /** - * Exchanges the extension on a file. - * @param file File to look for the extension. - * @param oldExtension Old extension to strip off, if present. - * @param newExtension New extension to append. - * @return new File with the new extension in the current directory. - */ - protected def swapExt(file: File, oldExtension: String, newExtension: String) = - new File(file.getName.stripSuffix(oldExtension) + newExtension) - - /** - * Exchanges the extension on a file. - * @param dir New directory for the file. - * @param file File to look for the extension. - * @param oldExtension Old extension to strip off, if present. - * @param newExtension New extension to append. - * @return new File with the new extension in dir. - */ - protected def swapExt(dir: File, file: File, oldExtension: String, newExtension: String) = - new File(dir, file.getName.stripSuffix(oldExtension) + newExtension) - - /** - * Adds one or more command line functions to be run. - * @param functions Functions to add. 
- */ - def add(functions: QFunction*) { - functions.foreach(function => function.addOrder = QScript.nextAddOrder) - this.functions ++= functions - } - - def addAll(functions: Traversable[QFunction]) { - functions.foreach( f => add(f) ) - } - - /** - * Convert all @Output files to remote output files. - * @param remoteFileConverter Converter for files to remote files. - */ - def mkRemoteOutputs(remoteFileConverter: RemoteFileConverter) { - for (field <- outputFields) { - val fieldFile = ClassFieldCache.getFieldFile(this, field) - if (fieldFile != null && !fieldFile.isInstanceOf[RemoteFile]) { - val fieldName = ClassFieldCache.fullName(field) - val remoteFile = remoteFileConverter.convertToRemote(fieldFile, fieldName) - ClassFieldCache.setFieldValue(this, field, remoteFile) - } - } - } - - /** - * Pull all remote files to the local disk - */ - def pullInputs() { - val inputs = ClassFieldCache.getFieldFiles(this, inputFields) - for (remoteFile <- filterRemoteFiles(inputs)) { - logger.info("Pulling %s from %s".format(remoteFile.getAbsolutePath, remoteFile.remoteDescription)) - remoteFile.pullToLocal() - } - } - - /** - * Push all remote files from the local disk - */ - def pushOutputs() { - val outputs = ClassFieldCache.getFieldFiles(this, outputFields) - for (remoteFile <- filterRemoteFiles(outputs)) { - logger.info("Pushing %s to %s".format(remoteFile.getAbsolutePath, remoteFile.remoteDescription)) - remoteFile.pushToRemote() - } - } - - private def filterRemoteFiles(fields: Seq[File]): Seq[RemoteFile] = - fields.filter(field => field != null && field.isInstanceOf[RemoteFile]).map(_.asInstanceOf[RemoteFile]) - /** - * @return the inputs or null if there are no inputs - */ - def remoteInputs: AnyRef = null - - /** - * @return the outputs or null if there are no outputs - */ - def remoteOutputs: AnyRef = null - - /** The complete list of fields. */ - def functionFields: Seq[ArgumentSource] = ClassFieldCache.classFunctionFields(this.getClass) - /** The @Input fields. 
*/ - def inputFields: Seq[ArgumentSource] = ClassFieldCache.classInputFields(this.getClass) - /** The @Output fields. */ - def outputFields: Seq[ArgumentSource] = ClassFieldCache.classOutputFields(this.getClass) - /** The @Argument fields. */ - def argumentFields: Seq[ArgumentSource] = ClassFieldCache.classArgumentFields(this.getClass) -} - -object QScript { - private var addOrder = 0 - private def nextAddOrder = { - addOrder += 1 - Seq(addOrder) - } -} diff --git a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala deleted file mode 100644 index e74e235d4..000000000 --- a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala +++ /dev/null @@ -1,98 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.queue - -import java.io.File -import org.broadinstitute.sting.commandline.{ClassType, Argument} - -/** - * Default settings settable on the command line and passed to CommandLineFunctions. - */ -class QSettings { - @Argument(fullName="run_name", shortName="runName", doc="A name for this run used for various status messages.", required=false) - var runName: String = _ - - @Argument(fullName="job_project", shortName="jobProject", doc="Default project for compute farm jobs.", required=false) - var jobProject: String = _ - - @Argument(fullName="job_queue", shortName="jobQueue", doc="Default queue for compute farm jobs.", required=false) - var jobQueue: String = _ - - @Argument(fullName="job_priority", shortName="jobPriority", doc="Default priority for jobs. Min = 0, Max = 100", required=false) - @ClassType(classOf[Int]) - var jobPriority: Option[Int] = None - - @Argument(fullName="job_native_arg", shortName="jobNative", doc="Native arguments to pass to the job runner.", required=false) - var jobNativeArgs: Seq[String] = Nil - - @Argument(fullName="job_resource_request", shortName="jobResReq", doc="Resource requests to pass to the job runner.", required=false) - var jobResourceRequests: Seq[String] = Nil - - @Argument(fullName="job_environment_name", shortName="jobEnv", doc="Environment names for the job runner.", required=false) - var jobEnvironmentNames: Seq[String] = Nil - - @Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes. 
If not set defaults to 2GB.", required=false) - @ClassType(classOf[Double]) - var memoryLimit: Option[Double] = Some(2) - - @Argument(fullName="memory_limit_threshold", shortName="memLimitThresh", doc="After passing this threshold stop increasing memory limit for jobs, in gigabytes.", required=false) - @ClassType(classOf[Double]) - var memoryLimitThreshold: Option[Double] = None - - @Argument(fullName="resident_memory_limit", shortName="resMemLimit", doc="Default resident memory limit for jobs, in gigabytes.", required=false) - @ClassType(classOf[Double]) - var residentLimit: Option[Double] = None - - @Argument(fullName="resident_memory_request", shortName="resMemReq", doc="Default resident memory request for jobs, in gigabytes.", required=false) - @ClassType(classOf[Double]) - var residentRequest: Option[Double] = None - - @Argument(fullName="resident_memory_request_parameter", shortName="resMemReqParam", doc="Parameter for resident memory requests. By default not requested.", required=false) - var residentRequestParameter: String = _ - - @Argument(fullName="job_walltime", shortName="wallTime", doc="Setting the required DRMAA walltime or LSF run limit.", required=false) - @ClassType(classOf[Long]) - var jobWalltime: Option[Long] = None - - /** The name of the parallel environment (required for SGE, for example) */ - @Argument(fullName="job_parallel_env", shortName="jobParaEnv", doc="An SGE style parallel environment to use for jobs requesting more than 1 core. Equivalent to submitting jobs with -pe ARG nt for jobs with nt > 1", required=false) - var parallelEnvironmentName: String = "smp_pe" // Broad default - - @Argument(fullName="dontRequestMultipleCores", shortName="multiCoreJerk", doc="If provided, Queue will not request multiple processors for jobs using multiple processors. 
Sometimes you eat the bear, sometimes the bear eats you.", required=false) - var dontRequestMultipleCores: Boolean = false - - @Argument(fullName="disableDefaultJavaGCOptimizations", shortName="noGCOpt", doc="If provided, Queue will not ensure that java GC threads are limited and that the a minimum amount of time is spent in GC.") - var disableDefaultJavaGCOptimizations = false - - @Argument(fullName="run_directory", shortName="runDir", doc="Root directory to run functions from.", required=false) - var runDirectory = new File(".") - - @Argument(fullName="temp_directory", shortName="tempDir", doc="Temp directory to pass to functions.", required=false) - var tempDirectory = new File(System.getProperty("java.io.tmpdir")) - - @Argument(fullName="job_scatter_gather_directory", shortName="jobSGDir", doc="Default directory to place scatter gather output for compute farm jobs.", required=false) - var jobScatterGatherDirectory: File = _ -} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala deleted file mode 100644 index aaddeb28b..000000000 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala +++ /dev/null @@ -1,63 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.queue.extensions.gatk - -import org.broadinstitute.sting.queue.function.scattergather.GatherFunction -import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction -import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction} -import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor -import org.broadinstitute.sting.queue.util.ClassFieldCache - -/** - * Merges BAM files using net.sf.picard.sam.MergeSamFiles. - */ -class BamGatherFunction extends GatherFunction with PicardBamFunction with RetryMemoryLimit { - this.javaMainClass = "net.sf.picard.sam.MergeSamFiles" - this.assumeSorted = Some(true) - protected def inputBams = gatherParts - protected def outputBam = originalOutput - - override def freezeFieldValues() { - val originalGATK = originalFunction.asInstanceOf[CommandLineGATK] - - // Whatever the original function can handle, merging *should* do less. 
- this.memoryLimit = originalFunction.memoryLimit - - // bam_compression and index_output_bam_on_the_fly from SAMFileWriterArgumentTypeDescriptor - // are added by the GATKExtensionsGenerator to the subclass of CommandLineGATK - - val compression = ClassFieldCache.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.COMPRESSION_FULLNAME) - this.compressionLevel = originalGATK.getFieldValue(compression).asInstanceOf[Option[Int]] - - val disableIndex = ClassFieldCache.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.DISABLE_INDEXING_FULLNAME) - this.createIndex = Some(!originalGATK.getFieldValue(disableIndex).asInstanceOf[Boolean]) - - val enableMD5 = ClassFieldCache.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.ENABLE_MD5_FULLNAME) - this.createMD5 = Some(originalGATK.getFieldValue(enableMD5).asInstanceOf[Boolean]) - - super.freezeFieldValues() - } -} diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala deleted file mode 100644 index 3afd289af..000000000 --- a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala +++ /dev/null @@ -1,506 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.queue.function - -import java.io.File -import java.lang.annotation.Annotation -import org.broadinstitute.sting.commandline._ -import org.broadinstitute.sting.queue.{QException, QSettings} -import java.lang.IllegalStateException -import org.broadinstitute.sting.queue.util._ -import org.broadinstitute.sting.utils.io.IOUtils -import scala.language.reflectiveCalls - -/** - * The base interface for all functions in Queue. - * Inputs and outputs are specified as Sets of values. - * Inputs are matched to other outputs by using .equals() - */ -trait QFunction extends Logging with QJobReport { - /** - * A short description of what this class of function does. - * By default does not include the output specific to this function. - * See shortDescription for a description of what this instance of the function outputs. - */ - var analysisName: String = "" - - /** - * The name name of the job, must be file system safe and unique to the graph. - * Defaults to "runName-". - * Use shortDescription for an alternative that is display friendly. - */ - var jobName: String = _ - - /** Default settings */ - var qSettings: QSettings = _ - - /** Directory to run the command in. */ - var commandDirectory: File = new File(".") - - /** Temporary directory to write any files. Must be network accessible. */ - var jobTempDir: File = null - - /** - * Local path available on all machines to store LOCAL temporary files. Not an @Input, - * nor an @Output. 
Currently only used for local intermediate files for composite jobs. - * Needs to be an annotated field so that it's mutated during cloning. - */ - @Argument(doc="Local path available on all machines to store LOCAL temporary files.") - var jobLocalDir: File = _ - - /** Order the function was added to the graph. */ - var addOrder: Seq[Int] = Nil - - /** Job priority */ - var jobPriority: Option[Int] = None - - /** Whether a job is restartable */ - var jobRestartable = true - - /** - * A callback for modifying the run. - * NOTE: This function is for ADVANCED use only and is unsupported. - */ - var updateJobRun: PartialFunction[Any,Unit] = null - - /** - * If true, unless another unfinished function is dependent on this function, - * this function will NOT be run even if the outputs have not been created. - */ - var isIntermediate = false - - // ------------------------------------------------------- - // - // job run information - // - // ------------------------------------------------------- - - /** - * Copies settings from this function to another function. - * @param function QFunction to copy values to. - */ - override def copySettingsTo(function: QFunction) { - function.qSettings = this.qSettings - function.commandDirectory = this.commandDirectory - function.jobTempDir = this.jobTempDir - function.jobLocalDir = this.jobLocalDir - function.addOrder = this.addOrder - function.jobPriority = this.jobPriority - function.jobRestartable = this.jobRestartable - function.updateJobRun = this.updateJobRun - function.isIntermediate = this.isIntermediate - function.reportGroup = this.reportGroup - function.reportFeatures = this.reportFeatures - } - - /** File to redirect any output. Defaults to .out */ - var jobOutputFile: File = _ - - /** File to redirect any errors. Defaults to .out */ - var jobErrorFile: File = _ - - /** Errors (if any) from the last failed run of jobErrorFiles. 
*/ - @Argument(doc="Job error lines", required=false) - var jobErrorLines: Seq[String] = Nil - - /** - * The number of times this function has previously been run. - */ - @Argument(doc="Job retries", required=false) - var retries = 0 - - /** Change settings for the next run. Retries will be set to the number of times the function was run and jobErrorLines may contain the error text. */ - def setupRetry() { - } - - /** - * Description of this command line function. - */ - def description: String = "%s: %s > %s".format(analysisName, inputs, outputs) - - /** - * A short description of the function. - */ - def shortDescription = { - firstOutput match { - case file: File => analysisName + ": " + file.getName - case _ => analysisName - } - } - - /** - * The name of the job as submitted to the job runner - */ - def jobRunnerJobName = shortDescription - - /** - * Returns true if the function is done. - */ - def isDone: Boolean = { - val files = doneOutputs - if (files.size == 0) - throw new IllegalStateException("Function should have at least one output: " + analysisName) - files.forall(_.exists) - } - - /** - * Returns true if the function has failed. - */ - def isFail: Boolean = { - val files = failOutputs - if (files.size == 0) - throw new IllegalStateException("Function should have at least one output: " + analysisName) - files.exists(_.exists) - } - - /** - * Returns files to track for hidden done/fail files. - * @return Seq[String] files. - */ - protected def statusPaths = { - var paths = outputs - paths :+= jobOutputFile - if (jobErrorFile != null) - paths :+= jobErrorFile - paths - } - - /** - * Returns prefixes for hidden done/fail files. - * @return prefixes. - */ - private def statusPrefixes = statusPaths. - filter(file => !IOUtils.isSpecialFile(file)). - map(file => file.getParentFile + "/." + file.getName) - - /** - * Returns the output files for this function. - * @return outputs for this function. 
- */ - def doneOutputs: Seq[File] = statusPrefixes.map(path => new File(path + ".done")) - - /** - * Returns the output files for this function. - * @return outputs for this function. - */ - def failOutputs: Seq[File] = statusPrefixes.map(path => new File(path + ".fail")) - - /** The complete list of fields on this CommandLineFunction. */ - def functionFields: Seq[ArgumentSource] = ClassFieldCache.classFunctionFields(this.functionFieldClass) - /** The @Input fields on this CommandLineFunction. */ - def inputFields: Seq[ArgumentSource] = ClassFieldCache.classInputFields(this.functionFieldClass) - /** The @Output fields on this CommandLineFunction. */ - def outputFields: Seq[ArgumentSource] = ClassFieldCache.classOutputFields(this.functionFieldClass) - /** The @Argument fields on this CommandLineFunction. */ - def argumentFields: Seq[ArgumentSource] = ClassFieldCache.classArgumentFields(this.functionFieldClass) - - /** - * Returns the class that should be used for looking up fields. - */ - protected def functionFieldClass = this.getClass - - /** - * Returns the input files for this function. - * @return inputs for this function. - */ - def inputs: Seq[File] = getFieldFiles(inputFields) - - /** - * Returns the output files for this function. - * @return outputs for this function. - */ - def outputs: Seq[File] = getFieldFiles(outputFields) - - /** - * Returns the first output file. - * @return first output for this function. - */ - def firstOutput: File = outputs.headOption.getOrElse(null) - - /** - * Returns the set of directories where files may be written. - */ - def outputDirectories = { - var dirs = Set.empty[File] - dirs += commandDirectory - dirs += jobTempDir - dirs += jobLocalDir - dirs += jobOutputFile.getParentFile - if (jobErrorFile != null) - dirs += jobErrorFile.getParentFile - dirs ++= outputs.map(_.getParentFile) - dirs - } - - /** - * Deletes the log files for this function. 
- */ - def deleteLogs() = { - IOUtils.tryDelete(jobOutputFile) - if (jobErrorFile != null) - IOUtils.tryDelete(jobErrorFile) - } - - /** - * Deletes the output files and all the status files for this function. - */ - def deleteOutputs() { - outputs.filter(file => !IOUtils.isSpecialFile(file)).foreach(file => IOUtils.tryDelete(file)) - doneOutputs.foreach(file => IOUtils.tryDelete(file)) - failOutputs.foreach(file => IOUtils.tryDelete(file)) - } - - /** - * Creates the output directories for this function if it doesn't exist. - */ - def mkOutputDirectories() { - outputDirectories.foreach(dir => { - if (!dir.exists && !dir.mkdirs) - throw new QException("Unable to create directory: " + dir) - }) - } - - /** - * Returns fields that do not have values which are required. - * @return Seq[String] names of fields missing values. - */ - def missingFields: Seq[String] = { - val missingInputs = missingFields(inputFields, classOf[Input]) - val missingOutputs = missingFields(outputFields, classOf[Output]) - val missingArguments = missingFields(argumentFields, classOf[Argument]) - (missingInputs ++ missingOutputs ++ missingArguments).distinct.sorted - } - - /** - * Returns fields that do not have values which are required. - * @param sources Fields to check. - * @param annotation Annotation. - * @return names of fields missing values. - */ - private def missingFields(sources: Seq[ArgumentSource], annotation: Class[_ <: Annotation]): Seq[String] = { - var missing: Seq[String] = Nil - for (source <- sources) { - if (isRequired(source, annotation)) - if (!hasFieldValue(source)) - if (!exclusiveOf(source, annotation).exists(otherSource => hasFieldValue(otherSource))) - missing :+= "@%s: %s - %s".format(annotation.getSimpleName, source.field.getName, doc(source, annotation)) - } - missing - } - - /** - * Gets the files from the fields. The fields must be a File, a FileExtension, or a Seq or Set of either. - * @param fields Fields to get files. - * @return for the fields. 
- */ - private def getFieldFiles(fields: Seq[ArgumentSource]): Seq[File] = { - var files: Seq[File] = Nil - for (field <- fields) - files ++= getFieldFiles(field) - files.distinct - } - - /** - * Gets the files from the field. The field must be a File, a FileExtension, or a Seq or Set of either. - * @param field Field to get files. - * @return for the field. - */ - def getFieldFiles(field: ArgumentSource): Seq[File] = { - var files: Seq[File] = Nil - CollectionUtils.foreach(getFieldValue(field), (fieldValue) => { - val file = fieldValueToFile(field, fieldValue) - if (file != null) - files :+= file - }) - files.distinct - } - - /** - * Gets the file from the field. The field must be a File or a FileExtension and not a Seq or Set. - * @param field Field to get the file. - * @return for the field. - */ - def getFieldFile(field: ArgumentSource): File = - fieldValueToFile(field, getFieldValue(field)) - - /** - * Converts the field value to a file. The field must be a File or a FileExtension. - * @param field Field to get the file. - * @param value Value of the File or FileExtension or null. - * @return Null if value is null, otherwise the File. - * @throws QException if the value is not a File or FileExtension. - */ - private def fieldValueToFile(field: ArgumentSource, value: Any): File = value match { - case file: File => file - case null => null - case unknown => throw new QException("Non-file found. Try removing the annotation, change the annotation to @Argument, or extend File with FileExtension: %s: %s".format(field.field, unknown)) - } - - /** - * After a function is frozen no more updates are allowed by the user. - * The function is allow to make necessary updates internally to make sure - * the inputs and outputs will be equal to other inputs and outputs. - */ - final def freeze() { - freezeFieldValues() - canonFieldValues() - } - - /** - * Sets all field values. 
- */ - def freezeFieldValues() { - if (jobName == null) - jobName = qSettings.runName + "-" + this.addOrder.mkString("-") - - if (jobOutputFile == null) { - jobOutputFile = firstOutput match { - case file: File if (!IOUtils.isSpecialFile(file)) => new File(file.getParentFile, file.getName + ".out") - case _ => new File(jobName + ".out") - } - } - - if (jobTempDir == null) - jobTempDir = qSettings.tempDirectory - - if (jobLocalDir == null) - jobLocalDir = jobTempDir - - if (jobPriority.isEmpty) - jobPriority = qSettings.jobPriority - - // Do not set the temp and local dir relative to the command directory - jobTempDir = IOUtils.absolute(jobTempDir) - jobLocalDir = IOUtils.absolute(jobLocalDir) - - absoluteCommandDirectory() - } - - /** - * If the command directory is relative, insert the run directory ahead of it. - */ - def absoluteCommandDirectory() { - commandDirectory = IOUtils.absolute(qSettings.runDirectory, commandDirectory) - } - - /** - * Makes all field values canonical so that the graph can match the - * inputs of one function to the output of another using equals(). - */ - def canonFieldValues() { - for (field <- this.functionFields) { - var fieldValue = this.getFieldValue(field) - fieldValue = CollectionUtils.updated(fieldValue, canon).asInstanceOf[AnyRef] - this.setFieldValue(field, fieldValue) - } - - this.jobOutputFile = canon(this.jobOutputFile).asInstanceOf[File] - if (this.jobErrorFile != null) - this.jobErrorFile = canon(this.jobErrorFile).asInstanceOf[File] - } - - /** - * Set value to a uniform value across functions. - * Base implementation changes any relative path to an absolute path. - * @param value to be updated - * @return the modified value, or a copy if the value is immutable - */ - protected def canon(value: Any) = { - value match { - case file: File => IOUtils.absolute(commandDirectory, file) - case x => x - } - } - - /** - * Scala sugar type for checking annotation required and exclusiveOf. 
- */ - private type ArgumentAnnotation = { - def required(): Boolean - def exclusiveOf(): String - def doc(): String - } - - /** - * Returns the isRequired value from the field. - * @param field Field to check. - * @param annotation Annotation. - * @return the isRequired value from the field annotation. - */ - private def isRequired(field: ArgumentSource, annotation: Class[_ <: Annotation]) = - ReflectionUtils.getAnnotation(field.field, annotation).asInstanceOf[ArgumentAnnotation].required() - - /** - * Returns an array of ArgumentSources from functionFields listed in the exclusiveOf of the original field - * @param field Field to check. - * @param annotation Annotation. - * @return the Array[ArgumentSource] that may be set instead of the field. - */ - private def exclusiveOf(field: ArgumentSource, annotation: Class[_ <: Annotation]) = - ReflectionUtils.getAnnotation(field.field, annotation).asInstanceOf[ArgumentAnnotation].exclusiveOf() - .split(",").map(_.trim).filter(_.length > 0) - .map(fieldName => functionFields.find(fieldName == _.field.getName) match { - case Some(x) => x - case None => throw new QException("Unable to find exclusion field %s on %s".format(fieldName, this.getClass.getSimpleName)) - }) - - /** - * Returns the doc value from the field. - * @param field Field to check. - * @param annotation Annotation. - * @return the doc value from the field annotation. - */ - private def doc(field: ArgumentSource, annotation: Class[_ <: Annotation]) = - ReflectionUtils.getAnnotation(field.field, annotation).asInstanceOf[ArgumentAnnotation].doc() - - /** - * Returns true if the field has a value. - * @param source Field to check for a value. - * @return true if the field has a value. - */ - protected def hasFieldValue(source: ArgumentSource) = this.hasValue(this.getFieldValue(source)) - - /** - * Returns false if the value is null or an empty collection. - * @param param Value to test for null, or a collection to test if it is empty. 
- * @return false if the value is null, or false if the collection is empty, otherwise true. - */ - protected def hasValue(param: Any) = CollectionUtils.isNotNullOrNotEmpty(param) - - /** - * Gets the value of a field. - * @param source Field to get the value for. - * @return value of the field. - */ - def getFieldValue(source: ArgumentSource) = ClassFieldCache.getFieldValue(this, source) - - /** - * Gets the value of a field. - * @param source Field to set the value for. - * @return value of the field. - */ - def setFieldValue(source: ArgumentSource, value: Any) = ClassFieldCache.setFieldValue(this, source, value) -} diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala deleted file mode 100644 index 251b1c511..000000000 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala +++ /dev/null @@ -1,231 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.queue.pipeline - -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.Utils -import org.testng.Assert -import org.broadinstitute.sting.commandline.CommandLineProgram -import java.util.Date -import java.text.SimpleDateFormat -import org.broadinstitute.sting.BaseTest -import org.broadinstitute.sting.MD5DB -import org.broadinstitute.sting.queue.QCommandLine -import org.broadinstitute.sting.queue.util.Logging -import java.io.File -import org.broadinstitute.sting.gatk.report.GATKReport -import org.apache.commons.io.FileUtils -import org.broadinstitute.sting.queue.engine.CommandLinePluginManager - -object PipelineTest extends BaseTest with Logging { - - private val validationReportsDataLocation = "/humgen/gsa-hpprojects/GATK/validationreports/submitted/" - private val md5DB = new MD5DB() - - /** - * All the job runners configured to run PipelineTests at The Broad. - */ - final val allJobRunners = Seq("Lsf706", "GridEngine", "Shell") - - /** - * The default job runners to run. - */ - final val defaultJobRunners = Seq("Lsf706", "GridEngine") - - /** - * Returns the top level output path to this test. - * @param testName The name of the test passed to PipelineTest.executeTest() - * @param jobRunner The name of the job manager to run the jobs. - * @return the top level output path to this test. - */ - def testDir(testName: String, jobRunner: String) = "pipelinetests/%s/%s/".format(testName, jobRunner) - - /** - * Returns the directory where relative output files will be written for this test. - * @param testName The name of the test passed to PipelineTest.executeTest() - * @param jobRunner The name of the job manager to run the jobs. 
- * @return the directory where relative output files will be written for this test. - */ - private def runDir(testName: String, jobRunner: String) = testDir(testName, jobRunner) + "run/" - - /** - * Returns the directory where temp files will be written for this test. - * @param testName The name of the test passed to PipelineTest.executeTest() - * @param jobRunner The name of the job manager to run the jobs. - * @return the directory where temp files will be written for this test. - */ - private def tempDir(testName: String, jobRunner: String) = testDir(testName, jobRunner) + "temp/" - - /** - * Runs the pipelineTest. - * @param pipelineTest test to run. - */ - def executeTest(pipelineTest: PipelineTestSpec) { - var jobRunners = pipelineTest.jobRunners - if (jobRunners == null) - jobRunners = defaultJobRunners; - jobRunners.foreach(executeTest(pipelineTest, _)) - } - - /** - * Runs the pipelineTest. - * @param pipelineTest test to run. - * @param jobRunner The name of the job manager to run the jobs. 
- */ - def executeTest(pipelineTest: PipelineTestSpec, jobRunner: String) { - val name = pipelineTest.name - if (name == null) - Assert.fail("PipelineTestSpec.name is null") - println(Utils.dupString('-', 80)); - executeTest(name, pipelineTest.args, pipelineTest.jobQueue, pipelineTest.expectedException, jobRunner) - if (BaseTest.pipelineTestRunModeIsSet) { - assertMatchingMD5s(name, pipelineTest.fileMD5s.map{case (file, md5) => new File(runDir(name, jobRunner), file) -> md5}, pipelineTest.parameterize) - if (pipelineTest.evalSpec != null) - validateEval(name, pipelineTest.evalSpec, jobRunner) - println(" => %s PASSED (%s)".format(name, jobRunner)) - } - else - println(" => %s PASSED DRY RUN (%s)".format(name, jobRunner)) - } - - private def assertMatchingMD5s(name: String, fileMD5s: Traversable[(File, String)], parameterize: Boolean) { - var failed = 0 - for ((file, expectedMD5) <- fileMD5s) { - val calculatedMD5 = md5DB.testFileMD5(name, "", file, expectedMD5, parameterize).actualMD5 - if (!parameterize && expectedMD5 != "" && expectedMD5 != calculatedMD5) - failed += 1 - } - if (failed > 0) - Assert.fail("%d of %d MD5s did not match".format(failed, fileMD5s.size)) - } - - private def validateEval(name: String, evalSpec: PipelineTestEvalSpec, jobRunner: String) { - // write the report to the shared validation data location - val formatter = new SimpleDateFormat("yyyy.MM.dd.HH.mm.ss") - val reportLocation = "%s%s/%s/validation.%s.eval".format(validationReportsDataLocation, jobRunner, name, formatter.format(new Date)) - val reportFile = new File(reportLocation) - - FileUtils.copyFile(new File(runDir(name, jobRunner) + evalSpec.evalReport), reportFile); - - val report = new GATKReport(reportFile); - - var allInRange = true - - println() - println(name + " validation values:") - println(" value (min,target,max) table key metric") - for (validation <- evalSpec.validations) { - val table = report.getTable(validation.table) - val key = 
table.findRowByData(validation.table +: validation.key.split('.') : _*) - val value = String.valueOf(table.get(key, validation.metric)) - val inRange = if (value == null) false else validation.inRange(value) - val flag = if (!inRange) "*" else " " - println(" %s %s (%s,%s,%s) %s %s %s".format(flag, value, validation.min, validation.target, validation.max, validation.table, validation.key, validation.metric)) - allInRange &= inRange - } - - if (!allInRange) - Assert.fail("Eval outside of expected range") - } - - /** - * execute the test - * @param name the name of the test - * @param args the argument list - * @param jobQueue the queue to run the job on. Defaults to hour if jobQueue is null. - * @param expectedException the expected exception or null if no exception is expected. - * @param jobRunner The name of the job manager to run the jobs. - */ - private def executeTest(name: String, args: String, jobQueue: String, expectedException: Class[_], jobRunner: String) { - var command = Utils.escapeExpressions(args) - - // add the logging level to each of the integration test commands - - command = Utils.appendArray(command, "-jobRunner", jobRunner, - "-tempDir", tempDir(name, jobRunner), "-runDir", runDir(name, jobRunner)) - - if (jobQueue != null) - command = Utils.appendArray(command, "-jobQueue", jobQueue) - - if (BaseTest.pipelineTestRunModeIsSet) - command = Utils.appendArray(command, "-run") - - // run the executable - var gotAnException = false - - val instance = new QCommandLine - runningCommandLines += instance - try { - println("Executing test %s with Queue arguments: %s".format(name, Utils.join(" ",command))) - CommandLineProgram.start(instance, command) - } catch { - case e: Exception => - gotAnException = true - if (expectedException != null) { - // we expect an exception - println("Wanted exception %s, saw %s".format(expectedException, e.getClass)) - if (expectedException.isInstance(e)) { - // it's the type we expected - println(String.format(" => %s 
PASSED (%s)", name, jobRunner)) - } else { - e.printStackTrace() - Assert.fail("Test %s expected exception %s but got %s instead (%s)".format( - name, expectedException, e.getClass, jobRunner)) - } - } else { - // we didn't expect an exception but we got one :-( - throw new RuntimeException(e) - } - } finally { - instance.shutdown() - runningCommandLines -= instance - } - - // catch failures from the integration test - if (expectedException != null) { - if (!gotAnException) - // we expected an exception but didn't see it - Assert.fail("Test %s expected exception %s but none was thrown (%s)".format(name, expectedException.toString, jobRunner)) - } else { - if (CommandLineProgram.result != 0) - throw new RuntimeException("Error running Queue with arguments: " + args) - } - } - - private var runningCommandLines = Set.empty[QCommandLine] - - Runtime.getRuntime.addShutdownHook(new Thread { - /** Cleanup as the JVM shuts down. */ - override def run() { - runningCommandLines.foreach(commandLine => - try { - commandLine.shutdown() - } catch { - case _: Throwable => /* ignore */ - }) - } - }) -} diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala deleted file mode 100644 index a27af17be..000000000 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala +++ /dev/null @@ -1,62 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this 
permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.queue.pipeline - -class PipelineTestSpec(var name: String = null) { - - /** The arguments to pass to the Queue test, ex: "-S scala/qscript/examples/HelloWorld.scala" */ - var args: String = _ - - /** Job Queue to run the test. Default is null which means use hour. */ - var jobQueue: String = _ - - /** Job runners to run the test. Default is null which means use the default. */ - var jobRunners: Seq[String] = _ - - /** Expected MD5 results for each file path. */ - var fileMD5s = Map.empty[String, String] - - /** VariantEval validations to run on a VCF after the pipeline has completed. */ - var evalSpec: PipelineTestEvalSpec = _ - - /** Expected exception from the test. */ - var expectedException: Class[_ <: Exception] = null - - /** If true will check the MD5s without failing. 
*/ - var parameterize = false - - def this(args: String, fileMD5s: Traversable[(String, String)]) = { - this() - this.args = args - this.fileMD5s = fileMD5s.toMap - } - - def this(args: String, expectedException: Class[_ <: Exception]) = { - this() - this.args = args - this.expectedException = expectedException - } -} diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala deleted file mode 100644 index 10c3245e5..000000000 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala +++ /dev/null @@ -1,68 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.queue.pipeline.examples - -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -import org.testng.annotations.Test -import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} -import org.broadinstitute.sting.BaseTest - -class DevNullOutputPipelineTest { - @Test(timeOut=36000000) - def testDevNullOutput() { - val spec = new PipelineTestSpec - spec.name = "devnulloutput" - spec.args = Array( - " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala", - " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta", - " -I " + BaseTest.publicTestDir + "exampleBAM.bam").mkString - spec.jobRunners = PipelineTest.allJobRunners - PipelineTest.executeTest(spec) - } -} diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala deleted file mode 100644 index 83e50c9c5..000000000 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala +++ /dev/null @@ -1,141 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.queue.pipeline.examples - -import org.testng.annotations.Test -import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} - -class HelloWorldPipelineTest { - @Test(timeOut=36000000) - def testHelloWorld() { - val spec = new PipelineTestSpec - spec.name = "HelloWorld" - spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" - spec.jobRunners = PipelineTest.allJobRunners - PipelineTest.executeTest(spec) - } - - @Test(timeOut=36000000) - def testHelloWorldWithRunName() { - val spec = new PipelineTestSpec - spec.name = "HelloWorldWithRunName" - spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + - " -runName HelloWorld" - spec.jobRunners = PipelineTest.allJobRunners - PipelineTest.executeTest(spec) - } - - @Test(timeOut=36000000) - def testHelloWorldWithMemoryLimit() { - val spec = new PipelineTestSpec - spec.name = "HelloWorldMemoryLimit" - spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + - " -memLimit 1.25" - spec.jobRunners = PipelineTest.allJobRunners - PipelineTest.executeTest(spec) - } - - @Test(timeOut=36000000) - def testHelloWorldWithPriority() { - val spec = new PipelineTestSpec - spec.name = "HelloWorldWithPriority" - spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + - " -jobPriority 100" - spec.jobRunners = PipelineTest.allJobRunners - PipelineTest.executeTest(spec) - } - - @Test(timeOut=36000000) - def testHelloWorldWithLsfResource() { - val spec = new PipelineTestSpec - spec.name = "HelloWorldWithLsfResource" - 
spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + - " -jobResReq rusage[iodine_io=1] -jobResReq select[swp>0] -jobResReq order[swp]" - spec.jobRunners = Seq("Lsf706") - PipelineTest.executeTest(spec) - } - - @Test(timeOut=36000000) - def testHelloWorldWithLsfResourceAndMemoryLimit() { - val spec = new PipelineTestSpec - spec.name = "HelloWorldWithLsfResourceAndMemoryLimit" - spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + - " -memLimit 1.25 -jobResReq rusage[iodine_io=1] -jobResReq select[swp>0] -jobResReq order[swp]" - spec.jobRunners = Seq("Lsf706") - PipelineTest.executeTest(spec) - } - - @Test(timeOut=36000000) - def testHelloWorldWithLsfEnvironment() { - val spec = new PipelineTestSpec - spec.name = "HelloWorldWithLsfEnvironment" - spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + - " -jobEnv tv" - spec.jobRunners = Seq("Lsf706") - PipelineTest.executeTest(spec) - } - - @Test(timeOut=36000000) - def testHelloWorldWithGridEngineResource() { - val spec = new PipelineTestSpec - spec.name = "HelloWorldWithGridEngineResource" - spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + - " -jobResReq s_core=1000M" - spec.jobRunners = Seq("GridEngine") - PipelineTest.executeTest(spec) - } - - @Test(timeOut=36000000) - def testHelloWorldWithGridEngineResourceAndMemoryLimit() { - val spec = new PipelineTestSpec - spec.name = "HelloWorldWithGridEngineResourceAndMemoryLimit" - spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + - " -memLimit 1.25 -jobResReq s_core=1000M" - spec.jobRunners = Seq("GridEngine") - PipelineTest.executeTest(spec) - } - - @Test(timeOut=36000000) - def testHelloWorldWithGridEngineEnvironment() { - val spec = new PipelineTestSpec - spec.name = 
"HelloWorldWithGridEngineEnvironment" - spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + - " -jobEnv \"make 1\"" - spec.jobRunners = Seq("GridEngine") - PipelineTest.executeTest(spec) - } - - // disabled because our DRMAA implementation doesn't support wallTime - @Test(enabled=false, timeOut=36000000) - def testHelloWorldWithWalltime() { - val spec = new PipelineTestSpec - spec.name = "HelloWorldWithWalltime" - spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + - " -wallTime 100" - spec.jobRunners = PipelineTest.allJobRunners - PipelineTest.executeTest(spec) - } -} diff --git a/public/sting-root/pom.xml b/public/sting-root/pom.xml new file mode 100644 index 000000000..00cecf981 --- /dev/null +++ b/public/sting-root/pom.xml @@ -0,0 +1,622 @@ + + + 4.0.0 + + + + org.broadinstitute.sting + sting-root + 3.0 + pom + Sting Root + + + 3.0.4 + + + + UTF-8 + ${sourceEncoding} + ${sourceEncoding} + 1.7 + 1.7 + yyyy/MM/dd HH:mm:ss + ${project.basedir}/../.. 
+ true + ${sting.committests.skipped} + ${sting.committests.skipped} + ${sting.committests.skipped} + true + true + false + 1g + 4g + 4 + 50 + 10 + -Xmx${test.maxmemory} -XX:+UseParallelOldGC -XX:ParallelGCThreads=${java.gc.threads} -XX:GCTimeLimit=${java.gc.timeLimit} -XX:GCHeapFreeLimit=${java.gc.heapFreeLimit} + + + 1.107.1683 + ${picard.public.version} + ${picard.public.version} + ${picard.public.version} + ${picard.public.version} + + + + + + + org.scala-lang + scala-compiler + 2.10.2 + + + org.scala-lang + scala-library + 2.10.2 + + + com.google.code.cofoja + cofoja + 1.0-r139 + + + net.sf + sam + ${sam.version} + + + org.testng + testng + + + + + net.sf + picard + ${picard.version} + + + org.broad + tribble + ${tribble.version} + + + org.broadinstitute + variant + ${variant.version} + + + log4j + log4j + 1.2.15 + + + com.sun.jdmk + jmxtools + + + javax.jms + jms + + + com.sun.jmx + jmxri + + + + + javax.mail + mail + 1.4.4 + + + colt + colt + 1.2.0 + + + it.unimi.dsi + fastutil + 6.5.3 + + + org.simpleframework + simple-xml + 2.0.4 + + + org.reflections + reflections + 0.9.8 + + + org.slf4j + slf4j-log4j12 + 1.6.1 + + + gov.nist.math + jama + 1.0.2 + + + net.sf.jgrapht + jgrapht + 0.8.3 + + + org.freemarker + freemarker + 2.3.18 + + + org.apache.commons + commons-email + 1.2 + + + org.apache.commons + commons-jexl + 2.1.1 + + + commons-lang + commons-lang + 2.5 + + + commons-logging + commons-logging + 1.1.1 + + + commons-io + commons-io + 2.1 + + + commons-collections + commons-collections + 3.2.1 + + + org.apache.commons + commons-math + 2.2 + + + net.java.dev.jna + jna + 3.2.7 + + + net.java.dev.jets3t + jets3t + 0.8.1 + + + us.levk + drmaa-gridengine + 6.2u5 + + + net.sf.snpeff + snpeff + 2.0.5 + + + org.mongodb + mongo-java-driver + 2.7.3 + + + com.google.code.gson + gson + 2.2.2 + + + org.apache.httpcomponents + httpclient + 4.1.1 + + + + + com.sun + tools + 1.4.2 + system + ${java.home}/../lib/tools.jar + + + + org.testng + testng + 6.8 + test + + + 
com.google.caliper + caliper + 0.5-rc1 + test + + + + com.google.guava + guava + + + + + + + + + + + + org.codehaus.mojo + exec-maven-plugin + 1.2.1 + + + com.lukegb.mojo + gitdescribe-maven-plugin + 2.0 + + + org.codehaus.mojo + build-helper-maven-plugin + 1.8 + + + org.apache.maven.plugins + maven-clean-plugin + 2.5 + + + org.apache.maven.plugins + maven-dependency-plugin + 2.8 + + + org.apache.maven.plugins + maven-resources-plugin + 2.6 + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.9.1 + + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + + org.scala-tools + maven-scala-plugin + 2.15.2 + + + -Xmx${scala.maxmemory} + + + + + org.apache.maven.plugins + maven-jar-plugin + 2.4 + + + org.apache.maven.plugins + maven-shade-plugin + 2.1 + + + org.apache.maven.plugins + maven-assembly-plugin + 2.4 + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.16 + + + true + false + false + plain + ${test.args} + + + usedefaultlisteners + false + + + + diff + true + ${java.io.tmpdir} + + + + + default-test + none + + + + unit-tests + + test + + + + ${sting.unittests.skipped} + ${project.build.directory}/surefire-reports/unit/${test} + + **/*UnitTest.class + + + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + 2.16 + + + true + false + false + plain + ${test.args} + + + usedefaultlisteners + false + + + + diff + true + + ${sting.pipelinetests.run} + ${java.io.tmpdir} + + + + + integration-tests + + integration-test + verify + + + + + ${sting.integrationtests.skipped} + ${project.build.directory}/failsafe-reports/integration/${it.test} + ${project.build.directory}/failsafe-reports/integration/failsafe-summary-${it.test}.xml + + **/*IntegrationTest.class + + + + + pipeline-tests + + integration-test + verify + + + + + ${sting.pipelinetests.skipped} + ${project.build.directory}/failsafe-reports/pipeline/${it.test} + ${project.build.directory}/failsafe-reports/pipeline/failsafe-summary-${it.test}.xml + + **/*PipelineTest.class + + + + + 
large-scale-tests + + integration-test + verify + + + + ${sting.largescaletests.skipped} + ${project.build.directory}/failsafe-reports/largescale/${it.test} + ${project.build.directory}/failsafe-reports/largescale/failsafe-summary-${it.test}.xml + + **/*LargeScaleTest.class + + + + + knowledge-base-tests + + integration-test + verify + + + + ${sting.knowledgebasetests.skipped} + ${project.build.directory}/failsafe-reports/knowledgebasetests/${it.test} + ${project.build.directory}/failsafe-reports/knowledgebasetests/failsafe-summary-${it.test}.xml + + **/*KnowledgeBaseTest.class + + + + + + + + com.google.code.sortpom + maven-sortpom-plugin + 2.2 + + false + custom_1 + \n + ${sourceEncoding} + true + scope + 4 + false + + + + + sort + + verify + + + + + + + com.pyx4j + maven-junction-plugin + 1.0.3 + + + org.apache.maven.plugins + maven-invoker-plugin + 1.8 + + + org.apache.maven.plugins + maven-install-plugin + 2.5 + + + org.apache.maven.plugins + maven-site-plugin + 3.3 + + + + + + + + com.lukegb.mojo + gitdescribe-maven-plugin + + + --long + + true + git.version + exported + + + + gitdescribe-initialize + + gitdescribe + + initialize + + + gitdescribe-presite + + gitdescribe + + pre-site + + + + + org.codehaus.mojo + build-helper-maven-plugin + + + + fix-version-initialize + + regex-property + + initialize + + build.version + ${git.version} + git- + + + + fix-version-pre-site + + regex-property + + pre-site + + build.version + ${git.version} + git- + + + + + + com.google.code.sortpom + maven-sortpom-plugin + + + default + + sort + + verify + + + + + + + + true + + + + + sting.public.repo.local + Sting Public Local Repository + file:${sting.basedir}/public/repo + + + + diff --git a/public/sting-utils/pom.xml b/public/sting-utils/pom.xml new file mode 100644 index 000000000..6abf98515 --- /dev/null +++ b/public/sting-utils/pom.xml @@ -0,0 +1,159 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-aggregator + 3.0 + ../.. 
+ + + sting-utils + jar + Sting Utils + + + ${project.basedir}/../.. + org/broadinstitute/sting/utils/R + gsalib.tar.gz + + + + + net.sf + sam + + + net.sf + picard + + + org.broad + tribble + + + org.broadinstitute + variant + + + log4j + log4j + + + colt + colt + + + it.unimi.dsi + fastutil + + + org.simpleframework + simple-xml + + + org.reflections + reflections + + + org.slf4j + slf4j-log4j12 + + + org.freemarker + freemarker + + + org.apache.commons + commons-jexl + + + commons-lang + commons-lang + + + commons-io + commons-io + + + commons-collections + commons-collections + + + org.apache.commons + commons-math + + + net.java.dev.jna + jna + + + net.java.dev.jets3t + jets3t + + + us.levk + drmaa-gridengine + + + com.google.code.gson + gson + + + org.apache.httpcomponents + httpclient + + + com.google.code.cofoja + cofoja + + + ${project.groupId} + gsalib + ${project.version} + tar.gz + + + + org.testng + testng + test + + + com.google.caliper + caliper + test + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-gsalib + + copy + + process-resources + + + + ${project.groupId} + gsalib + ${project.version} + tar.gz + ${project.build.outputDirectory}/${gsalib.packagedir} + ${gsalib.filename} + + + + + + + + + diff --git a/public/sting-utils/src/main/config/org/broadinstitute/sting/utils/help/log4j.properties b/public/sting-utils/src/main/config/org/broadinstitute/sting/utils/help/log4j.properties new file mode 100644 index 000000000..38c8335c9 --- /dev/null +++ b/public/sting-utils/src/main/config/org/broadinstitute/sting/utils/help/log4j.properties @@ -0,0 +1,7 @@ +# Root logger option +log4j.rootLogger=INFO, stdout + +# Direct log messages to stdout +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.Target=System.out +log4j.appender.stdout.layout=org.apache.log4j.SimpleLayout diff --git a/settings/helpTemplates/common.html b/settings/helpTemplates/common.html index f4fb74af1..ff9df5eea 100644 --- 
a/settings/helpTemplates/common.html +++ b/settings/helpTemplates/common.html @@ -86,7 +86,13 @@ Support Forum

-

GATK version ${version} built at ${timestamp}.

+

GATK version ${version} built at ${timestamp}. + <#-- closing P tag in next macro --> + + + <#macro footerClose> + <#-- ugly little hack to enable adding tool-specific info inline --> +

<#macro pageFooter> diff --git a/settings/helpTemplates/generic.index.template.html b/settings/helpTemplates/generic.index.template.html index a5650d55e..0398b829d 100644 --- a/settings/helpTemplates/generic.index.template.html +++ b/settings/helpTemplates/generic.index.template.html @@ -58,7 +58,7 @@ ${version}
- <#assign seq = ["engine", "tools", "utilities", "other"]> + <#assign seq = ["engine", "tools", "utilities", "other", "dev"]> <#list seq as supercat>
<#list groups?sort_by("name") as group> @@ -70,4 +70,5 @@
<@footerInfo /> +<@footerClose /> <@pageFooter /> diff --git a/settings/helpTemplates/generic.template.html b/settings/helpTemplates/generic.template.html index eea741669..d4aa7c7f9 100644 --- a/settings/helpTemplates/generic.template.html +++ b/settings/helpTemplates/generic.template.html @@ -31,45 +31,70 @@ <#list myargs as arg> - ${arg.name} - ${arg.type} + ${arg.name}
+ <#if arg.synonyms??> + <#if arg.name[2..] != arg.synonyms[1..]> +  ${arg.synonyms} + + + + ${arg.defaultValue!"NA"} ${arg.summary} - <#-- - < - td>${arg.required} - --> <#macro argumentDetails arg> -

${arg.name} - <#if arg.synonyms??> / ${arg.synonyms} - - ( - <#if arg.attributes??>${arg.attributes} - ${arg.type} - <#if arg.defaultValue??> with default value ${arg.defaultValue} - ) -

-

- ${arg.summary}. ${arg.fulltext} - <#if arg.rodTypes??>${arg.name} binds reference ordered data. This argument supports ROD files of the - following types: ${arg.rodTypes} - - <#if arg.options??> -
- The ${arg.name} argument is an enumerated type (${arg.type}), which can have one of the following values: -

- <#list arg.options as option> -
${option.name}
-
${option.summary}
- -
- -

+
+

${arg.name} + <#if arg.synonyms??> / ${arg.synonyms} +

+

+ ${arg.summary}
+ ${arg.fulltext} +

+ + + <#if arg.rodTypes??> +

${arg.name} binds reference ordered data. This argument supports ROD files of the following types: ${arg.rodTypes}

+ + <#if arg.options??> +

+ The ${arg.name} argument is an enumerated type (${arg.type}), which can have one of the following values: +

+ <#list arg.options as option> +
${option.name}
+
${option.summary}
+ +
+

+ +

<#if arg.required??> + <#if arg.required == "yes"> + R + + + ${arg.type} + <#if arg.defaultValue??> +  ${arg.defaultValue} + + <#if arg.minValue??> +  [ [ ${arg.minValue} + + <#if arg.minRecValue??> +  [ ${arg.minRecValue} + + <#if arg.maxRecValue??> +  ${arg.maxRecValue} ] + + <#if arg.maxValue??> +  ${arg.maxValue} ] ] + +

<#macro relatedByType name type> <#list relatedDocs as relatedDoc> @@ -103,11 +128,12 @@

${name}

${summary}

+ <#-- using goto dev annotation instead, see above footer <#if author??>

Author ${author}

- + --> <#if group?? >

Category ${group} @@ -229,12 +255,12 @@ <#-- Create the argument summary --> <#if arguments.all?size != 0>

${name} specific arguments

-

This table summarizes the command-line arguments that are specific to this tool. For details, see the list further down below the table.

+

This table summarizes the command-line arguments that are specific to this tool. For more details on each argument, see the list further down below the table or click on an argument name to jump directly to that entry in the list.

- - + + @@ -267,6 +293,11 @@ <@argumentDetails arg=arg/> - + <@footerInfo /> + <#-- Specify go-to developer (for internal use) --> + <#if gotoDev??> + GTD: ${gotoDev} + + <@footerClose /> <@pageFooter /> \ No newline at end of file diff --git a/settings/ivysettings.properties b/settings/ivysettings.properties deleted file mode 100644 index 8a67434f9..000000000 --- a/settings/ivysettings.properties +++ /dev/null @@ -1 +0,0 @@ -repository.dir=${ivy.settings.dir}/repository diff --git a/settings/ivysettings.xml b/settings/ivysettings.xml deleted file mode 100644 index ce7667140..000000000 --- a/settings/ivysettings.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - diff --git a/settings/repository/com.google.code.caliper/caliper-1.0-SNAPSHOT.jar b/settings/repository/com.google.code.caliper/caliper-1.0-SNAPSHOT.jar deleted file mode 100644 index 3f7007457..000000000 Binary files a/settings/repository/com.google.code.caliper/caliper-1.0-SNAPSHOT.jar and /dev/null differ diff --git a/settings/repository/com.google.code.caliper/caliper-1.0-SNAPSHOT.xml b/settings/repository/com.google.code.caliper/caliper-1.0-SNAPSHOT.xml deleted file mode 100644 index 0e2b727e1..000000000 --- a/settings/repository/com.google.code.caliper/caliper-1.0-SNAPSHOT.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/com.google.code.cofoja/cofoja-1.0-r139.xml b/settings/repository/com.google.code.cofoja/cofoja-1.0-r139.xml deleted file mode 100644 index 202d3d0a3..000000000 --- a/settings/repository/com.google.code.cofoja/cofoja-1.0-r139.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/edu.mit.broad/picard-private-parts-2872.jar b/settings/repository/edu.mit.broad/picard-private-parts-2872.jar deleted file mode 100644 index b6e685684..000000000 Binary files a/settings/repository/edu.mit.broad/picard-private-parts-2872.jar and /dev/null differ diff --git a/settings/repository/edu.mit.broad/picard-private-parts-2872.xml 
b/settings/repository/edu.mit.broad/picard-private-parts-2872.xml deleted file mode 100644 index 677d27d80..000000000 --- a/settings/repository/edu.mit.broad/picard-private-parts-2872.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/gov.nist/Jama-1.0.2.jar b/settings/repository/gov.nist/Jama-1.0.2.jar deleted file mode 100644 index 824d13338..000000000 Binary files a/settings/repository/gov.nist/Jama-1.0.2.jar and /dev/null differ diff --git a/settings/repository/gov.nist/Jama-1.0.2.xml b/settings/repository/gov.nist/Jama-1.0.2.xml deleted file mode 100644 index 57ea106ea..000000000 --- a/settings/repository/gov.nist/Jama-1.0.2.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2-sources.jar b/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2-sources.jar deleted file mode 100644 index dc77c7d33..000000000 Binary files a/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2-sources.jar and /dev/null differ diff --git a/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.jar b/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.jar deleted file mode 100644 index f267be4b5..000000000 Binary files a/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.jar and /dev/null differ diff --git a/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.xml b/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.xml deleted file mode 100644 index c6a8da052..000000000 --- a/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.5.xml b/settings/repository/net.sf.snpeff/snpeff-2.0.5.xml deleted file mode 100644 index 9a622abe5..000000000 --- a/settings/repository/net.sf.snpeff/snpeff-2.0.5.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/net.sf/picard-1.104.1628.jar b/settings/repository/net.sf/picard-1.104.1628.jar deleted file mode 100644 index 0a24f339e..000000000 
Binary files a/settings/repository/net.sf/picard-1.104.1628.jar and /dev/null differ diff --git a/settings/repository/net.sf/picard-1.104.1628.xml b/settings/repository/net.sf/picard-1.104.1628.xml deleted file mode 100644 index bd0c9fbcd..000000000 --- a/settings/repository/net.sf/picard-1.104.1628.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/net.sf/sam-1.104.1628.jar b/settings/repository/net.sf/sam-1.104.1628.jar deleted file mode 100644 index ad8437d03..000000000 Binary files a/settings/repository/net.sf/sam-1.104.1628.jar and /dev/null differ diff --git a/settings/repository/net.sf/sam-1.104.1628.xml b/settings/repository/net.sf/sam-1.104.1628.xml deleted file mode 100644 index 0b0cba93f..000000000 --- a/settings/repository/net.sf/sam-1.104.1628.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/org.broad/tribble-1.104.1628.jar b/settings/repository/org.broad/tribble-1.104.1628.jar deleted file mode 100644 index 134be3d94..000000000 Binary files a/settings/repository/org.broad/tribble-1.104.1628.jar and /dev/null differ diff --git a/settings/repository/org.broad/tribble-1.104.1628.xml b/settings/repository/org.broad/tribble-1.104.1628.xml deleted file mode 100644 index c2158e974..000000000 --- a/settings/repository/org.broad/tribble-1.104.1628.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/org.broadinstitute/variant-1.104.1628.jar b/settings/repository/org.broadinstitute/variant-1.104.1628.jar deleted file mode 100644 index 59f4f10d7..000000000 Binary files a/settings/repository/org.broadinstitute/variant-1.104.1628.jar and /dev/null differ diff --git a/settings/repository/org.broadinstitute/variant-1.104.1628.xml b/settings/repository/org.broadinstitute/variant-1.104.1628.xml deleted file mode 100644 index 527223bfb..000000000 --- a/settings/repository/org.broadinstitute/variant-1.104.1628.xml +++ /dev/null @@ -1,3 +0,0 @@ - - -
NameTypeArgument name(s) Default value Summary