Merge remote-tracking branch 'unstable/master'

This commit is contained in:
David Roazen 2014-03-05 23:41:48 -05:00
commit 8fedaf541c
1767 changed files with 88621 additions and 85410 deletions

2
.gitignore vendored
View File

@ -24,3 +24,5 @@ dump/
lib/
out/
/atlassian-ide-plugin.xml
maven-metadata-local.xml
dependency-reduced-pom.xml

173
ant-bridge.sh 100755
View File

@ -0,0 +1,173 @@
#!/bin/sh
mvn_args="verify"
mvn_properties=
mvn_clean=
unknown_args=
property_regex='-D(.*)=(.*)'
unit_test_regex='.*UnitTest'
post_script=
run_type="run"
for arg in "${@}" ; do
if [[ "${arg}" == "dry" ]] ; then
run_type="dry"
elif [[ "${arg}" == "clean" ]] ; then
mvn_clean="clean"
mvn_args=
elif [[ "${arg}" =~ ${property_regex} ]] ; then
property_name=${BASH_REMATCH[1]}
property_value=${BASH_REMATCH[2]}
if [[ "${property_name}" == "single" ]] ; then
test_property="test"
test_disabled="it.test"
if [[ ! "${property_value}" =~ ${unit_test_regex} ]] ; then
test_property="it.test"
test_disabled="test"
fi
mvn_properties="${mvn_properties} -D${test_disabled}=disabled -D${test_property}=${property_value}"
elif [[ "${property_name}" == "test.debug.port" ]] ; then
mvn_properties="${mvn_properties} -Dmaven.surefire.debug=\"-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=${property_value}\""
mvn_properties="${mvn_properties} -Dmaven.failsafe.debug=\"-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=${property_value}\""
elif [[ "${property_name}" == "test.default.maxmemory" ]] ; then
mvn_properties="${mvn_properties} -Dtest.maxmemory=${property_value}"
else
unknown_args="${unknown_args} \"${arg}\""
fi
else
if [[ "${arg}" != "dist" && "${mvn_args}" != "" && "${mvn_args}" != "verify" ]] ; then
echo "Sorry, this script does not currently support mixing targets." >&2
exit 1
elif [[ "${arg}" == "dist" ]] ; then
mvn_args="verify"
elif [[ "${arg}" == "gatk" ]] ; then
mvn_args="verify '-P!queue'"
elif [[ "${arg}" == "test.compile" ]] ; then
mvn_args="test-compile"
elif [[ "${arg}" == "gatkdocs" ]] ; then
local_repo="sitetemprepo"
mvn_args="install -Dmaven.repo.local=${local_repo} -Ddisable.queue && mvn site -Dmaven.repo.local=${local_repo} -Ddisable.queue"
elif [[ "${arg}" == "package.gatk.full" ]] ; then
mvn_args="package '-P!private,!queue'"
elif [[ "${arg}" == "package.gatk.all" ]] ; then
mvn_args="package '-P!queue'"
elif [[ "${arg}" == "package.queue.full" ]] ; then
mvn_args="package '-P!private'"
elif [[ "${arg}" == "package.queue.all" ]] ; then
mvn_args="package"
# elif [[ "${arg}" == "release.gatk.full" ]] ; then
# mvn_args="package '-P!private,!queue'"
# post_script=" && private/src/main/scripts/shell/copy_release.sh public/gatk-package/target/GenomeAnalysisTK-*.tar.bz2"
# elif [[ "${arg}" == "release.queue.full" ]] ; then
# mvn_args="package '-P!private'"
# post_script=" && private/src/main/scripts/shell/copy_release.sh public/queue-package/target/Queue-*.tar.bz2"
elif [[ "${arg}" == "build-picard-private" ]] ; then
mvn_args="mvn install -f private/picard-maven/pom.xml"
# TODO: clover support
# see ant and maven docs for clover:
# https://confluence.atlassian.com/display/CLOVER/1.+QuickStart+Guide
# https://confluence.atlassian.com/display/CLOVER/Clover-for-Maven+2+and+3+User%27s+Guide
#
#elif [[ "${arg}" == "clover.report" ]] ; then
# mvn_args=...
#
#elif [[ "${arg}" == "with.clover" ]] ; then
# mvn_args=...
# TODO: This runs *all* commit tests, including the few on Queue.
elif [[ "${arg}" == "gatkfull.binary.release.tests" ]] ; then
local_repo="sitetemprepo"
mvn_args="install -Dmaven.repo.local=${local_repo} && mvn verify"
mvn_args="${mvn_args} -Dmaven.repo.local=${local_repo}"
mvn_args="${mvn_args} -Dsting.packagetests.enabled=true"
mvn_args="${mvn_args} -Dsting.packagecommittests.skipped=false"
# TODO: This runs only the pipeline tests (full, non-dry run), but not the commit tests for Queue.
elif [[ "${arg}" == "queuefull.binary.release.tests" ]] ; then
local_repo="sitetemprepo"
mvn_args="install -Dmaven.repo.local=${local_repo} && mvn verify"
mvn_args="${mvn_args} -Dmaven.repo.local=${local_repo}"
mvn_args="${mvn_args} -Dsting.packagetests.enabled=true"
mvn_args="${mvn_args} -Dsting.packagepipelinetests.skipped=false"
mvn_args="${mvn_args} -Dsting.pipelinetests.run=true"
elif [[ "${arg}" == "committests" ]] ; then
mvn_args="verify -Dsting.committests.skipped=false"
elif [[ "${arg}" == "test" ]] ; then
mvn_args="test -Dsting.unittests.skipped=false"
elif [[ "${arg}" == "unittest" ]] ; then
mvn_args="test -Dsting.unittests.skipped=false"
elif [[ "${arg}" == "integrationtest" ]] ; then
mvn_args="verify -Dsting.integrationtests.skipped=false"
elif [[ "${arg}" == "largescaletest" ]] ; then
mvn_args="verify -Dsting.largescaletests.skipped=false"
elif [[ "${arg}" == "knowledgebasetest" ]] ; then
mvn_args="verify -Dsting.knowledgebasetests.skipped=false"
elif [[ "${arg}" == "pipelinetest" ]] ; then
mvn_args="verify -Dsting.pipelinetests.skipped=false"
elif [[ "${arg}" == "pipelinetestrun" ]] ; then
mvn_args="verify -Dsting.pipelinetests.skipped=false -Dsting.pipelinetests.run=true"
elif [[ "${arg}" == "fasttest" ]] ; then
mvn_args="verify -Dsting.committests.skipped=false -pl private/gatk-private -am -Dresource.bundle.skip=true"
else
unknown_args="${unknown_args} \"${arg}\""
fi
fi
done
mvn_cmd=
if [[ "${mvn_clean}" != "" ]] ; then
if [[ "${mvn_args}" != "" ]] ; then
mvn_cmd="mvn ${mvn_clean} && mvn ${mvn_args}"
else
mvn_cmd="mvn ${mvn_clean}"
fi
else
mvn_cmd="mvn ${mvn_args}"
fi
if [[ "${unknown_args}" != "" ]] ; then
echo "Unrecognized arguments:${unknown_args}" >&2
else
echo "Equivalent maven command"
echo "${mvn_cmd}${mvn_properties}${post_script}"
if [[ "${run_type}" != "dry" ]] ; then
sh -c "${mvn_cmd}${mvn_properties}${post_script}"
fi
fi

1518
build.xml

File diff suppressed because it is too large Load Diff

117
ivy.xml
View File

@ -1,117 +0,0 @@
<!--
~ Copyright (c) 2012, The Broad Institute
~
~ Permission is hereby granted, free of charge, to any person
~ obtaining a copy of this software and associated documentation
~ files (the "Software"), to deal in the Software without
~ restriction, including without limitation the rights to use,
~ copy, modify, merge, publish, distribute, sublicense, and/or sell
~ copies of the Software, and to permit persons to whom the
~ Software is furnished to do so, subject to the following
~ conditions:
~
~ The above copyright notice and this permission notice shall be
~ included in all copies or substantial portions of the Software.
~ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
~ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
~ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
~ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
~ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
~ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
~ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
~ OTHER DEALINGS IN THE SOFTWARE.
-->
<ivy-module version="1.0">
<info organisation="org.broadinstitute" module="Sting"/>
<configurations>
<conf name="default" description="the core dependencies for the GATK"/>
</configurations>
<dependencies defaultconf="default">
<dependency org="net.sf" name="sam" rev="latest.integration"/>
<dependency org="net.sf" name="picard" rev="latest.integration"/>
<dependency org="edu.mit.broad" name="picard-private-parts" rev="latest.integration"/>
<!-- Tribble -->
<dependency org="org.broad" name="tribble" rev="latest.integration"/>
<!-- Variant -->
<dependency org="org.broadinstitute" name="variant" rev="latest.integration"/>
<dependency org="log4j" name="log4j" rev="1.2.15"/>
<dependency org="javax.mail" name="mail" rev="1.4.4"/>
<dependency org="colt" name="colt" rev="1.2.0"/>
<dependency org="it.unimi.dsi" name="fastutil" rev="6.5.3" />
<!-- <dependency org="jboss" name="javassist" rev="3.7.ga"/> -->
<dependency org="org.simpleframework" name="simple-xml" rev="2.0.4"/>
<dependency org="org.apache.bcel" name="bcel" rev="5.2"/>
<!-- Dependencies for reflections mvn repository -->
<dependency org="org.reflections" name="reflections" rev="0.9.8"/>
<dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1"/>
<!-- Matrix package from math.nist.gov -->
<dependency org="gov.nist" name="Jama" rev="1.0.2"/>
<!-- Dependencies for the graph aligner -->
<dependency org="net.sf.jgrapht" name="jgrapht" rev="0.8.3"/>
<!-- Dependencies for the html walker documention -->
<dependency org="org.freemarker" name="freemarker" rev="2.3.18"/>
<!-- Commons Dependencies -->
<dependency org="org.apache.commons" name="commons-email" rev="1.2"/>
<dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1"/>
<dependency org="commons-lang" name="commons-lang" rev="2.5"/>
<dependency org="commons-logging" name="commons-logging" rev="1.1.1"/>
<dependency org="commons-io" name="commons-io" rev="2.1"/>
<dependency org="commons-collections" name="commons-collections" rev="3.2.1"/>
<dependency org="org.apache.commons" name="commons-math" rev="2.2"/>
<!-- Lucene core utilities -->
<!-- <dependency org="org.apache.lucene" name="lucene-core" rev="3.0.3"/> -->
<!-- Dependencies for LSF, DRMAA, and other C libraries -->
<dependency org="net.java.dev.jna" name="jna" rev="3.2.7"/>
<!-- Dependencies for amazon.com S3 support -->
<dependency org="net.java.dev.jets3t" name="jets3t" rev="0.8.1"/>
<!-- Dependencies for GridEngine -->
<dependency org="net.sf.gridscheduler" name="drmaa" rev="latest.integration"/>
<!-- Scala dependancies -->
<dependency org="org.scala-lang" name="scala-compiler" rev="2.10.2"/>
<dependency org="org.scala-lang" name="scala-library" rev="2.10.2"/>
<!-- testing and evaluation dependencies -->
<dependency org="org.testng" name="testng" rev="6.8"/>
<dependency org="org.uncommons" name="reportng" rev="1.1.2"/>
<dependency org="com.google.caliper" name="caliper" rev="0.5-rc1"/>
<dependency org="com.google.inject" name="guice" rev="3.0"/>
<!-- Contracts for Java and dependencies -->
<dependency org="com.google.code.cofoja" name="cofoja" rev="1.0-r139"/>
<dependency org="asm" name="asm-all" rev="3.3.1"/>
<!-- POI, for reading pipeline files -->
<dependency org="org.apache.poi" name="poi" rev="3.8-beta3"/>
<dependency org="org.apache.poi" name="poi-ooxml" rev="3.8-beta3"/>
<!-- snpEff annotator for pipelines -->
<dependency org="net.sf.snpeff" name="snpeff" rev="2.0.5"/>
<!-- MongoDB for the GXDB project -->
<dependency org="org.mongodb" name="mongo-java-driver" rev="2.7.3"/>
<!-- GSON and HTTP for talking to the REST API on Vanilla Forums -->
<dependency org="com.google.code.gson" name="gson" rev="2.2.2"/>
<dependency org="org.apache.httpcomponents" name="httpclient" rev="4.1.1"/>
<!-- Exclude dependencies on sun libraries where the downloads aren't available but included in the jvm. -->
<exclude org="javax.servlet"/>
<exclude org="javax.jms"/>
<exclude org="com.sun.*"/>
</dependencies>
</ivy-module>

858
pom.xml 100644
View File

@ -0,0 +1,858 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<!--
This pom is the aggregator for all sting/gatk poms
See also:
http://maven.apache.org/pom.html#Inheritance_v
http://maven.apache.org/guides/introduction/introduction-to-the-pom.html#Project_Inheritance_vs_Project_Aggregation
http://stackoverflow.com/questions/1992213/maven-parent-pom-vs-modules-pom
-->
<parent>
<groupId>org.broadinstitute.sting</groupId>
<artifactId>sting-root</artifactId>
<version>3.0</version>
<relativePath>public/sting-root</relativePath>
</parent>
<artifactId>sting-aggregator</artifactId>
<packaging>pom</packaging>
<name>Sting Aggregator</name>
<modules>
<module>public</module>
<!-- private & protected optionally enabled as profiles -->
</modules>
<properties>
<sting.basedir>${project.basedir}</sting.basedir>
<resource.bundle.path>StingText.properties</resource.bundle.path>
<resource.bundle.skip>false</resource.bundle.skip>
<!-- TODO: Need a better a way to say "don't include hidden" by default -->
<gatkdocs.include.hidden>-build-timestamp "${maven.build.timestamp}"</gatkdocs.include.hidden>
<!--
Phases of the build that may be disabled to speed up compilation.
-->
<sting.jar.phase>package</sting.jar.phase>
<sting.generate-resources.phase>generate-resources</sting.generate-resources.phase>
<sting.process-resources.phase>process-resources</sting.process-resources.phase>
<sting.process-test-resources.phase>process-test-resources</sting.process-test-resources.phase>
<!--
Package tests ensure the consistency of the packaged / shaded jars.
It runs the tests where the monolithic jar is the only dependency on the classpath.
-->
<sting.packagecommittests.skipped>true</sting.packagecommittests.skipped>
<sting.packageunittests.skipped>${sting.packagecommittests.skipped}</sting.packageunittests.skipped>
<sting.packageintegrationtests.skipped>${sting.packagecommittests.skipped}</sting.packageintegrationtests.skipped>
<sting.packagepipelinetests.skipped>${sting.packagecommittests.skipped}</sting.packagepipelinetests.skipped>
<sting.packagelargescaletests.skipped>true</sting.packagelargescaletests.skipped>
<sting.packageknowledgebasetests.skipped>true</sting.packageknowledgebasetests.skipped>
<!--
Serial tests use the test jars to run tests, such that all tests are run from a single TestNG invocation.
This is different that the invoker, that runs the test classes from the filesystem, but pointing at the packaged JAR files.
TODO: Currently, all tests run within each package, since packages already collect dependencies for shading an uber jar.
TODO: Should there be another level up of tests, possibly running "all tests" via this aggregator level?
TODO: If that require the aggregator to be dependent on the child dependencies, perhaps a better approach might be another monolithic test project.
-->
<sting.serialcommittests.skipped>true</sting.serialcommittests.skipped>
<sting.serialunittests.skipped>${sting.serialcommittests.skipped}</sting.serialunittests.skipped>
<sting.serialintegrationtests.skipped>${sting.serialcommittests.skipped}</sting.serialintegrationtests.skipped>
<sting.serialpipelinetests.skipped>${sting.serialcommittests.skipped}</sting.serialpipelinetests.skipped>
<sting.seriallargescaletests.skipped>true</sting.seriallargescaletests.skipped>
<sting.serialknowledgebasetests.skipped>true</sting.serialknowledgebasetests.skipped>
</properties>
<dependencies>
<dependency>
<groupId>com.sun</groupId>
<artifactId>tools</artifactId>
</dependency>
</dependencies>
<build>
<!-- Plugin configuration -->
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-clean-plugin</artifactId>
<configuration>
<filesets>
<!--
TODO: Once GATKDoclet stops hard coding paths, we remove this and leave
TODO: it to the standard maven conventions to clean up for us
-->
<fileset>
<directory>gatkdocs</directory>
</fileset>
<fileset>
<directory>${basedir}</directory>
<includes>
<include>javadoc.sh</include>
<include>options</include>
<include>packages</include>
</includes>
</fileset>
<!--
Shade dumps this temp file in basedir, with no good way to reconfigure.
https://jira.codehaus.org/browse/MSHADE-145
-->
<fileset>
<directory>${basedir}</directory>
<includes>
<include>dependency-reduced-pom.xml</include>
</includes>
</fileset>
</filesets>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>unpack-direct-dependencies</id>
<goals>
<goal>unpack-dependencies</goal>
</goals>
<phase>none</phase>
<configuration>
<excludeTransitive>true</excludeTransitive>
<outputDirectory>${project.build.outputDirectory}</outputDirectory>
<includeTypes>jar</includeTypes>
<excludeScope>system</excludeScope>
</configuration>
</execution>
</executions>
</plugin>
<!-- TODO: Change the ResourceBundleExtractorDoclet to not require log4j.properties file -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<executions>
<execution>
<id>default-resources</id>
<goals>
<goal>resources</goal>
</goals>
<phase>${sting.process-resources.phase}</phase>
</execution>
<execution>
<id>default-testResources</id>
<goals>
<goal>testResources</goal>
</goals>
<phase>${sting.process-test-resources.phase}</phase>
</execution>
<execution>
<id>copy-resource-bundle-log4j</id>
<goals>
<goal>copy-resources</goal>
</goals>
<phase>none</phase>
<configuration>
<outputDirectory>${project.reporting.outputDirectory}/apidocs</outputDirectory>
<resources>
<resource>
<directory>${sting.basedir}/sting-utils/src/main/config/org/broadinstitute/sting/utils/help</directory>
</resource>
</resources>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<executions>
<execution>
<id>extract-resource-bundle</id>
<goals>
<goal>javadoc</goal>
</goals>
<phase>none</phase>
<configuration>
<!-- Allow skipping for "fasttest" -->
<skip>${resource.bundle.skip}</skip>
<doclet>org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet</doclet>
<!-- Required as doclet uses reflection to access classes for documentation, instead of source java-->
<docletPath>${project.build.outputDirectory}</docletPath>
<docletArtifact>
<groupId>${project.groupId}</groupId>
<!-- TODO: THIS IS SUPPOSED TO BE STING-UTILS! -->
<artifactId>gatk-framework</artifactId>
<version>${project.version}</version>
</docletArtifact>
<maxmemory>2g</maxmemory>
<useStandardDocletOptions>false</useStandardDocletOptions>
<quiet>true</quiet>
<additionalparam>-build-timestamp "${maven.build.timestamp}" -absolute-version ${build.version} -out ${project.build.outputDirectory}/${resource.bundle.path}</additionalparam>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<proc>none</proc>
<annotationProcessors>
<annotationProcessor>com.google.java.contract.core.apt.AnnotationProcessor</annotationProcessor>
</annotationProcessors>
</configuration>
<executions>
<execution>
<id>default-compile</id>
<phase>none</phase>
</execution>
<execution>
<id>default-testCompile</id>
<phase>none</phase>
</execution>
<!--
Explicit package-info creation to match existing ant output.
-->
<execution>
<id>compile-package-info</id>
<goals>
<goal>compile</goal>
</goals>
<phase>compile</phase>
<configuration>
<compilerArgs>
<arg>-Xpkginfo:always</arg>
</compilerArgs>
<includes>
<include>**/package-info.java</include>
</includes>
</configuration>
</execution>
<!--
TODO: Currently disabled in build.xml. Here as well.
<execution>
<id>compile-annotations</id>
<phase>compile</phase>
<goals>
<goal>compile</goal>
</goals>
<configuration>
<proc>only</proc>
</configuration>
</execution>
-->
<execution>
<id>compile-java</id>
<goals>
<goal>compile</goal>
</goals>
<phase>compile</phase>
<configuration>
<!--
Package info is supposed to be in source:
http://maven.apache.org/plugins/maven-javadoc-plugin/examples/javadoc-resources.html
But maven-compile-plugin doesn't auto exclude these:
https://jira.codehaus.org/browse/MCOMPILER-205?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=326505#comment-326505
So, explicitly exclude them
-->
<excludes>
<exclude>**/package-info.java</exclude>
</excludes>
</configuration>
</execution>
<!--
TODO: Currently disabled in build.xml. Here as well.
<execution>
<id>testCompile-annotations</id>
<phase>test-compile</phase>
<goals>
<goal>testCompile</goal>
</goals>
<configuration>
<proc>only</proc>
</configuration>
</execution>
-->
<execution>
<id>testCompile-java</id>
<goals>
<goal>testCompile</goal>
</goals>
<phase>test-compile</phase>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<id>default-jar</id>
<phase>${sting.jar.phase}</phase>
</execution>
<execution>
<id>test-jar</id>
<goals>
<goal>test-jar</goal>
</goals>
<phase>${sting.jar.phase}</phase>
<configuration>
<skipIfEmpty>true</skipIfEmpty>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<id>sting-executable</id>
<goals>
<goal>shade</goal>
</goals>
<phase>none</phase>
<configuration>
<minimizeJar>true</minimizeJar>
<artifactSet>
<excludes>
<exclude>org.broadinstitute.sting:gsalib:tar.gz:*</exclude>
<exclude>org.broadinstitute.sting:*:tar.bz2:example-resources</exclude>
</excludes>
</artifactSet>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<manifestEntries>
<Main-Class>${app.main.class}</Main-Class>
</manifestEntries>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>${resource.bundle.path}</resource>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<executions>
<execution>
<id>example-resources</id>
<goals>
<goal>single</goal>
</goals>
<phase>none</phase>
<configuration>
<descriptors>
<descriptor>src/main/assembly/example-resources.xml</descriptor>
</descriptors>
</configuration>
</execution>
<execution>
<id>binary-dist</id>
<goals>
<goal>single</goal>
</goals>
<phase>none</phase>
<configuration>
<descriptors>
<descriptor>src/main/assembly/binary-dist.xml</descriptor>
</descriptors>
</configuration>
</execution>
</executions>
</plugin>
<!-- TODO: Remove temporary symbolic link creation, after fixing test paths to use local resources. -->
<plugin>
<groupId>com.pyx4j</groupId>
<artifactId>maven-junction-plugin</artifactId>
<executions>
<execution>
<id>link-public-testdata</id>
<goals>
<goal>link</goal>
</goals>
<phase>none</phase>
<configuration>
<links>
<link>
<dst>${basedir}/public/testdata</dst>
<src>${sting.basedir}/public/gatk-framework/src/test/resources</src>
</link>
</links>
</configuration>
</execution>
<execution>
<id>unlink-public-testdata</id>
<goals>
<goal>unlink</goal>
</goals>
<phase>none</phase>
<configuration>
<links>
<link>
<dst>${basedir}/public/testdata</dst>
<src>${sting.basedir}/public/gatk-framework/src/test/resources</src>
</link>
</links>
</configuration>
</execution>
<execution>
<id>link-private-testdata</id>
<goals>
<goal>link</goal>
</goals>
<phase>none</phase>
<configuration>
<links>
<link>
<dst>${basedir}/private/testdata</dst>
<src>${sting.basedir}/private/gatk-private/src/test/resources</src>
</link>
</links>
</configuration>
</execution>
<execution>
<id>unlink-private-testdata</id>
<goals>
<goal>unlink</goal>
</goals>
<phase>none</phase>
<configuration>
<links>
<link>
<dst>${basedir}/private/testdata</dst>
<src>${sting.basedir}/private/gatk-private/src/test/resources</src>
</link>
</links>
</configuration>
</execution>
<execution>
<id>link-public-qscript</id>
<goals>
<goal>link</goal>
</goals>
<phase>none</phase>
<configuration>
<links>
<link>
<dst>${basedir}/public/scala/qscript</dst>
<src>${sting.basedir}/public/queue-framework/src/main/qscripts</src>
</link>
</links>
</configuration>
</execution>
<execution>
<id>unlink-public-qscript</id>
<goals>
<goal>unlink</goal>
</goals>
<phase>none</phase>
<configuration>
<links>
<link>
<dst>${basedir}/public/scala/qscript</dst>
<src>${sting.basedir}/public/queue-framework/src/main/qscripts</src>
</link>
</links>
</configuration>
</execution>
<execution>
<id>link-private-qscript</id>
<goals>
<goal>link</goal>
</goals>
<phase>none</phase>
<configuration>
<links>
<link>
<dst>${basedir}/private/scala/qscript</dst>
<src>${sting.basedir}/private/queue-private/src/main/qscripts</src>
</link>
</links>
</configuration>
</execution>
<execution>
<id>unlink-private-qscript</id>
<goals>
<goal>unlink</goal>
</goals>
<phase>none</phase>
<configuration>
<links>
<link>
<dst>${basedir}/private/scala/qscript</dst>
<src>${sting.basedir}/private/queue-private/src/main/qscripts</src>
</link>
</links>
</configuration>
</execution>
<execution>
<id>link-binary-jar</id>
<goals>
<goal>link</goal>
</goals>
<phase>none</phase>
<configuration>
<links>
<link>
<dst>${sting.basedir}/target/${sting.binary-dist.name}.${project.packaging}</dst>
<src>${project.build.directory}/${project.build.finalName}.${project.packaging}</src>
</link>
</links>
</configuration>
</execution>
<execution>
<id>link-git-release</id>
<goals>
<goal>link</goal>
</goals>
<phase>none</phase>
<configuration>
<links>
<link>
<dst>${project.build.directory}/${sting.binary-dist.name}-${build.version}.tar.bz2</dst>
<src>${project.build.directory}/${project.build.finalName}-binary-dist.tar.bz2</src>
</link>
</links>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-invoker-plugin</artifactId>
<configuration>
<skipInvocation>true</skipInvocation>
<debug>false</debug>
<pom>${sting.basedir}/public/package-tests/pom.xml</pom>
<noLog>true</noLog>
<streamLogs>true</streamLogs>
<localRepositoryPath>${sting.basedir}/${maven.repo.local}</localRepositoryPath>
<properties>
<test>${test}</test>
<it.test>${it.test}</it.test>
<skip>false</skip>
<maven.test.skip>false</maven.test.skip>
<sting.packagetests.artifactId>${sting.packagetests.artifactId}</sting.packagetests.artifactId>
<sting.packagetests.testClasses>${project.build.testOutputDirectory}</sting.packagetests.testClasses>
<sting.packagetests.basedir>${project.basedir}</sting.packagetests.basedir>
<sting.pipelinetests.run>${sting.pipelinetests.run}</sting.pipelinetests.run>
<maven.surefire.debug>${maven.surefire.debug}</maven.surefire.debug>
<maven.failsafe.debug>${maven.failsafe.debug}</maven.failsafe.debug>
</properties>
<!--
To allow using separated integration-test and verify, each execution must use a separate reportsDirectory.
Otherwise, when an empty "pipeline test" later runs on an IntegrationTest class,
it overwrites the results of the previous invocation always with a zero exit status.
Mixing in the test name into the reportsDirectory also avoids collisions, when different maven jobs run tests in parallel.
Similarly generating unique failsafe summary reports to avoid collisions.
-->
</configuration>
<executions>
<execution>
<id>package-unittests</id>
<goals>
<goal>run</goal>
</goals>
<configuration>
<goals>
<goal>test</goal>
</goals>
<reportsDirectory>${project.build.directory}/invoker-reports/unit/${test}</reportsDirectory>
<skipInvocation>${sting.packageunittests.skipped}</skipInvocation>
<properties>
<unittests.profile.enabled>true</unittests.profile.enabled>
<sting.packageunittests.skipped>${sting.packageunittests.skipped}</sting.packageunittests.skipped>
</properties>
</configuration>
</execution>
<execution>
<id>package-integrationtests</id>
<goals>
<goal>integration-test</goal>
<goal>verify</goal>
</goals>
<configuration>
<goals>
<goal>verify</goal>
</goals>
<reportsDirectory>${project.build.directory}/invoker-reports/integration/${it.test}</reportsDirectory>
<skipInvocation>${sting.packageintegrationtests.skipped}</skipInvocation>
<properties>
<integrationtests.profile.enabled>true</integrationtests.profile.enabled>
<sting.packageintegrationtests.skipped>${sting.packageintegrationtests.skipped}</sting.packageintegrationtests.skipped>
</properties>
</configuration>
</execution>
<execution>
<id>package-pipelinetests</id>
<goals>
<goal>integration-test</goal>
<goal>verify</goal>
</goals>
<configuration>
<goals>
<goal>verify</goal>
</goals>
<reportsDirectory>${project.build.directory}/invoker-reports/pipeline/${it.test}</reportsDirectory>
<skipInvocation>${sting.packagepipelinetests.skipped}</skipInvocation>
<properties>
<integrationtests.profile.enabled>true</integrationtests.profile.enabled>
<sting.packagepipelinetests.skipped>${sting.packagepipelinetests.skipped}</sting.packagepipelinetests.skipped>
</properties>
</configuration>
</execution>
<execution>
<id>package-largescaletests</id>
<goals>
<goal>integration-test</goal>
<goal>verify</goal>
</goals>
<configuration>
<goals>
<goal>verify</goal>
</goals>
<reportsDirectory>${project.build.directory}/invoker-reports/largescale/${it.test}</reportsDirectory>
<skipInvocation>${sting.packagelargescaletests.skipped}</skipInvocation>
<properties>
<integrationtests.profile.enabled>true</integrationtests.profile.enabled>
<sting.packagelargescaletests.skipped>${sting.packagelargescaletests.skipped}</sting.packagelargescaletests.skipped>
</properties>
</configuration>
</execution>
<execution>
<id>package-knowledgebasetests</id>
<goals>
<goal>integration-test</goal>
<goal>verify</goal>
</goals>
<configuration>
<goals>
<goal>verify</goal>
</goals>
<reportsDirectory>${project.build.directory}/invoker-reports/knowledgebase/${it.test}</reportsDirectory>
<skipInvocation>${sting.packageknowledgebasetests.skipped}</skipInvocation>
<properties>
<integrationtests.profile.enabled>true</integrationtests.profile.enabled>
<sting.packageknowledgebasetests.skipped>${sting.packageknowledgebasetests.skipped}</sting.packageknowledgebasetests.skipped>
</properties>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-install-plugin</artifactId>
<version>2.5</version>
<executions>
<execution>
<id>install-package</id>
<goals>
<goal>install-file</goal>
</goals>
<phase>none</phase>
<configuration>
<generatePom>true</generatePom>
<groupId>${project.groupId}</groupId>
<artifactId>${project.artifactId}</artifactId>
<version>${project.version}</version>
<packaging>${project.packaging}</packaging>
<file>${project.build.directory}/${project.build.finalName}.${project.packaging}</file>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</pluginManagement>
<!-- Invoke plugins that always run -->
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-failsafe-plugin</artifactId>
</plugin>
<plugin>
<groupId>com.pyx4j</groupId>
<artifactId>maven-junction-plugin</artifactId>
<executions>
<execution>
<id>link-public-testdata</id>
<phase>process-test-resources</phase>
</execution>
<execution>
<id>unlink-public-testdata</id>
<phase>clean</phase>
</execution>
<execution>
<id>link-public-qscript</id>
<phase>process-test-resources</phase>
</execution>
<execution>
<id>unlink-public-qscript</id>
<phase>clean</phase>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-clean-plugin</artifactId>
</plugin>
<plugin>
<groupId>com.google.code.sortpom</groupId>
<artifactId>maven-sortpom-plugin</artifactId>
<executions>
<execution>
<id>package-tests</id>
<goals>
<goal>sort</goal>
</goals>
<phase>verify</phase>
<inherited>false</inherited>
<configuration>
<pomFile>public/package-tests/pom.xml</pomFile>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
<reporting>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.9.1</version>
<reportSets>
<!-- By not specifying an id, shut off all default javadocs from mvn site -->
<reportSet>
<reports />
</reportSet>
<reportSet>
<id>generate-gatk-docs</id>
<reports>
<report>aggregate</report>
</reports>
<!-- Only generate the GATK Docs across the parent aggregation, not the children too. -->
<inherited>false</inherited>
<configuration>
<doclet>org.broadinstitute.sting.utils.help.GATKDoclet</doclet>
<docletArtifact>
<groupId>${project.groupId}</groupId>
<artifactId>gatk-package</artifactId>
<version>${project.version}</version>
</docletArtifact>
<useStandardDocletOptions>false</useStandardDocletOptions>
<quiet>true</quiet>
<show>private</show>
<additionalparam>-build-timestamp "${maven.build.timestamp}" -absolute-version ${build.version} ${gatkdocs.include.hidden} -settings-dir ${sting.basedir}/settings/helpTemplates -destination-dir ${project.build.directory}/gatkdocs</additionalparam>
</configuration>
</reportSet>
</reportSets>
</plugin>
</plugins>
</reporting>
<profiles>
<!-- Optionally include protected -->
<profile>
<id>protected</id>
<activation>
<file>
<exists>${basedir}/protected/pom.xml</exists>
</file>
</activation>
<modules>
<module>protected</module>
</modules>
</profile>
<!-- Optionally include private -->
<profile>
<id>private</id>
<activation>
<file>
<exists>${basedir}/private/pom.xml</exists>
</file>
</activation>
<modules>
<module>private</module>
</modules>
<build>
<plugins>
<!-- TODO: All tests require access to private. For now. -->
<plugin>
<groupId>com.pyx4j</groupId>
<artifactId>maven-junction-plugin</artifactId>
<executions>
<execution>
<id>link-private-testdata</id>
<phase>process-test-resources</phase>
</execution>
<execution>
<id>unlink-private-testdata</id>
<phase>clean</phase>
</execution>
<execution>
<id>link-private-qscript</id>
<phase>process-test-resources</phase>
</execution>
<execution>
<id>unlink-private-qscript</id>
<phase>clean</phase>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<!-- Collection of properties for use during package testing -->
<profile>
<id>packagetests-enabled</id>
<activation>
<property>
<name>sting.packagetests.enabled</name>
<value>true</value>
</property>
</activation>
<properties>
<maven.javadoc.skip>true</maven.javadoc.skip>
<sting.generate-gatk-extensions.skipped>true</sting.generate-gatk-extensions.skipped>
<sting.jar.phase>none</sting.jar.phase>
<sting.generate-resources.phase>none</sting.generate-resources.phase>
<sting.process-resources.phase>none</sting.process-resources.phase>
<sting.process-test-resources.phase>none</sting.process-test-resources.phase>
</properties>
</profile>
</profiles>
</project>

View File

@ -0,0 +1,139 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.broadinstitute.sting</groupId>
<artifactId>sting-aggregator</artifactId>
<version>3.0</version>
<relativePath>../..</relativePath>
</parent>
<artifactId>gatk-protected</artifactId>
<packaging>jar</packaging>
<name>GATK Protected</name>
<properties>
<sting.basedir>${project.basedir}/../..</sting.basedir>
<sting.packagetests.artifactId>gatk-package</sting.packagetests.artifactId>
</properties>
<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>gatk-framework</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>net.sf.jgrapht</groupId>
<artifactId>jgrapht</artifactId>
</dependency>
<dependency>
<groupId>gov.nist.math</groupId>
<artifactId>jama</artifactId>
</dependency>
<dependency>
<groupId>it.unimi.dsi</groupId>
<artifactId>fastutil</artifactId>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>gatk-framework</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.google.caliper</groupId>
<artifactId>caliper</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<executions>
<execution>
<id>copy-resource-bundle-log4j</id>
<phase>prepare-package</phase>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<executions>
<execution>
<id>extract-resource-bundle</id>
<phase>prepare-package</phase>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-invoker-plugin</artifactId>
<executions>
<execution>
<id>package-unittests</id>
</execution>
<execution>
<id>package-integrationtests</id>
</execution>
<execution>
<id>package-largescaletests</id>
</execution>
<execution>
<id>package-knowledgebasetests</id>
</execution>
<execution>
<id>package-pipelinetests</id>
</execution>
</executions>
</plugin>
</plugins>
</build>
<profiles>
<profile>
<id>private</id>
<activation>
<file>
<exists>${basedir}/../../private/gatk-private/pom.xml</exists>
</file>
</activation>
<build>
<plugins>
<!-- TODO: All tests require access to private. For now. -->
<plugin>
<groupId>com.pyx4j</groupId>
<artifactId>maven-junction-plugin</artifactId>
<executions>
<execution>
<id>link-private-testdata</id>
<phase>process-test-resources</phase>
</execution>
<execution>
<id>unlink-private-testdata</id>
<phase>clean</phase>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project>

View File

@ -0,0 +1,114 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.variant.vcf.VCFConstants;
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.variant.vcf.VCFStandardHeaderLines;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Total (unfiltered) depth over all samples.
*
* <p>While the sample-level (FORMAT) DP field describes the total depth of reads that passed the caller's
* internal quality control metrics (like MAPQ > 17, for example), the INFO field DP represents the unfiltered depth
* over all samples. Note though that the DP is affected by downsampling (-dcov), so the max value one can obtain for
* N samples with -dcov D is N * D
* </p>
*/
public class Coverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap ) {
int depth = 0;
if (stratifiedContexts != null) {
if ( stratifiedContexts.size() == 0 )
return null;
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() )
depth += sample.getValue().getBasePileup().depthOfCoverage();
}
else if (perReadAlleleLikelihoodMap != null) {
if ( perReadAlleleLikelihoodMap.size() == 0 )
return null;
for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) {
depth += maps.getLikelihoodReadMap().size();
}
}
else
return null;
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%d", depth));
return map;
}
public List<String> getKeyNames() { return Arrays.asList(VCFConstants.DEPTH_KEY); }
public List<VCFInfoHeaderLine> getDescriptions() {
return Arrays.asList(VCFStandardHeaderLines.getInfoLine(getKeyNames().get(0)));
}
}

View File

@ -0,0 +1,164 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.variant.vcf.VCFConstants;
import org.broadinstitute.variant.vcf.VCFFormatHeaderLine;
import org.broadinstitute.variant.vcf.VCFStandardHeaderLines;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.Genotype;
import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.util.*;
/**
* The depth of coverage of each allele per sample
*
* <p>The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this
* sample at this site. While the sample-level (FORMAT) DP field describes the total depth of reads that passed the
* caller's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of
* REF and ALT fields) is the unfiltered count of all reads that carried with them the
* REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
* power I have to determine the genotype of the sample at this site, while the AD tells me how many times
* I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering
* the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like
* to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
* normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that
* the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted.
* Because of this fact, the sum of AD may be different than the individual sample depth, especially when there are
* many non-informative reads.</p>
*
* <p>Because the AD includes reads and bases that were filtered by the caller and in case of indels is based on a statistical computation,
* <b>one should not base assumptions about the underlying genotype based on it</b>;
* instead, the genotype likelihoods (PLs) are what determine the genotype calls.</p>
*
*/
public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation {
public void annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
final AlignmentContext stratifiedContext,
final VariantContext vc,
final Genotype g,
final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) )
return;
if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty())
annotateWithLikelihoods(alleleLikelihoodMap, vc, gb);
else if ( stratifiedContext != null && (vc.isSNP()))
annotateWithPileup(stratifiedContext, vc, gb);
}
private void annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) {
final HashMap<Byte, Integer> alleleCounts = new HashMap<>();
for ( final Allele allele : vc.getAlleles() )
alleleCounts.put(allele.getBases()[0], 0);
final ReadBackedPileup pileup = stratifiedContext.getBasePileup();
for ( final PileupElement p : pileup ) {
if ( alleleCounts.containsKey(p.getBase()) )
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1);
}
// we need to add counts in the correct order
final int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(vc.getReference().getBases()[0]);
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]);
gb.AD(counts);
}
private void annotateWithLikelihoods(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc, final GenotypeBuilder gb) {
final Set<Allele> alleles = new HashSet<>(vc.getAlleles());
// make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext
if ( ! perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) )
throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet());
final HashMap<Allele, Integer> alleleCounts = new HashMap<>();
for ( final Allele allele : vc.getAlleles() ) { alleleCounts.put(allele, 0); }
for ( final Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles);
if (! a.isInformative() ) continue; // read is non-informative
final GATKSAMRecord read = el.getKey();
final int prevCount = alleleCounts.get(a.getMostLikelyAllele());
alleleCounts.put(a.getMostLikelyAllele(), prevCount + 1);
}
final int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(vc.getReference());
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
counts[i+1] = alleleCounts.get( vc.getAlternateAllele(i) );
gb.AD(counts);
}
public List<String> getKeyNames() { return Arrays.asList(VCFConstants.GENOTYPE_ALLELE_DEPTHS); }
public List<VCFFormatHeaderLine> getDescriptions() {
return Arrays.asList(VCFStandardHeaderLines.getFormatLine(getKeyNames().get(0)));
}
}

View File

@ -0,0 +1,126 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.Genotype;
import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.vcf.VCFConstants;
import org.broadinstitute.variant.vcf.VCFFormatHeaderLine;
import org.broadinstitute.variant.vcf.VCFStandardHeaderLines;
import java.util.*;
/**
* The depth of coverage for informative reads for each sample.
*
* An informative read is defined as one from which the allele it carries can be easily distinguished. An example of a
* case where a read might be uninformative is where it only partially overlaps a short tandem repeat and it is not clear
* whether the read contains the reference allele or e.g. an extra repeat.
* The depth here is the sum of the informative reads at this site as determined by the Haplotype Caller; as such it can
* only be calculated and generated through the Haplotype Caller (it will not work when run through the Variant Annotator).
* This calculation is not perfect but it is a pretty good proxy for depth and it does match the values in the AD field
* (i.e., sum(AD) = DP).
*/
public class DepthPerSampleHC extends GenotypeAnnotation {
public void annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
final AlignmentContext stratifiedContext,
final VariantContext vc,
final Genotype g,
final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) )
return;
if (alleleLikelihoodMap == null )
throw new IllegalStateException("DepthPerSampleHC can only be used with likelihood based annotations in the HaplotypeCaller");
// the depth for the HC is the sum of the informative alleles at this site. It's not perfect (as we cannot
// differentiate between reads that align over the event but aren't informative vs. those that aren't even
// close) but it's a pretty good proxy and it matches with the AD field (i.e., sum(AD) = DP).
int dp = 0;
if ( alleleLikelihoodMap.isEmpty() ) {
// there are no reads
} else {
final Set<Allele> alleles = new HashSet<>(vc.getAlleles());
// make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext
if ( ! alleleLikelihoodMap.getAllelesSet().containsAll(alleles) )
throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + alleleLikelihoodMap.getAllelesSet());
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles);
if ( a.isInformative() ) {
dp++;
}
}
gb.DP(dp);
}
}
public List<String> getKeyNames() {
return Collections.singletonList(VCFConstants.DEPTH_KEY);
}
public List<VCFFormatHeaderLine> getDescriptions() {
return Collections.singletonList(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY));
}
}

View File

@ -0,0 +1,509 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import cern.jet.math.Arithmetic;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.variant.variantcontext.Genotype;
import org.broadinstitute.variant.variantcontext.GenotypesContext;
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.util.*;
/**
* Phred-scaled p-value using Fisher's Exact Test to detect strand bias
*
* <p>Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation
* being seen on only the forward or only the reverse strand) in the reads. More bias is
* indicative of false positive calls.
* </p>
*
* <h3>Caveat</h3>
* <p>The Fisher Strand test may not be calculated for certain complex indel cases or for multi-allelic sites.</p>
*/
public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
private final static boolean ENABLE_DEBUGGING = false;
private final static Logger logger = Logger.getLogger(FisherStrand.class);
private static final String FS = "FS";
private static final double MIN_PVALUE = 1E-320;
private static final int MIN_QUAL_FOR_FILTERED_TEST = 17;
private static final int MIN_COUNT = 2;
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
if ( !vc.isVariant() )
return null;
if ( vc.hasGenotypes() ) {
final int[][] tableFromPerSampleAnnotations = getTableFromSamples( vc.getGenotypes() );
if ( tableFromPerSampleAnnotations != null ) {
return pValueForBestTable(tableFromPerSampleAnnotations, null);
}
}
if (vc.isSNP() && stratifiedContexts != null) {
final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1);
final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST);
printTable("unfiltered", tableNoFiltering);
printTable("filtered", tableFiltering);
return pValueForBestTable(tableFiltering, tableNoFiltering);
}
else if (stratifiedPerReadAlleleLikelihoodMap != null) {
// either SNP with no alignment context, or indels: per-read likelihood map needed
final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc);
// logger.info("VC " + vc);
// printTable(table, 0.0);
return pValueForBestTable(table, null);
}
else
// for non-snp variants, we need per-read likelihoods.
// for snps, we can get same result from simple pileup
return null;
}
/**
* Create the FisherStrand table by retrieving the per-sample strand bias annotation and adding them together
* @param genotypes the genotypes from which to pull out the per-sample strand bias annotation
* @return the table used for the FisherStrand p-value calculation, will be null if none of the genotypes contain the per-sample SB annotation
*/
private int[][] getTableFromSamples( final GenotypesContext genotypes ) {
if( genotypes == null ) { throw new IllegalArgumentException("Genotypes cannot be null."); }
final int[] sbArray = {0,0,0,0}; // reference-forward-reverse -by- alternate-forward-reverse
boolean foundData = false;
for( final Genotype g : genotypes ) {
if( g.isNoCall() || ! g.hasAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME) )
continue;
foundData = true;
final String sbbsString = (String) g.getAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME);
final int[] data = encodeSBBS(sbbsString);
if ( passesMinimumThreshold(data) ) {
for( int index = 0; index < sbArray.length; index++ ) {
sbArray[index] += data[index];
}
}
}
return ( foundData ? decodeSBBS(sbArray) : null );
}
/**
* Does this strand data array pass the minimum threshold for inclusion?
*
* @param data the array
* @return true if it passes the minimum threshold, false otherwise
*/
private static boolean passesMinimumThreshold(final int[] data) {
// the ref and alt totals must each be greater than MIN_COUNT
return data[0] + data[1] > MIN_COUNT && data[2] + data[3] > MIN_COUNT;
}
/**
* Create an annotation for the highest (i.e., least significant) p-value of table1 and table2
*
* @param table1 a contingency table, may be null
* @param table2 a contingency table, may be null
* @return annotation result for FS given tables
*/
private Map<String, Object> pValueForBestTable(final int[][] table1, final int[][] table2) {
if ( table2 == null )
return table1 == null ? null : annotationForOneTable(pValueForContingencyTable(table1));
else if (table1 == null)
return annotationForOneTable(pValueForContingencyTable(table2));
else { // take the one with the best (i.e., least significant pvalue)
double pvalue1 = pValueForContingencyTable(table1);
double pvalue2 = pValueForContingencyTable(table2);
return annotationForOneTable(Math.max(pvalue1, pvalue2));
}
}
/**
* Returns an annotation result given a pValue
*
* @param pValue
* @return a hash map from FS -> phred-scaled pValue
*/
private Map<String, Object> annotationForOneTable(final double pValue) {
final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE))); // prevent INFINITYs
return Collections.singletonMap(FS, value);
}
public List<String> getKeyNames() {
return Collections.singletonList(FS);
}
public List<VCFInfoHeaderLine> getDescriptions() {
return Collections.singletonList(new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias"));
}
/**
* Helper function to turn the FisherStrand table into the SB annotation array
* @param table the table used by the FisherStrand annotation
* @return the array used by the per-sample Strand Bias annotation
*/
public static List<Integer> getContingencyArray( final int[][] table ) {
if(table.length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); }
if(table[0].length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); }
final List<Integer> list = new ArrayList<>(4); // TODO - if we ever want to do something clever with multi-allelic sites this will need to change
list.add(table[0][0]);
list.add(table[0][1]);
list.add(table[1][0]);
list.add(table[1][1]);
return list;
}
/**
* Helper function to parse the genotype annotation into the SB annotation array
* @param string the string that is returned by genotype.getAnnotation("SB")
* @return the array used by the per-sample Strand Bias annotation
*/
private static int[] encodeSBBS( final String string ) {
final int[] array = new int[4];
final StringTokenizer tokenizer = new StringTokenizer(string, ",", false);
for( int index = 0; index < 4; index++ ) {
array[index] = Integer.parseInt(tokenizer.nextToken());
}
return array;
}
/**
* Helper function to turn the SB annotation array into the FisherStrand table
* @param array the array used by the per-sample Strand Bias annotation
* @return the table used by the FisherStrand annotation
*/
private static int[][] decodeSBBS( final int[] array ) {
if(array.length != 4) { throw new IllegalArgumentException("Expecting a length = 4 strand bias array."); }
final int[][] table = new int[2][2];
table[0][0] = array[0];
table[0][1] = array[1];
table[1][0] = array[2];
table[1][1] = array[3];
return table;
}
private Double pValueForContingencyTable(int[][] originalTable) {
final int[][] normalizedTable = normalizeContingencyTable(originalTable);
int[][] table = copyContingencyTable(normalizedTable);
double pCutoff = computePValue(table);
//printTable(table, pCutoff);
double pValue = pCutoff;
while (rotateTable(table)) {
double pValuePiece = computePValue(table);
//printTable(table, pValuePiece);
if (pValuePiece <= pCutoff) {
pValue += pValuePiece;
}
}
table = copyContingencyTable(normalizedTable);
while (unrotateTable(table)) {
double pValuePiece = computePValue(table);
//printTable(table, pValuePiece);
if (pValuePiece <= pCutoff) {
pValue += pValuePiece;
}
}
//System.out.printf("P-cutoff: %f\n", pCutoff);
//System.out.printf("P-value: %f\n\n", pValue);
// min is necessary as numerical precision can result in pValue being slightly greater than 1.0
return Math.min(pValue, 1.0);
}
// how large do we want the normalized table to be?
private static final double TARGET_TABLE_SIZE = 200.0;
/**
* Normalize the table so that the entries are not too large.
* Note that this method does NOT necessarily make a copy of the table being passed in!
*
* @param table the original table
* @return a normalized version of the table or the original table if it is already normalized
*/
private static int[][] normalizeContingencyTable(final int[][] table) {
final int sum = table[0][0] + table[0][1] + table[1][0] + table[1][1];
if ( sum <= TARGET_TABLE_SIZE * 2 )
return table;
final double normalizationFactor = (double)sum / TARGET_TABLE_SIZE;
final int[][] normalized = new int[2][2];
for ( int i = 0; i < 2; i++ ) {
for ( int j = 0; j < 2; j++ )
normalized[i][j] = (int)(table[i][j] / normalizationFactor);
}
return normalized;
}
private static int [][] copyContingencyTable(int [][] t) {
int[][] c = new int[2][2];
for ( int i = 0; i < 2; i++ )
for ( int j = 0; j < 2; j++ )
c[i][j] = t[i][j];
return c;
}
private static void printTable(int[][] table, double pValue) {
logger.info(String.format("%d %d; %d %d : %f", table[0][0], table[0][1], table[1][0], table[1][1], pValue));
}
/**
* Printing information to logger.info for debugging purposes
*
* @param name the name of the table
* @param table the table itself
*/
private void printTable(final String name, final int[][] table) {
if ( ENABLE_DEBUGGING ) {
final String pValue = (String)annotationForOneTable(pValueForContingencyTable(table)).get(FS);
logger.info(String.format("FS %s (REF+, REF-, ALT+, ALT-) = (%d, %d, %d, %d) = %s",
name, table[0][0], table[0][1], table[1][0], table[1][1], pValue));
}
}
private static boolean rotateTable(int[][] table) {
table[0][0] -= 1;
table[1][0] += 1;
table[0][1] += 1;
table[1][1] -= 1;
return (table[0][0] >= 0 && table[1][1] >= 0);
}
private static boolean unrotateTable(int[][] table) {
table[0][0] += 1;
table[1][0] -= 1;
table[0][1] -= 1;
table[1][1] += 1;
return (table[0][1] >= 0 && table[1][0] >= 0);
}
private static double computePValue(int[][] table) {
int[] rowSums = { sumRow(table, 0), sumRow(table, 1) };
int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) };
int N = rowSums[0] + rowSums[1];
// calculate in log space so we don't die with high numbers
double pCutoff = Arithmetic.logFactorial(rowSums[0])
+ Arithmetic.logFactorial(rowSums[1])
+ Arithmetic.logFactorial(colSums[0])
+ Arithmetic.logFactorial(colSums[1])
- Arithmetic.logFactorial(table[0][0])
- Arithmetic.logFactorial(table[0][1])
- Arithmetic.logFactorial(table[1][0])
- Arithmetic.logFactorial(table[1][1])
- Arithmetic.logFactorial(N);
return Math.exp(pCutoff);
}
private static int sumRow(int[][] table, int column) {
int sum = 0;
for (int r = 0; r < table.length; r++) {
sum += table[r][column];
}
return sum;
}
private static int sumColumn(int[][] table, int row) {
int sum = 0;
for (int c = 0; c < table[row].length; c++) {
sum += table[row][c];
}
return sum;
}
/**
Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this:
* fw rc
* allele1 # #
* allele2 # #
* @return a 2x2 contingency table
*/
public static int[][] getContingencyTable( final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) {
if( stratifiedPerReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("stratifiedPerReadAlleleLikelihoodMap cannot be null"); }
if( vc == null ) { throw new IllegalArgumentException("input vc cannot be null"); }
final Allele ref = vc.getReference();
final Allele alt = vc.getAltAlleleWithHighestAlleleCount();
final int[][] table = new int[2][2];
for (final PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) {
final int[] myTable = new int[4];
for (final Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
final GATKSAMRecord read = el.getKey();
updateTable(myTable, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt);
}
if ( passesMinimumThreshold(myTable) )
copyToMainTable(myTable, table);
}
return table;
}
/**
* Helper method to copy the per-sample table to the main table
*
* @param perSampleTable per-sample table (single dimension)
* @param mainTable main table (two dimensions)
*/
private static void copyToMainTable(final int[] perSampleTable, final int[][] mainTable) {
mainTable[0][0] += perSampleTable[0];
mainTable[0][1] += perSampleTable[1];
mainTable[1][0] += perSampleTable[2];
mainTable[1][1] += perSampleTable[3];
}
/**
Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this:
* fw rc
* allele1 # #
* allele2 # #
* @return a 2x2 contingency table
*/
private static int[][] getSNPContingencyTable(final Map<String, AlignmentContext> stratifiedContexts,
final Allele ref,
final Allele alt,
final int minQScoreToConsider ) {
int[][] table = new int[2][2];
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
final int[] myTable = new int[4];
for (PileupElement p : sample.getValue().getBasePileup()) {
if ( ! isUsableBase(p) ) // ignore deletions and bad MQ
continue;
if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider )
continue;
updateTable(myTable, Allele.create(p.getBase(), false), p.getRead(), ref, alt);
}
if ( passesMinimumThreshold(myTable) )
copyToMainTable(myTable, table);
}
return table;
}
/**
* Can the base in this pileup element be used in comparative tests?
*
* @param p the pileup element to consider
*
* @return true if this base is part of a meaningful read for comparison, false otherwise
*/
private static boolean isUsableBase(final PileupElement p) {
return !( p.isDeletion() ||
p.getMappingQual() == 0 ||
p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE);
}
private static void updateTable(final int[] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt) {
final boolean matchesRef = allele.equals(ref, true);
final boolean matchesAlt = allele.equals(alt, true);
if ( matchesRef || matchesAlt ) {
final int offset = matchesRef ? 0 : 2;
if ( read.isStrandless() ) {
// a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1
// (the 1 is to ensure that a strandless read always counts as an observation on both strands, even
// if the read is only seen once, because it's a merged read or other)
table[offset]++;
table[offset + 1]++;
} else {
// a normal read with an actual strand
final boolean isFW = !read.getReadNegativeStrandFlag();
table[offset + (isFW ? 0 : 1)]++;
}
}
}
}

View File

@ -0,0 +1,191 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.variant.variantcontext.Genotype;
import org.broadinstitute.variant.variantcontext.GenotypesContext;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.util.*;
/**
* Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples. Note that the QD is also normalized by event length.
*
* Low scores are indicative of false positive calls and artifacts. Note that QualByDepth requires sequencing
* reads associated with the samples with polymorphic genotypes.
*/
public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
// private final static Logger logger = Logger.getLogger(QualByDepth.class);
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap ) {
if ( !vc.hasLog10PError() )
return null;
final GenotypesContext genotypes = vc.getGenotypes();
if ( genotypes == null || genotypes.size() == 0 )
return null;
int standardDepth = 0;
int ADrestrictedDepth = 0;
for ( final Genotype genotype : genotypes ) {
// we care only about variant calls with likelihoods
if ( !genotype.isHet() && !genotype.isHomVar() )
continue;
// if we have the AD values for this sample, let's make sure that the variant depth is greater than 1!
// TODO -- If we like how this is working and want to apply it to a situation other than the single sample HC pipeline,
// TODO -- then we will need to modify the annotateContext() - and related - routines in the VariantAnnotatorEngine
// TODO -- so that genotype-level annotations are run first (to generate AD on the samples) and then the site-level
// TODO -- annotations must come afterwards (so that QD can use the AD).
if ( genotype.hasAD() ) {
final int[] AD = genotype.getAD();
final int totalADdepth = (int)MathUtils.sum(AD);
if ( totalADdepth - AD[0] > 1 )
ADrestrictedDepth += totalADdepth;
standardDepth += totalADdepth;
continue;
}
if (stratifiedContexts!= null && !stratifiedContexts.isEmpty()) {
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
if ( context == null )
continue;
standardDepth += context.getBasePileup().depthOfCoverage();
} else if (perReadAlleleLikelihoodMap != null) {
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = perReadAlleleLikelihoodMap.get(genotype.getSampleName());
if (perReadAlleleLikelihoods == null || perReadAlleleLikelihoods.isEmpty())
continue;
standardDepth += perReadAlleleLikelihoods.getNumberOfStoredElements();
} else if ( genotype.hasDP() ) {
standardDepth += genotype.getDP();
}
}
// if the AD-restricted depth is a usable value (i.e. not zero), then we should use that one going forward
if ( ADrestrictedDepth > 0 )
standardDepth = ADrestrictedDepth;
if ( standardDepth == 0 )
return null;
final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc);
// Hack: when refContext == null then we know we are coming from the HaplotypeCaller and do not want to do a
// full length-based normalization (because the indel length problem is present only in the UnifiedGenotyper)
double QD = -10.0 * vc.getLog10PError() / ((double)standardDepth * indelNormalizationFactor(altAlleleLength, ref != null));
// Hack: see note in the fixTooHighQD method below
QD = fixTooHighQD(QD);
final Map<String, Object> map = new HashMap<>();
map.put(getKeyNames().get(0), String.format("%.2f", QD));
return map;
}
/**
* Generate the indel normalization factor.
*
* @param altAlleleLength the average alternate allele length for the call
* @param increaseNormalizationAsLengthIncreases should we apply a normalization factor based on the allele length?
* @return a possitive double
*/
private double indelNormalizationFactor(final double altAlleleLength, final boolean increaseNormalizationAsLengthIncreases) {
return ( increaseNormalizationAsLengthIncreases ? Math.max(altAlleleLength / 3.0, 1.0) : 1.0);
}
/**
* The haplotype caller generates very high quality scores when multiple events are on the
* same haplotype. This causes some very good variants to have unusually high QD values,
* and VQSR will filter these out. This code looks at the QD value, and if it is above
* threshold we map it down to the mean high QD value, with some jittering
*
* // TODO -- remove me when HaplotypeCaller bubble caller is live
*
* @param QD the raw QD score
* @return a QD value
*/
private double fixTooHighQD(final double QD) {
if ( QD < MAX_QD_BEFORE_FIXING ) {
return QD;
} else {
return IDEAL_HIGH_QD + GenomeAnalysisEngine.getRandomGenerator().nextGaussian() * JITTER_SIGMA;
}
}
private final static double MAX_QD_BEFORE_FIXING = 35;
private final static double IDEAL_HIGH_QD = 30;
private final static double JITTER_SIGMA = 3;
public List<String> getKeyNames() { return Arrays.asList("QD"); }
public List<VCFInfoHeaderLine> getDescriptions() {
return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth"));
}
}

View File

@ -0,0 +1,119 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.variant.vcf.VCFConstants;
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.variant.vcf.VCFStandardHeaderLines;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.util.*;
/**
* Root Mean Square of the mapping quality of the reads across all samples.
*/
public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap ) {
final List<Integer> qualities = new ArrayList<>();
if ( stratifiedContexts != null ) {
if ( stratifiedContexts.size() == 0 )
return null;
for ( final Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
final AlignmentContext context = sample.getValue();
for ( final PileupElement p : context.getBasePileup() )
fillMappingQualitiesFromPileup(p.getRead().getMappingQuality(), qualities);
}
}
else if (perReadAlleleLikelihoodMap != null) {
if ( perReadAlleleLikelihoodMap.size() == 0 )
return null;
for ( final PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) {
for ( final GATKSAMRecord read : perReadLikelihoods.getStoredElements() )
fillMappingQualitiesFromPileup(read.getMappingQuality(), qualities);
}
}
else
return null;
final double rms = MathUtils.rms(qualities);
return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.2f", rms));
}
private static void fillMappingQualitiesFromPileup(final int mq, final List<Integer> qualities) {
if ( mq != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) {
qualities.add(mq);
}
}
public List<String> getKeyNames() { return Arrays.asList(VCFConstants.RMS_MAPPING_QUALITY_KEY); }
public List<VCFInfoHeaderLine> getDescriptions() {
return Arrays.asList(VCFStandardHeaderLines.getInfoLine(getKeyNames().get(0)));
}
}

View File

@ -0,0 +1,264 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.MannWhitneyU;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.Genotype;
import org.broadinstitute.variant.variantcontext.GenotypesContext;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.util.*;
/**
* Abstract root for all RankSum based annotations
*/
public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
static final boolean DEBUG = false;
private boolean useDithering = true;
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
// either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null
final GenotypesContext genotypes = vc.getGenotypes();
if (genotypes == null || genotypes.size() == 0)
return null;
final ArrayList<Double> refQuals = new ArrayList<>();
final ArrayList<Double> altQuals = new ArrayList<>();
for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
boolean usePileup = true;
if ( stratifiedPerReadAlleleLikelihoodMap != null ) {
final PerReadAlleleLikelihoodMap likelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName());
if ( likelihoodMap != null && !likelihoodMap.isEmpty() ) {
fillQualsFromLikelihoodMap(vc.getAlleles(), vc.getStart(), likelihoodMap, refQuals, altQuals);
usePileup = false;
}
}
// the old UG SNP-only path through the annotations
if ( usePileup && stratifiedContexts != null ) {
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
if ( context != null ) {
final ReadBackedPileup pileup = context.getBasePileup();
if ( pileup != null )
fillQualsFromPileup(vc.getAlleles(), pileup, refQuals, altQuals);
}
}
}
if ( refQuals.isEmpty() && altQuals.isEmpty() )
return null;
final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering);
for (final Double qual : altQuals) {
mannWhitneyU.add(qual, MannWhitneyU.USet.SET1);
}
for (final Double qual : refQuals) {
mannWhitneyU.add(qual, MannWhitneyU.USet.SET2);
}
if (DEBUG) {
System.out.format("%s, REF QUALS:", this.getClass().getName());
for (final Double qual : refQuals)
System.out.format("%4.1f ", qual);
System.out.println();
System.out.format("%s, ALT QUALS:", this.getClass().getName());
for (final Double qual : altQuals)
System.out.format("%4.1f ", qual);
System.out.println();
}
// we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases)
final Pair<Double, Double> testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1);
final Map<String, Object> map = new HashMap<>();
if (!Double.isNaN(testResults.first))
map.put(getKeyNames().get(0), String.format("%.3f", testResults.first));
return map;
}
private void fillQualsFromPileup(final List<Allele> alleles,
final ReadBackedPileup pileup,
final List<Double> refQuals,
final List<Double> altQuals) {
for ( final PileupElement p : pileup ) {
if ( isUsableBase(p) ) {
final Double value = getElementForPileupElement(p);
if ( value == null )
continue;
if ( alleles.get(0).equals(Allele.create(p.getBase(), true)) )
refQuals.add(value);
else if ( alleles.contains(Allele.create(p.getBase())) )
altQuals.add(value);
}
}
}
private void fillQualsFromLikelihoodMap(final List<Allele> alleles,
final int refLoc,
final PerReadAlleleLikelihoodMap likelihoodMap,
final List<Double> refQuals,
final List<Double> altQuals) {
for ( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> el : likelihoodMap.getLikelihoodReadMap().entrySet() ) {
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
if ( ! a.isInformative() )
continue; // read is non-informative
final GATKSAMRecord read = el.getKey();
if ( isUsableRead(read, refLoc) ) {
final Double value = getElementForRead(read, refLoc, a);
if ( value == null )
continue;
if ( a.getMostLikelyAllele().isReference() )
refQuals.add(value);
else if ( alleles.contains(a.getMostLikelyAllele()) )
altQuals.add(value);
}
}
}
/**
* Get the element for the given read at the given reference position
*
* @param read the read
* @param refLoc the reference position
* @param mostLikelyAllele the most likely allele for this read
* @return a Double representing the element to be used in the rank sum test, or null if it should not be used
*/
protected Double getElementForRead(final GATKSAMRecord read, final int refLoc, final MostLikelyAllele mostLikelyAllele) {
return getElementForRead(read, refLoc);
}
/**
* Get the element for the given read at the given reference position
*
* @param read the read
* @param refLoc the reference position
* @return a Double representing the element to be used in the rank sum test, or null if it should not be used
*/
protected abstract Double getElementForRead(final GATKSAMRecord read, final int refLoc);
// TODO -- until the ReadPosRankSumTest stops treating these differently, we need to have separate methods for GATKSAMRecords and PileupElements. Yuck.
/**
* Get the element for the given read at the given reference position
*
* By default this function returns null, indicating that the test doesn't support the old style of pileup calculations
*
* @param p the pileup element
* @return a Double representing the element to be used in the rank sum test, or null if it should not be used
*/
protected Double getElementForPileupElement(final PileupElement p) {
// does not work in pileup mode
return null;
}
/**
* Can the base in this pileup element be used in comparative tests between ref / alt bases?
*
* Note that this function by default does not allow deletion pileup elements
*
* @param p the pileup element to consider
* @return true if this base is part of a meaningful read for comparison, false otherwise
*/
protected boolean isUsableBase(final PileupElement p) {
return !(p.isDeletion() ||
p.getMappingQual() == 0 ||
p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here
}
/**
* Can the read be used in comparative tests between ref / alt bases?
*
* @param read the read to consider
* @param refLoc the reference location
* @return true if this read is meaningful for comparison, false otherwise
*/
protected boolean isUsableRead(final GATKSAMRecord read, final int refLoc) {
return !( read.getMappingQuality() == 0 ||
read.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE );
}
/**
* Initialize the rank sum test annotation using walker and engine information. Right now this checks to see if
* engine randomization is turned off, and if so does not dither.
* @param walker the walker
* @param toolkit the GATK engine
* @param headerLines the header lines
*/
public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ) {
useDithering = ! toolkit.getArguments().disableDithering;
}
}

View File

@ -0,0 +1,105 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Fraction of reads containing spanning deletions at this site
*
* <p>Note that this annotation is currently not compatible with HaplotypeCaller.</p>
*/
public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation {
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
if ( stratifiedContexts.size() == 0 )
return null;
// not meaningful when we're at an indel location: deletions that start at location N are by definition called at the position N-1, and at position N-1
// there are no informative deletions in the pileup
if (!vc.isSNP())
return null;
int deletions = 0;
int depth = 0;
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
for ( final PileupElement p : sample.getValue().getBasePileup() ) {
depth++;
if ( p.isDeletion() )
deletions++;
}
}
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%.2f", depth == 0 ? 0.0 : (double)deletions/(double)depth));
return map;
}
public List<String> getKeyNames() { return Arrays.asList("Dels"); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("Dels", 1, VCFHeaderLineType.Float, "Fraction of Reads Containing Spanning Deletions")); }
}

View File

@ -0,0 +1,99 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.variant.variantcontext.Genotype;
import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.vcf.VCFFormatHeaderLine;
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
import java.util.*;
/**
* Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias
* User: rpoplin
* Date: 8/28/13
*/
public class StrandBiasBySample extends GenotypeAnnotation {
public final static String STRAND_BIAS_BY_SAMPLE_KEY_NAME = "SB";
@Override
public void annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
final AlignmentContext stratifiedContext,
final VariantContext vc,
final Genotype g,
final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
if ( ! isAppropriateInput(alleleLikelihoodMap, g) )
return;
final int[][] table = FisherStrand.getContingencyTable(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc);
gb.attribute(STRAND_BIAS_BY_SAMPLE_KEY_NAME, FisherStrand.getContingencyArray(table));
}
@Override
public List<String> getKeyNames() { return Collections.singletonList(STRAND_BIAS_BY_SAMPLE_KEY_NAME); }
@Override
public List<VCFFormatHeaderLine> getDescriptions() { return Collections.singletonList(new VCFFormatHeaderLine(getKeyNames().get(0), 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")); }
private boolean isAppropriateInput(final PerReadAlleleLikelihoodMap map, final Genotype g) {
return ! (map == null || g == null || !g.isCalled());
}
}

View File

@ -0,0 +1,536 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.bqsr;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.CigarElement;
import net.sf.samtools.SAMFileHeader;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.*;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.sting.utils.recalibration.*;
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context).
*
* <p>
* This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating
* only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative
* of poor base quality. This walker generates tables based on various user-specified covariates (such as read group,
* reported quality score, cycle, and context). Since there is a large amount of data one can then calculate an empirical
* probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations.
* The output file is a table (of the several covariate values, num observations, num mismatches, empirical quality score).
* <p>
* Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified.
*
* <p>
*
* <h3>Input</h3>
* <p>
* The input read data whose base quality scores need to be assessed.
* <p>
* A database of known polymorphic sites to skip over.
* </p>
*
* <h3>Output</h3>
* <p>
* A GATK Report file with many tables:
* <ol>
* <li>The list of arguments</li>
* <li>The quantized qualities table</li>
* <li>The recalibration table by read group</li>
* <li>The recalibration table by quality score</li>
* <li>The recalibration table for all the optional covariates</li>
* </ol>
*
* The GATK Report is intended to be easy to read by humans or computers. Check out the documentation of the GATKReport to learn how to manipulate this table.
* </p>
*
* <h3>Examples</h3>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -T BaseRecalibrator \
* -I my_reads.bam \
* -R resources/Homo_sapiens_assembly18.fasta \
* -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
* -knownSites another/optional/setOfSitesToMask.vcf \
* -o recal_data.table
* </pre>
*/
@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class})
@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
@PartitionBy(PartitionType.READ)
public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSchedulable {
/**
* all the command line arguments for BQSR and it's covariates
*/
@ArgumentCollection
private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
/**
* When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency
* purposes. If you have many covariates, and therefore are using a lot of memory, you can use this flag
* to safely access only one table. There may be some CPU cost, but as long as the table is really big
* there should be relatively little CPU costs.
*/
@Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false)
public boolean lowMemoryMode = false;
@Advanced
@Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false)
public double BAQGOP = BAQ.DEFAULT_GOP;
/**
* an object that keeps track of the information necessary for quality score quantization
*/
private QuantizationInfo quantizationInfo;
/**
* list to hold the all the covariate objects that were requested (required + standard + experimental)
*/
private Covariate[] requestedCovariates;
private RecalibrationEngine recalibrationEngine;
private int minimumQToUse;
private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.";
private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector
private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation
private final static byte NO_BAQ_UNCERTAINTY = (byte)'@';
/**
* Parse the -cov arguments and create a list of covariates to be used here
* Based on the covariates' estimates for initial capacity allocate the data hashmap
*/
public void initialize() {
baq = new BAQ(BAQGOP); // setup the BAQ object with the provided gap open penalty
if (RAC.FORCE_PLATFORM != null)
RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
if (RAC.knownSites.isEmpty() && !RAC.RUN_WITHOUT_DBSNP) // Warn the user if no dbSNP file or other variant mask was specified
throw new UserException.CommandLineException(NO_DBSNP_EXCEPTION);
if (RAC.LIST_ONLY) {
RecalUtils.listAvailableCovariates(logger);
System.exit(0);
}
RAC.existingRecalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table
Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates
ArrayList<Covariate> requiredCovariates = covariates.getFirst();
ArrayList<Covariate> optionalCovariates = covariates.getSecond();
requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()];
int covariateIndex = 0;
for (final Covariate covariate : requiredCovariates)
requestedCovariates[covariateIndex++] = covariate;
for (final Covariate covariate : optionalCovariates)
requestedCovariates[covariateIndex++] = covariate;
logger.info("The covariates being used here: ");
for (Covariate cov : requestedCovariates) { // list all the covariates being used
logger.info("\t" + cov.getClass().getSimpleName());
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
}
try {
RAC.RECAL_TABLE = new PrintStream(RAC.RECAL_TABLE_FILE);
} catch (IOException e) {
throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_TABLE_FILE, e);
}
initializeRecalibrationEngine();
minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN;
referenceReader = getToolkit().getReferenceDataSource().getReference();
}
/**
* Initialize the recalibration engine
*/
private void initializeRecalibrationEngine() {
int numReadGroups = 0;
for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() )
numReadGroups += header.getReadGroups().size();
recalibrationEngine = new RecalibrationEngine(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG, lowMemoryMode);
}
private boolean isLowQualityBase( final GATKSAMRecord read, final int offset ) {
return read.getBaseQualities()[offset] < minimumQToUse;
}
/**
* For each read at this locus get the various covariate values and increment that location in the map based on
* whether or not the base matches the reference at this particular location
*/
public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) {
final GATKSAMRecord read = ReadClipper.hardClipSoftClippedBases( ReadClipper.hardClipAdaptorSequence(originalRead) );
if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it
RecalUtils.parsePlatformForRead(read, RAC);
if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { // parse the solid color space and check for color no-calls
return 0L; // skip this read completely
}
final int[] isSNP = calculateIsSNP(read, ref, originalRead);
final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION);
final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION);
final int nErrors = nEvents(isSNP, isInsertion, isDeletion);
// note for efficiency regions we don't compute the BAQ array unless we actually have
// some error to marginalize over. For ILMN data ~85% of reads have no error
final byte[] baqArray = nErrors == 0 ? flatBAQArray(read) : calculateBAQArray(read);
if( baqArray != null ) { // some reads just can't be BAQ'ed
final ReadCovariates covariates = RecalUtils.computeCovariates(read, requestedCovariates);
final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases
final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray);
final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray);
final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray);
// aggregate all of the info into our info object, and update the data
final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skip, snpErrors, insertionErrors, deletionErrors);
recalibrationEngine.updateDataForRead(info);
return 1L;
} else {
return 0L;
}
}
/**
* Compute the number of mutational events across all hasEvent vectors
*
* Simply the sum of entries in hasEvents
*
* @param hasEvents a vector a vectors of 0 (no event) and 1 (has event)
* @return the total number of events across all hasEvent arrays
*/
protected static int nEvents(final int[]... hasEvents) {
int n = 0;
for ( final int[] hasEvent : hasEvents ) {
n += MathUtils.sum(hasEvent);
}
return n;
}
protected boolean[] calculateSkipArray( final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) {
final byte[] bases = read.getReadBases();
final boolean[] skip = new boolean[bases.length];
final boolean[] knownSites = calculateKnownSites(read, metaDataTracker.getValues(RAC.knownSites));
for( int iii = 0; iii < bases.length; iii++ ) {
skip[iii] = !BaseUtils.isRegularBase(bases[iii]) || isLowQualityBase(read, iii) || knownSites[iii] || badSolidOffset(read, iii);
}
return skip;
}
protected boolean badSolidOffset( final GATKSAMRecord read, final int offset ) {
return ReadUtils.isSOLiDRead(read) && RAC.SOLID_RECAL_MODE != RecalUtils.SOLID_RECAL_MODE.DO_NOTHING && !RecalUtils.isColorSpaceConsistent(read, offset);
}
protected static boolean[] calculateKnownSites( final GATKSAMRecord read, final List<Feature> features ) {
final int readLength = read.getReadBases().length;
final boolean[] knownSites = new boolean[readLength];
Arrays.fill(knownSites, false);
for( final Feature f : features ) {
int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here?
if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
featureStartOnRead = 0;
}
int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true);
if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
featureEndOnRead = readLength;
}
if( featureStartOnRead > readLength ) {
featureStartOnRead = featureEndOnRead = readLength;
}
Arrays.fill(knownSites, Math.max(0, featureStartOnRead), Math.min(readLength, featureEndOnRead + 1), true);
}
return knownSites;
}
// BUGBUG: can be merged with calculateIsIndel
protected static int[] calculateIsSNP( final GATKSAMRecord read, final ReferenceContext ref, final GATKSAMRecord originalRead ) {
final byte[] readBases = read.getReadBases();
final byte[] refBases = Arrays.copyOfRange(ref.getBases(), read.getAlignmentStart() - originalRead.getAlignmentStart(), ref.getBases().length + read.getAlignmentEnd() - originalRead.getAlignmentEnd());
final int[] snp = new int[readBases.length];
int readPos = 0;
int refPos = 0;
for ( final CigarElement ce : read.getCigar().getCigarElements() ) {
final int elementLength = ce.getLength();
switch (ce.getOperator()) {
case M:
case EQ:
case X:
for( int iii = 0; iii < elementLength; iii++ ) {
snp[readPos] = ( BaseUtils.basesAreEqual(readBases[readPos], refBases[refPos]) ? 0 : 1 );
readPos++;
refPos++;
}
break;
case D:
case N:
refPos += elementLength;
break;
case I:
case S: // ReferenceContext doesn't have the soft clipped bases!
readPos += elementLength;
break;
case H:
case P:
break;
default:
throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator());
}
}
return snp;
}
protected static int[] calculateIsIndel( final GATKSAMRecord read, final EventType mode ) {
final int[] indel = new int[read.getReadBases().length];
int readPos = 0;
for ( final CigarElement ce : read.getCigar().getCigarElements() ) {
final int elementLength = ce.getLength();
switch (ce.getOperator()) {
case M:
case EQ:
case X:
case S:
{
readPos += elementLength;
break;
}
case D:
{
final int index = ( read.getReadNegativeStrandFlag() ? readPos : readPos - 1 );
updateIndel(indel, index, mode, EventType.BASE_DELETION);
break;
}
case I:
{
final boolean forwardStrandRead = !read.getReadNegativeStrandFlag();
if( forwardStrandRead ) {
updateIndel(indel, readPos - 1, mode, EventType.BASE_INSERTION);
}
readPos += elementLength;
if( !forwardStrandRead ) {
updateIndel(indel, readPos, mode, EventType.BASE_INSERTION);
}
break;
}
case N:
case H:
case P:
break;
default:
throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator());
}
}
return indel;
}
private static void updateIndel(final int[] indel, final int index, final EventType mode, final EventType requiredMode) {
if ( mode == requiredMode && index >= 0 && index < indel.length )
// protect ourselves from events at the start or end of the read (1D3M or 3M1D)
indel[index] = 1;
}
protected static double[] calculateFractionalErrorArray( final int[] errorArray, final byte[] baqArray ) {
if(errorArray.length != baqArray.length ) {
throw new ReviewedStingException("Array length mismatch detected. Malformed read?");
}
final int BLOCK_START_UNSET = -1;
final double[] fractionalErrors = new double[baqArray.length];
Arrays.fill(fractionalErrors, 0.0);
boolean inBlock = false;
int blockStartIndex = BLOCK_START_UNSET;
int iii;
for( iii = 0; iii < fractionalErrors.length; iii++ ) {
if( baqArray[iii] == NO_BAQ_UNCERTAINTY ) {
if( !inBlock ) {
fractionalErrors[iii] = (double) errorArray[iii];
} else {
calculateAndStoreErrorsInBlock(iii, blockStartIndex, errorArray, fractionalErrors);
inBlock = false; // reset state variables
blockStartIndex = BLOCK_START_UNSET; // reset state variables
}
} else {
inBlock = true;
if( blockStartIndex == BLOCK_START_UNSET ) { blockStartIndex = iii; }
}
}
if( inBlock ) {
calculateAndStoreErrorsInBlock(iii-1, blockStartIndex, errorArray, fractionalErrors);
}
if( fractionalErrors.length != errorArray.length ) {
throw new ReviewedStingException("Output array length mismatch detected. Malformed read?");
}
return fractionalErrors;
}
private static void calculateAndStoreErrorsInBlock( final int iii,
final int blockStartIndex,
final int[] errorArray,
final double[] fractionalErrors ) {
int totalErrors = 0;
for( int jjj = Math.max(0,blockStartIndex-1); jjj <= iii; jjj++ ) {
totalErrors += errorArray[jjj];
}
for( int jjj = Math.max(0, blockStartIndex-1); jjj <= iii; jjj++ ) {
fractionalErrors[jjj] = ((double) totalErrors) / ((double)(iii - Math.max(0,blockStartIndex-1) + 1));
}
}
/**
* Create a BAQ style array that indicates no alignment uncertainty
* @param read the read for which we want a BAQ array
* @return a BAQ-style non-null byte[] counting NO_BAQ_UNCERTAINTY values
* // TODO -- could be optimized avoiding this function entirely by using this inline if the calculation code above
*/
protected static byte[] flatBAQArray(final GATKSAMRecord read) {
final byte[] baq = new byte[read.getReadLength()];
Arrays.fill(baq, NO_BAQ_UNCERTAINTY);
return baq;
}
/**
* Compute an actual BAQ array for read, based on its quals and the reference sequence
* @param read the read to BAQ
* @return a non-null BAQ tag array for read
*/
private byte[] calculateBAQArray( final GATKSAMRecord read ) {
baq.baqRead(read, referenceReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.ADD_TAG);
return BAQ.getBAQTag(read);
}
/**
* Initialize the reduce step by returning 0L
*
* @return returns 0L
*/
public Long reduceInit() {
return 0L;
}
/**
* The Reduce method doesn't do anything for this walker.
*
* @param mapped Result of the map. This value is immediately ignored.
* @param sum The summing CountedData used to output the CSV data
* @return returns The sum used to output the CSV data
*/
public Long reduce(Long mapped, Long sum) {
sum += mapped;
return sum;
}
@Override
public void onTraversalDone(Long result) {
recalibrationEngine.finalizeData();
logger.info("Calculating quantized quality scores...");
quantizeQualityScores();
logger.info("Writing recalibration report...");
generateReport();
logger.info("...done!");
logger.info("BaseRecalibrator was able to recalibrate " + result + " reads");
}
private RecalibrationTables getRecalibrationTable() {
return recalibrationEngine.getFinalRecalibrationTables();
}
/**
* go through the quality score table and use the # observations and the empirical quality score
* to build a quality score histogram for quantization. Then use the QuantizeQual algorithm to
* generate a quantization map (recalibrated_qual -> quantized_qual)
*/
private void quantizeQualityScores() {
quantizationInfo = new QuantizationInfo(getRecalibrationTable(), RAC.QUANTIZING_LEVELS);
}
private void generateReport() {
RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, getRecalibrationTable(), requestedCovariates, RAC.SORT_BY_ALL_COLUMNS);
}
}

View File

@ -0,0 +1,292 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.variant.variantcontext.*;
import java.util.*;
/**
* Code for determining which indels are segregating among the samples.
*
* This code is just a refactor of the original code from Guillermo in the UG.
*
* @author Mark DePristo
* @since 3/26/12
*/
public class ConsensusAlleleCounter {
final protected static Logger logger = Logger.getLogger(ConsensusAlleleCounter.class);
private final int minIndelCountForGenotyping;
private final boolean doMultiAllelicCalls;
private final double minFractionInOneSample;
public ConsensusAlleleCounter(final boolean doMultiAllelicCalls,
final int minIndelCountForGenotyping,
final double minFractionInOneSample) {
this.minIndelCountForGenotyping = minIndelCountForGenotyping;
this.doMultiAllelicCalls = doMultiAllelicCalls;
this.minFractionInOneSample = minFractionInOneSample;
}
/**
* Returns a list of Alleles at this locus that may be segregating
*
* @param ref
* @param contexts
* @param contextType
* @return
*/
public List<Allele> computeConsensusAlleles(ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType) {
final Map<String, Integer> consensusIndelStrings = countConsensusAlleles(ref, contexts, contextType);
return consensusCountsToAlleles(ref, consensusIndelStrings);
}
//
// TODO -- WARNING DOESN'T WORK WITH REDUCED READS
//
private Map<String, Integer> countConsensusAlleles(ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType) {
final GenomeLoc loc = ref.getLocus();
HashMap<String, Integer> consensusIndelStrings = new HashMap<String, Integer>();
int insCount = 0, delCount = 0;
// quick check of total number of indels in pileup
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
final AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
final ReadBackedPileup indelPileup = context.getBasePileup();
insCount += indelPileup.getNumberOfInsertionsAfterThisElement();
delCount += indelPileup.getNumberOfDeletionsAfterThisElement();
}
if ( insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping )
return Collections.emptyMap();
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
// todo -- warning, can be duplicating expensive partition here
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
final ReadBackedPileup indelPileup = context.getBasePileup();
final int nIndelReads = indelPileup.getNumberOfInsertionsAfterThisElement() + indelPileup.getNumberOfDeletionsAfterThisElement();
final int nReadsOverall = indelPileup.getNumberOfElements();
if ( nIndelReads == 0 || (nIndelReads / (1.0 * nReadsOverall)) < minFractionInOneSample ) {
continue;
}
for (PileupElement p : indelPileup) {
final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
if (read == null)
continue;
if (ReadUtils.is454Read(read)) {
continue;
}
if ( p.isBeforeInsertion() ) {
final String insertionBases = p.getBasesOfImmediatelyFollowingInsertion();
// edge case: ignore a deletion immediately preceding an insertion as p.getBasesOfImmediatelyFollowingInsertion() returns null [EB]
if ( insertionBases == null )
continue;
boolean foundKey = false;
// copy of hashmap into temp arrayList
ArrayList<Pair<String,Integer>> cList = new ArrayList<Pair<String,Integer>>();
for (Map.Entry<String, Integer> s : consensusIndelStrings.entrySet()) {
cList.add(new Pair<String, Integer>(s.getKey(), s.getValue()));
}
if (read.getAlignmentEnd() == loc.getStart()) {
// first corner condition: a read has an insertion at the end, and we're right at the insertion.
// In this case, the read could have any of the inserted bases and we need to build a consensus
for (int k=0; k < cList.size(); k++) {
String s = cList.get(k).getFirst();
int cnt = cList.get(k).getSecond();
// case 1: current insertion is prefix of indel in hash map
if (s.startsWith(insertionBases)) {
cList.set(k,new Pair<String, Integer>(s,cnt+1));
foundKey = true;
}
else if (insertionBases.startsWith(s)) {
// case 2: indel stored in hash table is prefix of current insertion
// In this case, new bases are new key.
foundKey = true;
cList.set(k,new Pair<String, Integer>(insertionBases,cnt+1));
}
}
if (!foundKey)
// none of the above: event bases not supported by previous table, so add new key
cList.add(new Pair<String, Integer>(insertionBases,1));
}
else if (read.getAlignmentStart() == loc.getStart()+1) {
// opposite corner condition: read will start at current locus with an insertion
for (int k=0; k < cList.size(); k++) {
String s = cList.get(k).getFirst();
int cnt = cList.get(k).getSecond();
if (s.endsWith(insertionBases)) {
// case 1: current insertion (indelString) is suffix of indel in hash map (s)
cList.set(k,new Pair<String, Integer>(s,cnt+1));
foundKey = true;
}
else if (insertionBases.endsWith(s)) {
// case 2: indel stored in hash table is prefix of current insertion
// In this case, new bases are new key.
foundKey = true;
cList.set(k,new Pair<String, Integer>(insertionBases,cnt+1));
}
}
if (!foundKey)
// none of the above: event bases not supported by previous table, so add new key
cList.add(new Pair<String, Integer>(insertionBases,1));
}
else {
// normal case: insertion somewhere in the middle of a read: add count to arrayList
int cnt = consensusIndelStrings.containsKey(insertionBases)? consensusIndelStrings.get(insertionBases):0;
cList.add(new Pair<String, Integer>(insertionBases,cnt+1));
}
// copy back arrayList into hashMap
consensusIndelStrings.clear();
for (Pair<String,Integer> pair : cList) {
consensusIndelStrings.put(pair.getFirst(),pair.getSecond());
}
}
else if ( p.isBeforeDeletionStart() ) {
final String deletionString = String.format("D%d",p.getLengthOfImmediatelyFollowingIndel());
int cnt = consensusIndelStrings.containsKey(deletionString)? consensusIndelStrings.get(deletionString):0;
consensusIndelStrings.put(deletionString,cnt+1);
}
}
}
return consensusIndelStrings;
}
private List<Allele> consensusCountsToAlleles(final ReferenceContext ref,
final Map<String, Integer> consensusIndelStrings) {
final GenomeLoc loc = ref.getLocus();
final Collection<VariantContext> vcs = new ArrayList<VariantContext>();
int maxAlleleCnt = 0;
Allele refAllele, altAllele;
for (final Map.Entry<String, Integer> elt : consensusIndelStrings.entrySet()) {
final String s = elt.getKey();
final int curCnt = elt.getValue();
int stop = 0;
// if observed count if above minimum threshold, we will genotype this allele
if (curCnt < minIndelCountForGenotyping)
continue;
if (s.startsWith("D")) {
// get deletion length
final int dLen = Integer.valueOf(s.substring(1));
// get ref bases of accurate deletion
final int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart();
stop = loc.getStart() + dLen;
final byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference - 1, startIdxInReference + dLen); // add reference padding
if (Allele.acceptableAlleleBases(refBases, false)) {
refAllele = Allele.create(refBases, true);
altAllele = Allele.create(ref.getBase(), false);
}
else continue; // don't go on with this allele if refBases are non-standard
} else {
// insertion case
final String insertionBases = (char)ref.getBase() + s; // add reference padding
if (Allele.acceptableAlleleBases(insertionBases, false)) { // don't allow N's in insertions
refAllele = Allele.create(ref.getBase(), true);
altAllele = Allele.create(insertionBases, false);
stop = loc.getStart();
}
else continue; // go on to next allele if consensus insertion has any non-standard base.
}
final VariantContextBuilder builder = new VariantContextBuilder().source("");
builder.loc(loc.getContig(), loc.getStart(), stop);
builder.alleles(Arrays.asList(refAllele, altAllele));
builder.noGenotypes();
if (doMultiAllelicCalls) {
vcs.add(builder.make());
if (vcs.size() >= GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED)
break;
} else if (curCnt > maxAlleleCnt) {
maxAlleleCnt = curCnt;
vcs.clear();
vcs.add(builder.make());
}
}
if (vcs.isEmpty())
return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion
final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
return mergedVC.getAlleles();
}
}

View File

@ -0,0 +1,500 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
import org.broadinstitute.sting.utils.genotyper.DiploidGenotype;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.List;
import static java.lang.Math.log10;
import static java.lang.Math.pow;
/**
* Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors,
* and posteriors given a pile of bases and quality scores
*
* Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object
* calculates:
*
* P(G | D) = P(G) * P(D | G)
*
* where
*
* P(D | G) = sum_i log10 P(bi | G)
*
* and
*
* P(bi | G) = 1 - P(error | q1) if bi is in G
* = P(error | q1) / 3 if bi is not in G
*
* for homozygous genotypes and for heterozygous genotypes:
*
* P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G
* = P(error | q1) / 3 if bi is not in G
*
* for each of the 10 unique diploid genotypes AA, AC, AG, .., TT
*
* Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space.
*
* The priors contain the relative probabilities of each genotype, and must be provided at object creation.
* From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above
* model.
*/
public class DiploidSNPGenotypeLikelihoods implements Cloneable {
public final static double DEFAULT_PCR_ERROR_RATE = FragmentUtils.DEFAULT_PCR_ERROR_RATE;
protected final static int FIXED_PLOIDY = 2;
protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1;
protected final static double ploidyAdjustment = log10(FIXED_PLOIDY);
protected final static double log10_3 = log10(3.0);
protected boolean VERBOSE = false;
//
// The fundamental data arrays associated with a Genotype Likelihoods object
//
protected double[] log10Likelihoods = null;
// TODO: don't calculate this each time through
protected double log10_PCR_error_3;
protected double log10_1_minus_PCR_error;
/**
* Create a new GenotypeLikelhoods object with given PCR error rate for each diploid genotype
*
* @param PCR_error_rate the PCR error rate
*/
public DiploidSNPGenotypeLikelihoods(double PCR_error_rate) {
log10_PCR_error_3 = log10(PCR_error_rate) - log10_3;
log10_1_minus_PCR_error = log10(1.0 - PCR_error_rate);
setToZero();
}
/**
* Cloning of the object
* @return clone
* @throws CloneNotSupportedException
*/
protected Object clone() throws CloneNotSupportedException {
DiploidSNPGenotypeLikelihoods c = (DiploidSNPGenotypeLikelihoods)super.clone();
c.log10Likelihoods = log10Likelihoods.clone();
return c;
}
protected void setToZero() {
log10Likelihoods = genotypeZeros.clone(); // likelihoods are all zeros
}
/**
* Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values()
* @return likelihoods array
*/
public double[] getLikelihoods() {
return log10Likelihoods;
}
// -------------------------------------------------------------------------------------
//
// add() routines. These are the workhorse routines for calculating the overall genotype
// likelihoods given observed bases and reads. Includes high-level operators all the
// way down to single base and qual functions.
//
// -------------------------------------------------------------------------------------
/**
* Updates likelihoods and posteriors to reflect the additional observations contained within the
* read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
* pileup
*
* @param pileup read pileup
* @param ignoreBadBases should we ignore bad bases?
* @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality?
* @param minBaseQual the minimum base quality at which to consider a base valid
* @return the number of good bases found in the pileup
*/
public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
int n = 0;
// for each fragment, add to the likelihoods
FragmentCollection<PileupElement> fpile = pileup.toFragments();
for ( PileupElement p : fpile.getSingletonReads() )
n += add(p, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
for ( List<PileupElement> overlappingPair : fpile.getOverlappingPairs() )
n += add(overlappingPair, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
return n;
}
public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
byte obsBase = elt.getBase();
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
if ( qual == 0 )
return 0;
return add(obsBase, qual, (byte)0, (byte)0, 1);
}
public int add(List<PileupElement> overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
final PileupElement p1 = overlappingPair.get(0);
final PileupElement p2 = overlappingPair.get(1);
final byte observedBase1 = p1.getBase();
final byte qualityScore1 = qualToUse(p1, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
final byte observedBase2 = p2.getBase();
final byte qualityScore2 = qualToUse(p2, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
if ( qualityScore1 == 0 ) {
if ( qualityScore2 == 0 ) // abort early if we didn't see any good bases
return 0;
else {
return add(observedBase2, qualityScore2, (byte)0, (byte)0);
}
} else {
return add(observedBase1, qualityScore1, observedBase2, qualityScore2);
}
}
/**
*
* @param obsBase1 first observed base
* @param qual1 base qual of first observed base
* @param obsBase2 second observed base
* @param qual2 base qual of second observed base; can be 0, indicating no second base was observed for this fragment
* @param nObs the number of times this quad of values was seen. Generally 1, but reduced reads can have nObs > 1 for synthetic reads
* @return 0 if the base is bad, 1 otherwise
*/
private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2, int nObs) {
// TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine
// TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future.
// TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here.
// Just look up the cached result if it's available, or compute and store it
DiploidSNPGenotypeLikelihoods gl;
if ( ! inCache(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY) ) {
gl = calculateCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY);
} else {
gl = getCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY);
}
// for bad bases, there are no likelihoods
if ( gl == null )
return 0;
double[] likelihoods = gl.getLikelihoods();
for ( DiploidGenotype g : DiploidGenotype.values() ) {
double likelihood = likelihoods[g.ordinal()];
log10Likelihoods[g.ordinal()] += likelihood * nObs;
}
return 1;
}
private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) {
return add(obsBase1, qual1, obsBase2, qual2, 1);
}
// -------------------------------------------------------------------------------------
//
// Dealing with the cache routines
//
// -------------------------------------------------------------------------------------
static DiploidSNPGenotypeLikelihoods[][][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseUtils.BASES.length][QualityUtils.MAX_SAM_QUAL_SCORE +1][BaseUtils.BASES.length+1][QualityUtils.MAX_SAM_QUAL_SCORE +1][MAX_PLOIDY];
protected boolean inCache(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
return getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy) != null;
}
protected DiploidSNPGenotypeLikelihoods getCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
DiploidSNPGenotypeLikelihoods gl = getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy);
if ( gl == null )
throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base1=%c, qual1=%d, base2=%c, qual2=%d, ploidy=%d",
observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy));
return gl;
}
protected DiploidSNPGenotypeLikelihoods calculateCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
DiploidSNPGenotypeLikelihoods gl = calculateGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2);
setCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy, gl);
return gl;
}
protected void setCache( DiploidSNPGenotypeLikelihoods[][][][][] cache,
byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy,
DiploidSNPGenotypeLikelihoods val ) {
int i = BaseUtils.simpleBaseToBaseIndex(observedBase1);
int j = qualityScore1;
int k = qualityScore2 != 0 ? BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length;
int l = qualityScore2;
int m = ploidy;
cache[i][j][k][l][m] = val;
}
protected DiploidSNPGenotypeLikelihoods getCache(DiploidSNPGenotypeLikelihoods[][][][][] cache,
byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
int i = BaseUtils.simpleBaseToBaseIndex(observedBase1);
int j = qualityScore1;
int k = qualityScore2 != 0 ? BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length;
int l = qualityScore2;
int m = ploidy;
return cache[i][j][k][l][m];
}
protected DiploidSNPGenotypeLikelihoods calculateGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) {
double[] log10FourBaseLikelihoods = computeLog10Likelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2);
try {
DiploidSNPGenotypeLikelihoods gl = (DiploidSNPGenotypeLikelihoods)this.clone();
gl.setToZero();
// we need to adjust for ploidy. We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space
for ( DiploidGenotype g : DiploidGenotype.values() ) {
// todo assumes ploidy is 2 -- should be generalized. Obviously the below code can be turned into a loop
double p_base = 0.0;
p_base += pow(10, log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base1)] - ploidyAdjustment);
p_base += pow(10, log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base2)] - ploidyAdjustment);
final double likelihood = log10(p_base);
gl.log10Likelihoods[g.ordinal()] += likelihood;
}
if ( VERBOSE ) {
for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); }
System.out.println();
for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", gl.log10Likelihoods[g.ordinal()]); }
System.out.println();
}
return gl;
} catch ( CloneNotSupportedException e ) {
throw new RuntimeException(e);
}
}
/**
* Updates likelihoods and posteriors to reflect an additional observation of observedBase with
* qualityScore.
*
* @param observedBase1 the base observed on the 1st read of the fragment
* @param qualityScore1 the qual of the base on the 1st read of the fragment, or zero if NA
* @param observedBase2 the base observed on the 2nd read of the fragment
* @param qualityScore2 the qual of the base on the 2nd read of the fragment, or zero if NA
* @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example)
*/
protected double[] computeLog10Likelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) {
double[] log10FourBaseLikelihoods = baseZeros.clone();
for ( byte trueBase : BaseUtils.BASES ) {
double likelihood = 0.0;
for ( byte fragmentBase : BaseUtils.BASES ) {
double log10FragmentLikelihood = (trueBase == fragmentBase ? log10_1_minus_PCR_error : log10_PCR_error_3);
if ( qualityScore1 != 0 ) {
log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase1, fragmentBase, qualityScore1);
}
if ( qualityScore2 != 0 ) {
log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase2, fragmentBase, qualityScore2);
}
//if ( VERBOSE ) {
// System.out.printf(" L(%c | b=%s, Q=%d) = %f / %f%n",
// observedBase, trueBase, qualityScore, pow(10,likelihood) * 100, likelihood);
//}
likelihood += pow(10, log10FragmentLikelihood);
}
log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(trueBase)] = log10(likelihood);
}
return log10FourBaseLikelihoods;
}
/**
*
* @param observedBase observed base
* @param chromBase target base
* @param qual base quality
* @return log10 likelihood
*/
protected double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual) {
double logP;
if ( observedBase == chromBase ) {
// the base is consistent with the chromosome -- it's 1 - e
//logP = oneMinusData[qual];
double e = pow(10, (qual / -10.0));
logP = log10(1.0 - e);
} else {
// the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error)
logP = qual / -10.0 + (-log10_3);
}
//System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP);
return logP;
}
/**
* Helper function that returns the phred-scaled base quality score we should use for calculating
* likelihoods for a pileup element. May return 0 to indicate that the observation is bad, and may
* cap the quality score by the mapping quality of the read itself.
*
* @param p Pileup element
* @param ignoreBadBases Should we ignore bad bases?
* @param capBaseQualsAtMappingQual Should we cap the base qualities at the mapping quality of the read?
* @param minBaseQual Minimum allowed base quality
* @return the actual base quality to use
*/
private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) )
return 0;
byte qual = p.getQual();
if ( qual > SAMUtils.MAX_PHRED_SCORE )
throw new UserException.MisencodedBAM(p.getRead(), "we encountered an extremely high quality score (" + (int)qual + ")");
if ( capBaseQualsAtMappingQual )
qual = (byte) Math.min( 0xff & qual, p.getMappingQual());
if ( (int)qual < minBaseQual )
qual = (byte)0;
return qual;
}
// -----------------------------------------------------------------------------------------------------------------
//
//
// helper routines
//
//
// -----------------------------------------------------------------------------------------------------------------
/**
* Return a string representation of this object in a moderately usable form
*
* @return string representation
*/
public String toString() {
double sum = 0;
StringBuilder s = new StringBuilder();
for (DiploidGenotype g : DiploidGenotype.values()) {
s.append(String.format("%s %.10f ", g, log10Likelihoods[g.ordinal()]));
sum += Math.pow(10,log10Likelihoods[g.ordinal()]);
}
s.append(String.format(" %f", sum));
return s.toString();
}
// -----------------------------------------------------------------------------------------------------------------
//
//
// Validation routines
//
//
// -----------------------------------------------------------------------------------------------------------------
public boolean validate() {
return validate(true);
}
public boolean validate(boolean throwException) {
try {
for ( DiploidGenotype g : DiploidGenotype.values() ) {
String bad = null;
int i = g.ordinal();
if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) {
bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]);
}
if ( bad != null ) {
throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
}
}
} catch ( IllegalStateException e ) {
if ( throwException )
throw new RuntimeException(e);
else
return false;
}
return true;
}
//
// Constant static data
//
private final static double[] genotypeZeros = new double[DiploidGenotype.values().length];
private final static double[] baseZeros = new double[BaseUtils.BASES.length];
static {
for ( DiploidGenotype g : DiploidGenotype.values() ) {
genotypeZeros[g.ordinal()] = 0.0;
}
for ( byte base : BaseUtils.BASES ) {
baseZeros[BaseUtils.simpleBaseToBaseIndex(base)] = 0.0;
}
}
}

View File

@ -0,0 +1,342 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
/**
* Created by IntelliJ IDEA.
* User: carneiro
* Date: 7/21/11
* Time: 2:21 PM
*
* This is a site based implementation of an Error Model. The error model is a probability
* distribution for the site given the phred scaled quality.
*/
public class ErrorModel {
private byte maxQualityScore;
private byte minQualityScore;
private byte phredScaledPrior;
private double log10minPower;
private int refDepth;
private boolean hasData = false;
private ProbabilityVector probabilityVector;
private static final boolean compressRange = false;
private static final double log10MinusE = Math.log10(Math.exp(1.0));
private static final boolean DEBUG = false;
/**
* Calculates the probability of the data (reference sample reads) given the phred scaled site quality score.
*
* @param UAC Argument Collection
* @param refSamplePileup Reference sample pileup
* @param refSampleVC VC with True alleles in reference sample pileup
*/
public ErrorModel (final UnifiedArgumentCollection UAC,
final ReadBackedPileup refSamplePileup,
VariantContext refSampleVC, final ReferenceContext refContext) {
this.maxQualityScore = UAC.maxQualityScore;
this.minQualityScore = UAC.minQualityScore;
this.phredScaledPrior = UAC.phredScaledPrior;
log10minPower = Math.log10(UAC.minPower);
PairHMMIndelErrorModel pairModel = null;
LinkedHashMap<Allele, Haplotype> haplotypeMap = null;
double[][] perReadLikelihoods = null;
double[] model = new double[maxQualityScore+1];
Arrays.fill(model,Double.NEGATIVE_INFINITY);
boolean hasCalledAlleles = false;
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
if (refSampleVC != null) {
for (Allele allele : refSampleVC.getAlleles()) {
if (allele.isCalled()) {
hasCalledAlleles = true;
break;
}
}
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
if (refSampleVC.isIndel()) {
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements
}
}
double p = QualityUtils.qualToErrorProbLog10((byte)(maxQualityScore-minQualityScore));
if (refSamplePileup == null || refSampleVC == null || !hasCalledAlleles) {
for (byte q=minQualityScore; q<=maxQualityScore; q++) {
// maximum uncertainty if there's no ref data at site
model[q] = p;
}
this.refDepth = 0;
}
else {
hasData = true;
int matches = 0;
int coverage = 0;
Allele refAllele = refSampleVC.getReference();
if ( refSampleVC.isIndel()) {
//perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()];
final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles());
if (!haplotypeMap.isEmpty())
perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, perReadAlleleLikelihoodMap);
}
int idx = 0;
for (PileupElement refPileupElement : refSamplePileup) {
if (DEBUG)
System.out.println(refPileupElement.toString());
boolean isMatch = false;
for (Allele allele : refSampleVC.getAlleles()) {
boolean m = pileupElementMatches(refPileupElement, allele, refAllele, refContext.getBase());
if (DEBUG) System.out.println(m);
isMatch |= m;
}
if (refSampleVC.isIndel() && !haplotypeMap.isEmpty()) {
// ignore match/mismatch if reads, as determined by their likelihood, are not informative
double[] perAlleleLikelihoods = perReadLikelihoods[idx++];
if (!isInformativeElement(perAlleleLikelihoods))
matches++;
else
matches += (isMatch?1:0);
} else {
matches += (isMatch?1:0);
}
coverage++;
}
int mismatches = coverage - matches;
//System.out.format("Cov:%d match:%d mismatch:%d\n",coverage, matches, mismatches);
for (byte q=minQualityScore; q<=maxQualityScore; q++) {
if (coverage==0)
model[q] = p;
else
model[q] = log10PoissonProbabilitySiteGivenQual(q,coverage, mismatches);
}
this.refDepth = coverage;
}
// compress probability vector
this.probabilityVector = new ProbabilityVector(model, compressRange);
}
@Requires("likelihoods.length>0")
private boolean isInformativeElement(double[] likelihoods) {
// if likelihoods are the same, they're not informative
final double thresh = 0.1;
int maxIdx = MathUtils.maxElementIndex(likelihoods);
int minIdx = MathUtils.minElementIndex(likelihoods);
if (likelihoods[maxIdx]-likelihoods[minIdx]< thresh)
return false;
else
return true;
}
/**
* Simple constructor that just takes a given log-probability vector as error model.
* Only intended for unit testing, not general usage.
* @param pvector Given vector of log-probabilities
*
*/
public ErrorModel(double[] pvector) {
this.maxQualityScore = (byte)(pvector.length-1);
this.minQualityScore = 0;
this.probabilityVector = new ProbabilityVector(pvector, compressRange);
this.hasData = true;
}
public static boolean pileupElementMatches(PileupElement pileupElement, Allele allele, Allele refAllele, byte refBase) {
if (DEBUG)
System.out.format("PE: base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d Allele:%s RefAllele:%s\n",
pileupElement.getBase(), pileupElement.isBeforeDeletionStart(),
pileupElement.isBeforeInsertion(),pileupElement.getBasesOfImmediatelyFollowingInsertion(),pileupElement.getLengthOfImmediatelyFollowingIndel(), allele.toString(), refAllele.toString());
//pileupElement.
// if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch
if (allele.isReference()) {
// for a ref allele, any base mismatch or new indel is a mismatch.
if(allele.getBases().length>0)
// todo - can't check vs. allele because allele is not padded so it doesn't include the reference base at this location
// could clean up/simplify this when unpadding is removed
return (pileupElement.getBase() == refBase && !pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart());
else
// either null allele to compare, or ref/alt lengths are different (indel by definition).
// if we have an indel that we are comparing against a REF allele, any indel presence (of any length/content) is a mismatch
return (!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart());
}
// for non-ref alleles to compare:
if (refAllele.getBases().length == allele.getBases().length)
// alleles have the same length (eg snp or mnp)
return pileupElement.getBase() == allele.getBases()[0];
// for non-ref alleles,
byte[] alleleBases = allele.getBases();
int eventLength = alleleBases.length - refAllele.getBases().length;
if (eventLength < 0 && pileupElement.isBeforeDeletionStart() && pileupElement.getLengthOfImmediatelyFollowingIndel() == -eventLength)
return true;
if (eventLength > 0 && pileupElement.isBeforeInsertion() &&
Arrays.equals(pileupElement.getBasesOfImmediatelyFollowingInsertion().getBytes(),Arrays.copyOfRange(alleleBases,1,alleleBases.length))) // allele contains ref byte, but pileupElement's event bases doesn't
return true;
return false;
}
/**
* What's the log-likelihood that a site's quality is equal to q? If we see N observations and n mismatches,
* and assuming each match is independent of each other and that the match probability is just dependent of
* the site quality, so p = 10.^-q/10.
* Since we'll normally have relatively high Q sites and deep coverage in reference samples (ie p small, N high),
* to avoid underflows we'll use the Poisson approximation with lambda = N*p.
* Hence, the log-likelihood of q i.e. Pr(Nmismatches = n | SiteQ = q) ~ Poisson(n | lambda = p*N) with p as above.
* @param q Desired q to get likelihood from
* @param coverage Total coverage
* @param mismatches Number of mismatches
* @return Likelihood of observations as a function of q
*/
@Requires({
"q >= minQualityScore",
"q <= maxQualityScore",
"coverage >= 0",
"mismatches >= 0",
"mismatches <= coverage"
})
private double log10PoissonProbabilitySiteGivenQual(byte q, int coverage, int mismatches) {
// same as log10ProbabilitySiteGivenQual but with Poisson approximation to avoid numerical underflows
double lambda = QualityUtils.qualToErrorProb(q) * (double )coverage;
// log10(e^-lambda*lambda^k/k!) = -lambda + k*log10(lambda) - log10factorial(k)
return Math.log10(lambda)*mismatches - lambda*log10MinusE- MathUtils.log10Factorial(mismatches);
}
@Requires({"qual-minQualityScore <= maxQualityScore"})
public double getSiteLogErrorProbabilityGivenQual (int qual) {
return probabilityVector.getLogProbabilityForIndex(qual);
}
public byte getMaxQualityScore() {
return maxQualityScore;
}
public byte getMinQualityScore() {
return minQualityScore;
}
public int getMinSignificantQualityScore() {
return new ProbabilityVector(probabilityVector,true).getMinVal();
}
public int getMaxSignificantQualityScore() {
return new ProbabilityVector(probabilityVector,true).getMaxVal();
}
public int getReferenceDepth() {
return refDepth;
}
public boolean hasData() {
return hasData;
}
public ProbabilityVector getErrorModelVector() {
return probabilityVector;
}
public String toString() {
StringBuilder result = new StringBuilder("(");
boolean skipComma = true;
for (double v : probabilityVector.getProbabilityVector()) {
if (skipComma) {
skipComma = false;
}
else {
result.append(",");
}
result.append(String.format("%.4f", v));
}
result.append(")");
return result.toString();
}
public static int getTotalReferenceDepth(HashMap<String, ErrorModel> perLaneErrorModels) {
int n=0;
for (ErrorModel e : perLaneErrorModels.values()) {
n += e.getReferenceDepth();
}
return n;
}
/*
@Requires({"maxAlleleCount >= 0"})
//todo -- memoize this function
public boolean hasPowerForMaxAC (int maxAlleleCount) {
int siteQ = (int) Math.ceil(MathUtils.probabilityToPhredScale((double) 1/maxAlleleCount));
double log10CumSum = getCumulativeSum(siteQ);
return log10CumSum < log10minPower;
} */
}

View File

@ -0,0 +1,269 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.variant.variantcontext.Allele;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: delangel
* Date: 5/18/12
* Time: 10:06 AM
* To change this template use File | Settings | File Templates.
*/
public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotypeLikelihoods {
final PairHMMIndelErrorModel pairModel;
final LinkedHashMap<Allele, Haplotype> haplotypeMap;
final ReferenceContext refContext;
final int eventLength;
double[][] readHaplotypeLikelihoods;
final byte refBase;
final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap;
public GeneralPloidyIndelGenotypeLikelihoods(final List<Allele> alleles,
final double[] logLikelihoods,
final int ploidy,
final HashMap<String, ErrorModel> perLaneErrorModels,
final boolean ignoreLaneInformation,
final PairHMMIndelErrorModel pairModel,
final LinkedHashMap<Allele, Haplotype> haplotypeMap,
final ReferenceContext referenceContext,
final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) {
super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation);
this.pairModel = pairModel;
this.haplotypeMap = haplotypeMap;
this.refContext = referenceContext;
this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles);
// todo - not needed if indel alleles have base at current position
this.refBase = referenceContext.getBase();
this.perReadAlleleLikelihoodMap = perReadAlleleLikelihoodMap;
}
// -------------------------------------------------------------------------------------
//
// add() routines. These are the workhorse routines for calculating the overall genotype
// likelihoods given observed bases and reads. Includes high-level operators all the
// way down to single base and qual functions.
//
// -------------------------------------------------------------------------------------
/**
* Updates likelihoods and posteriors to reflect the additional observations contained within the
* read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
* pileup
*
* @param pileup read pileup
* @param UAC the minimum base quality at which to consider a base valid
* @return the number of good bases found in the pileup
*/
public int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC) {
int n = 0;
if (!hasReferenceSampleData) {
// no error models
return add(pileup, (ErrorModel)null);
}
for (String laneID : perLaneErrorModels.keySet() ) {
// get pileup for this lane
ReadBackedPileup perLanePileup;
if (ignoreLaneInformation)
perLanePileup = pileup;
else
perLanePileup = pileup.getPileupForLane(laneID);
if (perLanePileup == null || perLanePileup.isEmpty())
continue;
ErrorModel errorModel = perLaneErrorModels.get(laneID);
n += add(perLanePileup, errorModel);
if (ignoreLaneInformation)
break;
}
return n;
}
/**
* Calculates the pool's probability for all possible allele counts for all indel alleles observed.
* Calculation is based on the error model
* generated by the reference sample on the same lane. The probability is given by :
*
* Pr(ac = j1,j2,.. | pool, errorModel) = sum_over_all_Qs ( Pr(j1,j2,.. * Pr(errorModel_q) *
* Pr(ac=j1,j2,..| pool, errorModel) = sum_over_all_Qs ( Pr(ac=j1,j2,..) * Pr(errorModel_q) *
* [j1 * (1-eq)/2n + eq/3*(2*N-j1)
* [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC *
* jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT
*
* log Pr(ac=jA,jC,jG,jT| pool, errorModel) = logsum( Pr(ac=jA,jC,jG,jT) * Pr(errorModel_q) *
* [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC *
* jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT)
* = logsum(logPr(ac=jA,jC,jG,jT) + log(Pr(error_Model(q)
* )) + nA*log(jA/2N(1-eq)+eq/3*(2N-jA)/2N) + nC*log(jC/2N(1-eq)+eq/3*(2N-jC)/2N)
* + log(jG/2N(1-eq)+eq/3*(2N-jG)/2N) + log(jT/2N(1-eq)+eq/3*(2N-jT)/2N)
*
* Let Q(j,k) = log(j/2N*(1-e[k]) + (2N-j)/2N*e[k]/3)
*
* Then logPr(ac=jA,jC,jG,jT|D,errorModel) = logPR(ac=Ja,jC,jG,jT) + logsum_k( logPr (errorModel[k],
* nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
*
* If pileup data comes from several error models (because lanes can have different error models),
* Pr(Ac=j|D,E1,E2) = sum(Pr(AC1=j1|D,E1,E2) * Pr(AC2=j-j2|D,E1,E2))
* = sum(Pr(AC1=j1|D,E1)*Pr(AC2=j-j1|D,E2)) from j=0..2N
*
* So, for each lane, build error model and combine lanes.
* To store model, can do
* for jA=0:2N
* for jC = 0:2N-jA
* for jG = 0:2N-jA-jC
* for jT = 0:2N-jA-jC-jG
* Q(jA,jC,jG,jT)
* for k = minSiteQual:maxSiteQual
* likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
*
*
*
* where: nA,nC,nG,nT = counts of bases observed in pileup.
*
*
* @param pileup Base pileup
* @param errorModel Site error model
* @return Number of bases added
*/
private int add(ReadBackedPileup pileup, ErrorModel errorModel) {
int n=0;
// Number of alleless in pileup, in that order
List<Integer> numSeenBases = new ArrayList<Integer>(this.alleles.size());
if (!hasReferenceSampleData) {
readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, perReadAlleleLikelihoodMap);
n = readHaplotypeLikelihoods.length;
} else {
Allele refAllele = null;
for (Allele a:alleles) {
numSeenBases.add(0);
if (a.isReference())
refAllele = a;
}
if (refAllele == null)
throw new ReviewedStingException("BUG: no ref alleles in passed in allele list!");
// count number of elements in pileup
for (PileupElement elt : pileup) {
if (VERBOSE)
System.out.format("base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d\n",elt.getBase(), elt.isBeforeDeletionStart(),elt.isBeforeInsertion(),elt.getBasesOfImmediatelyFollowingInsertion(),elt.getLengthOfImmediatelyFollowingIndel());
int idx =0;
for (Allele allele : alleles) {
int cnt = numSeenBases.get(idx);
numSeenBases.set(idx++,cnt + (ErrorModel.pileupElementMatches(elt, allele, refAllele, refBase)?1:0));
}
n++;
}
}
computeLikelihoods(errorModel, alleles, numSeenBases, pileup);
return n;
}
/**
* Compute likelihood of current conformation
*
* @param ACset Count to compute
* @param errorModel Site-specific error model object
* @param alleleList List of alleles
* @param numObservations Number of observations for each allele in alleleList
*/
public void getLikelihoodOfConformation(final ExactACset ACset,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,
final ReadBackedPileup pileup) {
final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), alleleList.size());
double p1 = 0.0;
if (!hasReferenceSampleData) {
// no error model: use pair HMM likelihoods
for (int i=0; i < readHaplotypeLikelihoods.length; i++) {
double acc[] = new double[alleleList.size()];
for (int k=0; k < acc.length; k++ )
acc[k] = readHaplotypeLikelihoods[i][k] + MathUtils.log10Cache[currentCnt[k]]-LOG10_PLOIDY;
p1 += MathUtils.log10sumLog10(acc);
}
} else {
final int minQ = errorModel.getMinSignificantQualityScore();
final int maxQ = errorModel.getMaxSignificantQualityScore();
final double[] acVec = new double[maxQ - minQ + 1];
for (int k=minQ; k<=maxQ; k++) {
int idx=0;
for (int n : numObservations)
acVec[k-minQ] += n*logMismatchProbabilityArray[currentCnt[idx++]][k];
}
p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec);
}
ACset.getLog10Likelihoods()[0] = p1;
}
}

View File

@ -0,0 +1,141 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.util.List;
import java.util.Map;
/**
* The model representing how we calculate genotype likelihoods
*/
public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
public static final String DUMMY_LANE = "Lane1";
public static final String DUMMY_SAMPLE_NAME = "DummySample1";
/* public enum Model {
SNP,
INDEL,
BOTH
}
*/
public enum Model {
SNP,
INDEL,
GENERALPLOIDYSNP,
GENERALPLOIDYINDEL,
BOTH
}
public enum GENOTYPING_MODE {
/** the Unified Genotyper will choose the most likely alternate allele */
DISCOVERY,
/** only the alleles passed in from a VCF rod bound to the -alleles argument will be used for genotyping */
GENOTYPE_GIVEN_ALLELES
}
protected final UnifiedArgumentCollection UAC;
protected Logger logger;
/**
* Create a new object
* @param logger logger
* @param UAC unified arg collection
*/
protected GenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
if ( logger == null || UAC == null ) throw new ReviewedStingException("Bad arguments");
this.UAC = UAC;
this.logger = logger;
}
/**
* Can be overridden by concrete subclasses
*
* @param tracker rod data
* @param ref reference context
* @param contexts stratified alignment contexts
* @param contextType stratified context type
* @param allAllelesToUse the alternate allele to use, null if not set
* @param useBAQedPileup should we use the BAQed pileup or the raw one?
* @param locParser Genome Loc Parser
* @return variant context where genotypes are no-called but with GLs
*/
public abstract VariantContext getLikelihoods(final RefMetaDataTracker tracker,
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final List<Allele> allAllelesToUse,
final boolean useBAQedPileup,
final GenomeLocParser locParser,
final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap);
protected int getFilteredDepth(ReadBackedPileup pileup) {
int count = 0;
for ( PileupElement p : pileup ) {
if ( BaseUtils.isRegularBase( p.getBase() ) )
count++;
}
return count;
}
}

View File

@ -0,0 +1,262 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.variant.variantcontext.*;
import java.util.*;
public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
private static final int HAPLOTYPE_SIZE = 80;
private boolean DEBUG = false;
private boolean ignoreSNPAllelesWhenGenotypingIndels = false;
private PairHMMIndelErrorModel pairModel;
private LinkedHashMap<Allele, Haplotype> haplotypeMap;
private List<Allele> alleleList = new ArrayList<Allele>();
protected IndelGenotypeLikelihoodsCalculationModel(final UnifiedArgumentCollection UAC,
final Logger logger) {
super(UAC, logger);
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
}
protected static List<Allele> computeConsensusAlleles(final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final UnifiedArgumentCollection UAC) {
ConsensusAlleleCounter counter = new ConsensusAlleleCounter(true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE);
return counter.computeConsensusAlleles(ref, contexts, contextType);
}
private final static EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED);
public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final List<Allele> allAllelesToUse,
final boolean useBAQedPileup,
final GenomeLocParser locParser,
final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
GenomeLoc loc = ref.getLocus();
// if (!ref.getLocus().equals(lastSiteVisited)) {
if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) {
// starting a new site: clear allele list
haplotypeMap.clear();
perReadAlleleLikelihoodMap.clear(); // clean mapping sample-> per read, per allele likelihoods
alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, UAC, ignoreSNPAllelesWhenGenotypingIndels);
if (alleleList.isEmpty())
return null;
}
getHaplotypeMapFromAlleles(alleleList, ref, loc, haplotypeMap); // will update haplotypeMap adding elements
if (haplotypeMap == null || haplotypeMap.isEmpty())
return null;
// start making the VariantContext
// For all non-snp VC types, VC end location is just startLocation + length of ref allele including padding base.
final int endLoc = loc.getStart() + alleleList.get(0).length() - 1;
final int eventLength = getEventLength(alleleList);
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleleList);
// create the genotypes; no-call everyone for now
GenotypesContext genotypes = GenotypesContext.create();
final List<Allele> noCall = new ArrayList<Allele>();
noCall.add(Allele.NO_CALL);
// For each sample, get genotype likelihoods based on pileup
// compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them.
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
if (!perReadAlleleLikelihoodMap.containsKey(sample.getKey())){
// no likelihoods have been computed for this sample at this site
perReadAlleleLikelihoodMap.put(sample.getKey(), new PerReadAlleleLikelihoodMap());
}
final ReadBackedPileup pileup = context.getBasePileup();
if (pileup != null) {
final GenotypeBuilder b = new GenotypeBuilder(sample.getKey());
final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey()));
b.PL(genotypeLikelihoods);
b.DP(getFilteredDepth(pileup));
genotypes.add(b.make());
if (DEBUG) {
System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString());
for (int k = 0; k < genotypeLikelihoods.length; k++)
System.out.format("%1.4f ", genotypeLikelihoods[k]);
System.out.println();
}
}
}
return builder.genotypes(genotypes).make();
}
public static void getHaplotypeMapFromAlleles(final List<Allele> alleleList,
final ReferenceContext ref,
final GenomeLoc loc,
final LinkedHashMap<Allele, Haplotype> haplotypeMap) {
// protect against having an indel too close to the edge of a contig
if (loc.getStart() <= HAPLOTYPE_SIZE)
haplotypeMap.clear();
// check if there is enough reference window to create haplotypes (can be an issue at end of contigs)
else if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE)
haplotypeMap.clear();
else if (alleleList.isEmpty())
haplotypeMap.clear();
else {
final int eventLength = getEventLength(alleleList);
final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1;
final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1;
if (hsize <= 0) // protect against event lengths larger than ref window sizes
haplotypeMap.clear();
else
haplotypeMap.putAll(Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(),
ref, hsize, numPrefBases));
}
}
public static int getEventLength(List<Allele> alleleList) {
Allele refAllele = alleleList.get(0);
Allele altAllele = alleleList.get(1);
// look for alt allele that has biggest length distance to ref allele
int maxLenDiff = 0;
for (Allele a : alleleList) {
if (a.isNonReference()) {
int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length());
if (lenDiff > maxLenDiff) {
maxLenDiff = lenDiff;
altAllele = a;
}
}
}
return altAllele.getBaseString().length() - refAllele.getBaseString().length();
}
public static List<Allele> getInitialAlleleList(final RefMetaDataTracker tracker,
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final UnifiedArgumentCollection UAC,
final boolean ignoreSNPAllelesWhenGenotypingIndels) {
List<Allele> alleles = new ArrayList<Allele>();
if (UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
VariantContext vc = null;
for (final VariantContext vc_input : tracker.getValues(UAC.alleles, ref.getLocus())) {
if (vc_input != null &&
allowableTypes.contains(vc_input.getType()) &&
ref.getLocus().getStart() == vc_input.getStart()) {
vc = vc_input;
break;
}
}
// ignore places where we don't have a variant
if (vc == null)
return alleles;
if (ignoreSNPAllelesWhenGenotypingIndels) {
// if there's an allele that has same length as the reference (i.e. a SNP or MNP), ignore it and don't genotype it
for (Allele a : vc.getAlleles())
if (a.isNonReference() && a.getBases().length == vc.getReference().getBases().length)
continue;
else
alleles.add(a);
} else {
alleles.addAll(vc.getAlleles());
}
} else {
alleles = computeConsensusAlleles(ref, contexts, contextType, UAC);
}
return alleles;
}
// Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup,
// so that per-sample DP will include deletions covering the event.
protected int getFilteredDepth(ReadBackedPileup pileup) {
int count = 0;
for (PileupElement p : pileup) {
if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase()))
count++;
}
return count;
}
}

View File

@ -0,0 +1,844 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.variant.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.variant.variantcontext.*;
import java.io.PrintStream;
import java.lang.reflect.Constructor;
import java.util.*;
public class UnifiedGenotyperEngine {
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
private static final String GPSTRING = "GENERALPLOIDY";
public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA";
public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL";
public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3;
public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4;
private static final int SNP_MODEL = 0;
private static final int INDEL_MODEL = 1;
public enum OUTPUT_MODE {
/** produces calls only at variant sites */
EMIT_VARIANTS_ONLY,
/** produces calls at variant sites and confident reference sites */
EMIT_ALL_CONFIDENT_SITES,
/** produces calls at any callable site regardless of confidence; this argument is intended only for point
* mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by
* no means produce a comprehensive set of indels in DISCOVERY mode */
EMIT_ALL_SITES
}
// the unified argument collection
private final UnifiedArgumentCollection UAC;
public UnifiedArgumentCollection getUAC() { return UAC; }
// the annotation engine
private final VariantAnnotatorEngine annotationEngine;
// the model used for calculating genotypes
private ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>> glcm = new ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>>();
private final List<GenotypeLikelihoodsCalculationModel.Model> modelsToUse = new ArrayList<GenotypeLikelihoodsCalculationModel.Model>(2);
// the model used for calculating p(non-ref)
private ThreadLocal<AFCalc> afcm = new ThreadLocal<AFCalc>();
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
private final double[] log10AlleleFrequencyPriorsSNPs;
private final double[] log10AlleleFrequencyPriorsIndels;
// samples in input
private final Set<String> samples;
// the various loggers and writers
private final Logger logger;
private final PrintStream verboseWriter;
// number of chromosomes (ploidy * samples) in input
private final int ploidy;
private final int N;
// the standard filter to use for calls below the confidence threshold but above the emit threshold
private static final Set<String> filter = new HashSet<String>(1);
private final GenomeLocParser genomeLocParser;
private final boolean BAQEnabledOnCMDLine;
// ---------------------------------------------------------------------------------------------------------
//
// Public interface functions
//
// ---------------------------------------------------------------------------------------------------------
@Requires({"toolkit != null", "UAC != null"})
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), GATKVariantContextUtils.DEFAULT_PLOIDY);
}
protected UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, Set<String> samples, UnifiedArgumentCollection UAC) {
this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);
}
@Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0","ploidy>0"})
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set<String> samples, int ploidy) {
this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF;
genomeLocParser = toolkit.getGenomeLocParser();
this.samples = new TreeSet<String>(samples);
// note that, because we cap the base quality by the mapping quality, minMQ cannot be less than minBQ
this.UAC = UAC;
this.logger = logger;
this.verboseWriter = verboseWriter;
this.annotationEngine = engine;
this.ploidy = ploidy;
this.N = samples.size() * ploidy;
log10AlleleFrequencyPriorsSNPs = new double[N+1];
log10AlleleFrequencyPriorsIndels = new double[N+1];
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity,UAC.inputPrior);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.inputPrior);
filter.add(LOW_QUAL_FILTER_NAME);
determineGLModelsToUse();
// do argument checking
if (UAC.annotateAllSitesWithPLs) {
if (!modelsToUse.contains(GenotypeLikelihoodsCalculationModel.Model.SNP))
throw new IllegalArgumentException("Invalid genotype likelihood model specification: Only diploid SNP model can be used in conjunction with option allSitePLs");
}
}
/**
* @see #calculateLikelihoodsAndGenotypes(org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker, org.broadinstitute.sting.gatk.contexts.ReferenceContext, org.broadinstitute.sting.gatk.contexts.AlignmentContext, java.util.Set)
*
* same as the full call but with allSamples == null
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @return the VariantCallContext object
*/
public List<VariantCallContext> calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker,
final ReferenceContext refContext,
final AlignmentContext rawContext) {
return calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, null);
}
/**
* Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper.
*
* If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype
* for every sample in allSamples. If it's null there's no such guarentee. Providing this
* argument is critical when the resulting calls will be written to a VCF file.
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @param allSamples set of all sample names that we might call (i.e., those in the VCF header)
* @return the VariantCallContext object
*/
public List<VariantCallContext> calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker,
final ReferenceContext refContext,
final AlignmentContext rawContext,
final Set<String> allSamples) {
final List<VariantCallContext> results = new ArrayList<VariantCallContext>(2);
final List<GenotypeLikelihoodsCalculationModel.Model> models = getGLModelsToUse(tracker, refContext, rawContext);
final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap = new HashMap<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap>();
if ( models.isEmpty() ) {
results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, null, rawContext) : null);
}
else {
for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) {
perReadAlleleLikelihoodMap.clear();
final Map<String, AlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model);
if ( stratifiedContexts == null ) {
results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, null, rawContext) : null);
}
else {
final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap);
if ( vc != null )
results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap));
// todo - uncomment if we want to also emit a null ref call (with no QUAL) if there's no evidence for REF and if EMIT_ALL_SITES is set
// else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES)
// results.add(generateEmptyContext(tracker, refContext, null, rawContext));
}
}
}
return results;
}
/**
* Compute GLs at a given locus. Entry point for engine calls from UGCalcLikelihoods.
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @param perReadAlleleLikelihoodMap Map to store per-sample, per-read, per-allele likelihoods (only used for indels)
* @return the VariantContext object
*/
public VariantContext calculateLikelihoods(final RefMetaDataTracker tracker,
final ReferenceContext refContext,
final AlignmentContext rawContext,
final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
final List<GenotypeLikelihoodsCalculationModel.Model> models = getGLModelsToUse(tracker, refContext, rawContext);
if ( models.isEmpty() ) {
return null;
}
for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) {
final Map<String, AlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model);
// return the first valid one we encounter
if ( stratifiedContexts != null )
return calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap);
}
return null;
}
/**
* Compute genotypes at a given locus. Entry point for engine calls from UGCallVariants.
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @param vc the GL-annotated variant context
* @return the VariantCallContext object
*/
public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker,
final ReferenceContext refContext,
final AlignmentContext rawContext,
final VariantContext vc) {
final List<GenotypeLikelihoodsCalculationModel.Model> models = getGLModelsToUse(tracker, refContext, rawContext);
if ( models.isEmpty() ) {
return null;
}
// return the first one
final GenotypeLikelihoodsCalculationModel.Model model = models.get(0);
final Map<String, AlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model);
return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, null);
}
/**
* Compute genotypes at a given locus.
*
* @param vc the GL-annotated variant context
* @return the VariantCallContext object
*/
public VariantCallContext calculateGenotypes(VariantContext vc) {
return calculateGenotypes(null, null, null, null, vc, GenotypeLikelihoodsCalculationModel.Model.valueOf("SNP"), null);
}
// ---------------------------------------------------------------------------------------------------------
//
// Private implementation helpers
//
// ---------------------------------------------------------------------------------------------------------
// private method called by both UnifiedGenotyper and UGCalcLikelihoods entry points into the engine
private VariantContext calculateLikelihoods(final RefMetaDataTracker tracker,
final ReferenceContext refContext,
final Map<String, AlignmentContext> stratifiedContexts,
final AlignmentContextUtils.ReadOrientation type,
final List<Allele> alternateAllelesToUse,
final boolean useBAQedPileup,
final GenotypeLikelihoodsCalculationModel.Model model,
final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
// initialize the data for this thread if that hasn't been done yet
if ( glcm.get() == null ) {
glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
}
return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap);
}
private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, AlignmentContext rawContext) {
VariantContext vc;
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles);
if ( vcInput == null )
return null;
vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles()).make();
} else {
// deal with bad/non-standard reference bases
if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) )
return null;
Set<Allele> alleles = new HashSet<Allele>();
alleles.add(Allele.create(ref.getBase(), true));
vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make();
}
if ( annotationEngine != null ) {
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
final ReadBackedPileup pileup = rawContext.getBasePileup();
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc);
}
return new VariantCallContext(vc, false);
}
public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
return calculateGenotypes(null, null, null, null, vc, model, perReadAlleleLikelihoodMap);
}
public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) {
return calculateGenotypes(null, null, null, null, vc, model, null);
}
public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker,
final ReferenceContext refContext,
final AlignmentContext rawContext,
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,
final GenotypeLikelihoodsCalculationModel.Model model,
final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false, perReadAlleleLikelihoodMap);
}
/**
* Main entry function to calculate genotypes of a given VC with corresponding GL's
* @param tracker Tracker
* @param refContext Reference context
* @param rawContext Raw context
* @param stratifiedContexts Stratified alignment contexts
* @param vc Input VC
* @param model GL calculation model
* @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc
* @return VC with assigned genotypes
*/
public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, final ReferenceContext refContext,
final AlignmentContext rawContext, Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model,
final boolean inheritAttributesFromInputVC,
final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null;
// TODO TODO TODO TODO
// REFACTOR THIS FUNCTION, TOO UNWIELDY!!
// initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) {
afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger));
}
// if input VC can't be genotyped, exit with either null VCC or, in case where we need to emit all sites, an empty call
if (!canVCbeGenotyped(vc)) {
if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && !limitedContext)
return generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext);
else
return null;
}
// estimate our confidence in a reference call and return
if ( vc.getNSamples() == 0 ) {
if ( limitedContext )
return null;
return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ?
estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), false, 1.0) :
generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
}
final AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model));
// is the most likely frequency conformation AC=0 for all alternate alleles?
boolean bestGuessIsRef = true;
// determine which alternate alleles have AF>0
final List<Allele> myAlleles = new ArrayList<>(vc.getAlleles().size());
final List<Integer> alleleCountsofMLE = new ArrayList<>(vc.getAlleles().size());
myAlleles.add(vc.getReference());
for ( int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++ ) {
final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i);
if ( alternateAllele.isReference() )
continue;
// Compute if the site is considered polymorphic with sufficient confidence relative to our
// phred-scaled emission QUAL
final boolean isNonRef = AFresult.isPolymorphicPhredScaledQual(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
final boolean toInclude = isNonRef || alternateAllele == GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE ||
UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ||
UAC.annotateAllSitesWithPLs;
bestGuessIsRef &= !isNonRef;
if (toInclude) {
myAlleles.add(alternateAllele);
alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
}
}
final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0());
// note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice
final double phredScaledConfidence =
Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || UAC.annotateAllSitesWithPLs
? -10 * AFresult.getLog10PosteriorOfAFEq0()
: -10 * AFresult.getLog10PosteriorOfAFGT0());
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) {
// technically, at this point our confidence in a reference call isn't accurately estimated
// because it didn't take into account samples with no data, so let's get a better estimate
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0);
}
// start constructing the resulting VC
final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc);
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles);
builder.log10PError(phredScaledConfidence/-10.0);
if ( ! passesCallThreshold(phredScaledConfidence) )
builder.filters(filter);
// create the genotypes
final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true,ploidy);
builder.genotypes(genotypes);
// print out stats if we have a writer
if ( verboseWriter != null && !limitedContext )
printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model);
// *** note that calculating strand bias involves overwriting data structures, so we do that last
final HashMap<String, Object> attributes = new HashMap<String, Object>();
// inherit attributed from input vc if requested
if (inheritAttributesFromInputVC)
attributes.putAll(vc.getAttributes());
// if the site was downsampled, record that fact
if ( !limitedContext && rawContext.hasPileupBeenDownsampled() )
attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);
if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED )
attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size());
// add the MLE AC and AF annotations
if ( alleleCountsofMLE.size() > 0 ) {
attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE);
final int AN = builder.make().getCalledChrCount();
final ArrayList<Double> MLEfrequencies = new ArrayList<Double>(alleleCountsofMLE.size());
// the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT is ./. but the exact model may arbitrarily choose an AC>1)
for ( int AC : alleleCountsofMLE )
MLEfrequencies.add(Math.min(1.0, (double)AC / (double)AN));
attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies);
}
if ( UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) {
//final boolean DEBUG_SLOD = false;
// the overall lod
//double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0];
final double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
final List<Allele> allAllelesToUse = builder.make().getAlleles();
// the forward lod
final VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
final AFCalcResult forwardAFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model));
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
final double forwardLog10PofNull = forwardAFresult.getLog10LikelihoodOfAFEq0();
final double forwardLog10PofF = forwardAFresult.getLog10LikelihoodOfAFGT0();
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod
final VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
final AFCalcResult reverseAFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model));
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
final double reverseLog10PofNull = reverseAFresult.getLog10LikelihoodOfAFEq0();
final double reverseLog10PofF = reverseAFresult.getLog10LikelihoodOfAFGT0();
//if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
final double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
final double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF;
//if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
// strand score is max bias between forward and reverse strands
double strandScore = Math.max(forwardLod, reverseLod);
// rescale by a factor of 10
strandScore *= 10.0;
//logger.debug(String.format("SLOD=%f", strandScore));
if ( !Double.isNaN(strandScore) )
attributes.put("SB", strandScore);
}
// finish constructing the resulting VC
builder.attributes(attributes);
VariantContext vcCall = builder.make();
if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
final ReadBackedPileup pileup = rawContext.getBasePileup();
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap);
}
// if we are subsetting alleles (either because there were too many or because some were not polymorphic)
// then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync
vcCall = GATKVariantContextUtils.reverseTrimAlleles(vcCall);
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
}
/**
* Determine whether input VC to calculateGenotypes() can be genotyped and AF can be computed.
* @param vc Input VC
* @return Status check
*/
@Requires("vc != null")
protected boolean canVCbeGenotyped(final VariantContext vc) {
// protect against too many alternate alleles that we can't even run AF on:
if (vc.getNAlleles()> GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) {
logger.warn("Attempting to genotype more than "+GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED +
" alleles. Site will be skipped at location "+vc.getChr()+":"+vc.getStart());
return false;
}
else return true;
}
private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) {
if ( !BaseUtils.isRegularBase(refContext.getBase()) )
return null;
Map<String, AlignmentContext> stratifiedContexts = null;
if ( model.name().contains("INDEL") ) {
final ReadBackedPileup pileup = rawContext.getBasePileup().getMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE);
// don't call when there is no coverage
if ( pileup.getNumberOfElements() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES )
return null;
// stratify the AlignmentContext and cut by sample
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
} else if ( model.name().contains("SNP") ) {
// stratify the AlignmentContext and cut by sample
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup());
if ( !(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ) {
int numDeletions = 0;
for ( final PileupElement p : rawContext.getBasePileup() ) {
if ( p.isDeletion() )
numDeletions++;
}
if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().depthOfCoverage()) > UAC.MAX_DELETION_FRACTION ) {
return null;
}
}
}
return stratifiedContexts;
}
private final double getRefBinomialProbLog10(final int depth) {
return MathUtils.log10BinomialProbability(depth, 0);
}
private VariantCallContext estimateReferenceConfidence(VariantContext vc, Map<String, AlignmentContext> contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) {
if ( contexts == null )
return null;
double log10POfRef = Math.log10(initialPofRef);
// for each sample that we haven't examined yet
for ( String sample : samples ) {
final AlignmentContext context = contexts.get(sample);
if ( ignoreCoveredSamples && context != null )
continue;
final int depth = context == null ? 0 : context.getBasePileup().depthOfCoverage();
log10POfRef += estimateLog10ReferenceConfidenceForOneSample(depth, theta);
}
return new VariantCallContext(vc, QualityUtils.phredScaleLog10CorrectRate(log10POfRef) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false);
}
/**
* Compute the log10 probability of a sample with sequencing depth and no alt allele is actually truly homozygous reference
*
* Assumes the sample is diploid
*
* @param depth the depth of the sample
* @param theta the heterozygosity of this species (between 0 and 1)
* @return a valid log10 probability of the sample being hom-ref
*/
@Requires({"depth >= 0", "theta >= 0.0 && theta <= 1.0"})
@Ensures("MathUtils.goodLog10Probability(result)")
protected double estimateLog10ReferenceConfidenceForOneSample(final int depth, final double theta) {
final double log10PofNonRef = Math.log10(theta / 2.0) + getRefBinomialProbLog10(depth);
return MathUtils.log10OneMinusX(Math.pow(10.0, log10PofNonRef));
}
protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, final GenotypeLikelihoodsCalculationModel.Model model) {
Allele refAllele = null, altAllele = null;
for ( Allele allele : vc.getAlleles() ) {
if ( allele.isReference() )
refAllele = allele;
else
altAllele = allele;
}
for (int i = 0; i <= N; i++) {
StringBuilder AFline = new StringBuilder("AFINFO\t");
AFline.append(pos);
AFline.append("\t");
AFline.append(refAllele);
AFline.append("\t");
if ( altAllele != null )
AFline.append(altAllele);
else
AFline.append("N/A");
AFline.append("\t");
AFline.append(i + "/" + N + "\t");
AFline.append(String.format("%.2f\t", ((float)i)/N));
AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i]));
verboseWriter.println(AFline.toString());
}
verboseWriter.println("P(f>0) = " + PofF);
verboseWriter.println("Qscore = " + phredScaledConfidence);
verboseWriter.println();
}
protected boolean passesEmitThreshold(double conf, boolean bestGuessIsRef) {
return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_CONFIDENT_SITES || !bestGuessIsRef) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
}
protected boolean passesCallThreshold(double conf) {
return conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING;
}
protected boolean confidentlyCalled(double conf, double PofF) {
return conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING ||
(UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && QualityUtils.phredScaleErrorRate(PofF) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING);
}
private void determineGLModelsToUse() {
String modelPrefix = "";
if ( !UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY )
modelPrefix = GPSTRING;
// GGA mode => must initialize both the SNP and indel models
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ||
UAC.GLmodel.name().toUpperCase().contains("BOTH") ) {
modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP"));
modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL"));
}
else {
modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+UAC.GLmodel.name().toUpperCase()));
}
}
// decide whether we are currently processing SNPs, indels, neither, or both
private List<GenotypeLikelihoodsCalculationModel.Model> getGLModelsToUse(final RefMetaDataTracker tracker,
final ReferenceContext refContext,
final AlignmentContext rawContext) {
if ( UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES )
return modelsToUse;
if ( modelsToUse.size() != 2 )
throw new IllegalStateException("GGA mode assumes that we have initialized both the SNP and indel models but found " + modelsToUse);
// if we're genotyping given alleles then we need to choose the model corresponding to the variant type requested
final VariantContext vcInput = getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles);
if ( vcInput == null ) {
return Collections.emptyList(); // no work to be done
} else if ( vcInput.isSNP() ) {
return Collections.singletonList(modelsToUse.get(SNP_MODEL));
} else if ( vcInput.isIndel() || vcInput.isMixed() ) {
return Collections.singletonList(modelsToUse.get(INDEL_MODEL));
} else {
return Collections.emptyList(); // No support for other types yet
}
}
/**
* Function that fills vector with allele frequency priors. By default, infinite-sites, neutral variation prior is used,
* where Pr(AC=i) = theta/i where theta is heterozygosity
* @param N Number of chromosomes
* @param priors (output) array to be filled with priors
* @param heterozygosity default heterozygosity to use, if inputPriors is empty
* @param inputPriors Input priors to use (in which case heterozygosity is ignored)
*/
public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List<Double> inputPriors) {
double sum = 0.0;
if (!inputPriors.isEmpty()) {
// user-specified priors
if (inputPriors.size() != N)
throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 ");
int idx = 1;
for (final double prior: inputPriors) {
if (prior < 0.0)
throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior");
priors[idx++] = Math.log10(prior);
sum += prior;
}
}
else {
// for each i
for (int i = 1; i <= N; i++) {
final double value = heterozygosity / (double)i;
priors[i] = Math.log10(value);
sum += value;
}
}
// protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions)
if (sum > 1.0) {
throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors.");
}
// null frequency for AF=0 is (1 - sum(all other frequencies))
priors[0] = Math.log10(1.0 - sum);
}
protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) {
if (model.name().toUpperCase().contains("SNP"))
return log10AlleleFrequencyPriorsSNPs;
else if (model.name().toUpperCase().contains("INDEL"))
return log10AlleleFrequencyPriorsIndels;
else
throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model);
}
protected double getTheta( final GenotypeLikelihoodsCalculationModel.Model model ) {
if( model.name().contains("SNP") )
return HUMAN_SNP_HETEROZYGOSITY;
if( model.name().contains("INDEL") )
return HUMAN_INDEL_HETEROZYGOSITY;
else throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model);
}
private static Map<String, GenotypeLikelihoodsCalculationModel> getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) {
final Map<String, GenotypeLikelihoodsCalculationModel> glcm = new HashMap<String, GenotypeLikelihoodsCalculationModel>();
final List<Class<? extends GenotypeLikelihoodsCalculationModel>> glmClasses = new PluginManager<GenotypeLikelihoodsCalculationModel>(GenotypeLikelihoodsCalculationModel.class).getPlugins();
for (int i = 0; i < glmClasses.size(); i++) {
final Class<? extends GenotypeLikelihoodsCalculationModel> glmClass = glmClasses.get(i);
final String key = glmClass.getSimpleName().replaceAll("GenotypeLikelihoodsCalculationModel","").toUpperCase();
try {
final Object args[] = new Object[]{UAC,logger};
final Constructor c = glmClass.getDeclaredConstructor(UnifiedArgumentCollection.class, Logger.class);
glcm.put(key, (GenotypeLikelihoodsCalculationModel)c.newInstance(args));
}
catch (Exception e) {
throw new UserException("The likelihoods model provided for the -glm argument (" + UAC.GLmodel + ") is not a valid option: " + e.getMessage());
}
}
return glcm;
}
public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding<VariantContext> allelesBinding) {
if ( tracker == null || ref == null || logger == null )
return null;
VariantContext vc = null;
// search for usable record
for ( final VariantContext vc_input : tracker.getValues(allelesBinding, loc) ) {
if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) {
if ( vc == null ) {
vc = vc_input;
} else {
logger.warn("Multiple valid VCF records detected in the alleles input file at site " + ref.getLocus() + ", only considering the first record");
}
}
}
return vc;
}
}

View File

@ -0,0 +1,333 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.variant.variantcontext.GenotypesContext;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.util.*;
public abstract class DiploidExactAFCalc extends ExactAFCalc {
public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) {
super(nSamples, maxAltAlleles, ploidy);
if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy);
}
@Override
protected AFCalcResult computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors) {
final int numAlternateAlleles = vc.getNAlleles() - 1;
final ArrayList<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), true);
final int numSamples = genotypeLikelihoods.size()-1;
final int numChr = 2*numSamples;
// queue of AC conformations to process
final LinkedList<ExactACset> ACqueue = new LinkedList<>();
// mapping of ExactACset indexes to the objects
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<>(numChr+1);
// add AC=0 to the queue
final int[] zeroCounts = new int[numAlternateAlleles];
ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts));
ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.getACcounts(), zeroSet);
while ( !ACqueue.isEmpty() ) {
getStateTracker().incNEvaluations(); // keep track of the number of evaluations
// compute log10Likelihoods
final ExactACset set = ACqueue.remove();
calculateAlleleCountConformation(set, genotypeLikelihoods, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors);
// clean up memory
indexesToACset.remove(set.getACcounts());
//if ( DEBUG )
// System.out.printf(" *** removing used set=%s%n", set.ACcounts);
}
return getResultFromFinalState(vc, log10AlleleFrequencyPriors);
}
@Override
protected GenotypesContext reduceScopeGenotypes(final VariantContext vc, final List<Allele> allelesToUse) {
return GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL);
}
@Override
protected void reduceScopeCalculateLikelihoodSums(final VariantContext vc, final LikelihoodSum[] likelihoodSums) {
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes(), true);
for ( final double[] likelihoods : GLs ) {
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) {
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL);
final int alleleLikelihoodIndex1 = alleles.alleleIndex1 - 1;
final int alleleLikelihoodIndex2 = alleles.alleleIndex2 - 1;
if ( alleles.alleleIndex1 != 0 )
likelihoodSums[alleleLikelihoodIndex1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
// don't double-count it
if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 )
likelihoodSums[alleleLikelihoodIndex2].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
}
}
}
private static final class DependentSet {
public final int[] ACcounts;
public final int PLindex;
public DependentSet(final int[] ACcounts, final int PLindex) {
this.ACcounts = ACcounts;
this.PLindex = PLindex;
}
}
private double calculateAlleleCountConformation(final ExactACset set,
final ArrayList<double[]> genotypeLikelihoods,
final int numChr,
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final double[] log10AlleleFrequencyPriors) {
//if ( DEBUG )
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
// compute the log10Likelihoods
computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors);
final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];
// can we abort early because the log10Likelihoods are so small?
if ( getStateTracker().abort(log10LofK, set.getACcounts(), true) ) {
//if ( DEBUG )
// System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
return log10LofK;
}
// iterate over higher frequencies if possible
final int ACwiggle = numChr - set.getACsum();
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
return log10LofK;
final int numAltAlleles = set.getACcounts().getCounts().length;
// add conformations for the k+1 case
for ( int allele = 0; allele < numAltAlleles; allele++ ) {
final int[] ACcountsClone = set.getACcounts().getCounts().clone();
ACcountsClone[allele]++;
// to get to this conformation, a sample would need to be AB (remember that ref=0)
final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1);
updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
}
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
if ( ACwiggle > 1 ) {
final ArrayList<DependentSet> differentAlleles = new ArrayList<>(numAltAlleles * numAltAlleles);
final ArrayList<DependentSet> sameAlleles = new ArrayList<>(numAltAlleles);
for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
final int[] ACcountsClone = set.getACcounts().getCounts().clone();
ACcountsClone[allele_i]++;
ACcountsClone[allele_j]++;
// to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index)
final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1);
if ( allele_i == allele_j )
sameAlleles.add(new DependentSet(ACcountsClone, PLindex));
else
differentAlleles.add(new DependentSet(ACcountsClone, PLindex));
}
}
// IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
for ( DependentSet dependent : differentAlleles )
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
for ( DependentSet dependent : sameAlleles )
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
}
return log10LofK;
}
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
// also pushes its value to the given callingSetIndex.
private void updateACset(final int[] newSetCounts,
final int numChr,
final ExactACset dependentSet,
final int PLsetIndex,
final Queue<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final ArrayList<double[]> genotypeLikelihoods) {
final ExactACcounts index = new ExactACcounts(newSetCounts);
if ( !indexesToACset.containsKey(index) ) {
ExactACset set = new ExactACset(numChr/2 +1, index);
indexesToACset.put(index, set);
ACqueue.add(set);
}
// push data from the dependency to the new set
//if ( DEBUG )
// System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts);
pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods);
}
private void computeLofK(final ExactACset set,
final ArrayList<double[]> genotypeLikelihoods,
final double[] log10AlleleFrequencyPriors) {
set.getLog10Likelihoods()[0] = 0.0; // the zero case
final int totalK = set.getACsum();
// special case for k = 0 over all k
if ( totalK == 0 ) {
for ( int j = 1; j < set.getLog10Likelihoods().length; j++ )
set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];
final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];
getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0);
getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
return;
}
// if we got here, then k > 0 for at least one k.
// the non-AA possible conformations were already dealt with by pushes from dependent sets;
// now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
for ( int j = 1; j < set.getLog10Likelihoods().length; j++ ) {
if ( totalK < 2*j-1 ) {
final double[] gl = genotypeLikelihoods.get(j);
final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.getLog10Likelihoods()[j-1] + gl[HOM_REF_INDEX];
set.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[j], conformationValue);
}
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j] - logDenominator;
}
double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];
// update the MLE if necessary
getStateTracker().updateMLEifNeeded(log10LofK, set.getACcounts().getCounts());
// apply the priors over each alternate allele
for ( final int ACcount : set.getACcounts().getCounts() ) {
if ( ACcount > 0 )
log10LofK += log10AlleleFrequencyPriors[ACcount];
}
getStateTracker().updateMAPifNeeded(log10LofK, set.getACcounts().getCounts());
}
private void pushData(final ExactACset targetSet,
final ExactACset dependentSet,
final int PLsetIndex,
final ArrayList<double[]> genotypeLikelihoods) {
final int totalK = targetSet.getACsum();
for ( int j = 1; j < targetSet.getLog10Likelihoods().length; j++ ) {
if ( totalK <= 2*j ) { // skip impossible conformations
final double[] gl = genotypeLikelihoods.get(j);
final double conformationValue =
determineCoefficient(PLsetIndex, j, targetSet.getACcounts().getCounts(), totalK) + dependentSet.getLog10Likelihoods()[j-1] + gl[PLsetIndex];
targetSet.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(targetSet.getLog10Likelihoods()[j], conformationValue);
}
}
}
private double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
// the closed form representation generalized for multiple alleles is as follows:
// AA: (2j - totalK) * (2j - totalK - 1)
// AB: 2k_b * (2j - totalK)
// AC: 2k_c * (2j - totalK)
// BB: k_b * (k_b - 1)
// BC: 2 * k_b * k_c
// CC: k_c * (k_c - 1)
// find the 2 alleles that are represented by this PL index
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
// *** note that throughout this method we subtract one from the alleleIndex because ACcounts ***
// *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. ***
// the AX het case
if ( alleles.alleleIndex1 == 0 )
return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK];
final int k_i = ACcounts[alleles.alleleIndex1-1];
// the hom var case (e.g. BB, CC, DD)
final double coeff;
if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) {
coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
}
// the het non-ref case (e.g. BC, BD, CD)
else {
final int k_j = ACcounts[alleles.alleleIndex2-1];
coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
}
return coeff;
}
public GenotypesContext subsetAlleles(final VariantContext vc,
final List<Allele> allelesToUse,
final boolean assignGenotypes,
final int ploidy) {
return allelesToUse.size() == 1
? GATKVariantContextUtils.subsetToRefOnly(vc, ploidy)
: GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse,
assignGenotypes ? GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN : GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL);
}
}

View File

@ -0,0 +1,240 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.variantcontext.*;
import java.util.*;
/**
* Uses the Exact calculation of Heng Li
*/
abstract class ExactAFCalc extends AFCalc {
protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first
/**
* Sorts {@link org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactAFCalc.LikelihoodSum} instances where those with higher likelihood are first.
*/
protected static final Comparator<LikelihoodSum> LIKELIHOOD_SUM_COMPARATOR = new Comparator<LikelihoodSum>() {
@Override
public int compare(final LikelihoodSum o1, final LikelihoodSum o2) {
return - Double.compare(o1.sum,o2.sum);
}
};
/**
* Sorts {@link org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactAFCalc.LikelihoodSum} instances where those with higher likelihood are first but make sure that
* NON_REF alleles are place are last.
*/
protected static final Comparator<LikelihoodSum> LIKELIHOOD_NON_REF_THEN_SUM_COMPARATOR = new Comparator<LikelihoodSum>() {
@Override
public int compare(final LikelihoodSum o1, final LikelihoodSum o2) {
if (o1.allele == GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE)
return 1;
else if (o2.allele == GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE)
return -1;
else
return o1.compareTo(o2);
}
};
/**
* Sorts {@link org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactAFCalc.LikelihoodSum} instances where those with lower alternative allele index are first regardless of
* the likelihood sum.
*/
protected static final Comparator<LikelihoodSum> LIKELIHOOD_INDEX_COMPARATOR = new Comparator<LikelihoodSum>() {
@Override
public int compare(final LikelihoodSum o1, final LikelihoodSum o2) {
return Integer.compare(o1.index, o2.index);
}
};
protected ExactAFCalc(final int nSamples, int maxAltAlleles, final int ploidy) {
super(nSamples, maxAltAlleles, ploidy);
}
/**
* Wrapper class that compares two likelihoods associated with two alleles
*/
protected static final class LikelihoodSum implements Comparable<LikelihoodSum> {
public double sum = 0.0;
public final Allele allele;
public final int index;
public LikelihoodSum(final Allele allele, final int index) { this.allele = allele; this.index = index; }
public int compareTo(LikelihoodSum other) {
final double diff = sum - other.sum;
return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? -1 : 0;
}
}
/**
* Unpack GenotypesContext into arraylist of doubel values
* @param GLs Input genotype context
* @return ArrayList of doubles corresponding to GL vectors
*/
protected static ArrayList<double[]> getGLs(final GenotypesContext GLs, final boolean includeDummy) {
final ArrayList<double[]> genotypeLikelihoods = new ArrayList<>(GLs.size() + 1);
if ( includeDummy ) genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
for ( Genotype sample : GLs.iterateInSampleNameOrder() ) {
if ( sample.hasLikelihoods() ) {
final double[] gls = sample.getLikelihoods().getAsVector();
if ( MathUtils.sum(gls) < GATKVariantContextUtils.SUM_GL_THRESH_NOCALL )
genotypeLikelihoods.add(gls);
}
}
return genotypeLikelihoods;
}
@Override
protected VariantContext reduceScope(final VariantContext vc) {
// don't try to genotype too many alternate alleles
final List<Allele> inputAltAlleles = vc.getAlternateAlleles();
final List<Allele> outputAltAlleles = reduceScopeAlleles(vc,getMaxAltAlleles());
// only if output allele has reduced from the input alt allele set size we should care.
final int altAlleleReduction = inputAltAlleles.size() - outputAltAlleles.size();
if (altAlleleReduction == 0)
return vc;
else if (altAlleleReduction != 0) {
logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles()
+ " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart()
+ " has " + (vc.getAlternateAlleles().size())
+ " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
final List<Allele> alleles = new ArrayList<>(getMaxAltAlleles() + 1);
alleles.add(vc.getReference());
alleles.addAll(reduceScopeAlleles(vc, getMaxAltAlleles()));
final VariantContextBuilder builder = new VariantContextBuilder(vc);
builder.alleles(alleles);
builder.genotypes(reduceScopeGenotypes(vc, alleles));
if (altAlleleReduction < 0)
throw new IllegalStateException("unexpected: reduction increased the number of alt. alleles!: " + - altAlleleReduction + " " + vc + " " + builder.make());
return builder.make();
} else // if (altAlleleReduction < 0)
throw new IllegalStateException("unexpected: reduction increased the number of alt. alleles!: " + - altAlleleReduction + " " + vc);
}
/**
* Returns a the new set of alleles to use.
* @param vc target variant context.
* @param numAllelesToChoose number of alleles to keep.
* @return the list of alternative allele to keep.
*/
protected List<Allele> reduceScopeAlleles(final VariantContext vc, final int numAllelesToChoose) {
// Look for the <NON_REF> allele to exclude it from the pruning if present.
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
final int nonRefAltAlleleIndex = GATKVariantContextUtils.indexOfAltAllele(vc,
GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE, false);
final boolean nonRefAltAllelePresent = nonRefAltAlleleIndex >= 0;
// <NON_REF> should not be considered in the downsizing, so we need to count it out when
// considering if alt. allele downsizing is required.
final int numProperOriginalAltAlleles = numOriginalAltAlleles - (nonRefAltAllelePresent ? 1 : 0);
// Avoid pointless allele reduction:
if (numAllelesToChoose >= numProperOriginalAltAlleles)
return vc.getAlternateAlleles();
final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
for ( int i = 0; i < numOriginalAltAlleles; i++ ) {
final Allele allele = vc.getAlternateAllele(i);
likelihoodSums[i] = new LikelihoodSum(allele,i);
}
// Calculate the allele likelihood sums.
reduceScopeCalculateLikelihoodSums(vc, likelihoodSums);
// sort them by probability mass and choose the best ones
// Make sure that the <NON_REF> allele is last if present.
Collections.sort(Arrays.asList(likelihoodSums), nonRefAltAllelePresent ? LIKELIHOOD_NON_REF_THEN_SUM_COMPARATOR : LIKELIHOOD_SUM_COMPARATOR);
// We need to return the best likelihood alleles in the original alternative allele index order.
// This heap will keep track of that index order.
final PriorityQueue<LikelihoodSum> mostLikelyAllelesHeapByIndex = new PriorityQueue<>(numOriginalAltAlleles, LIKELIHOOD_INDEX_COMPARATOR);
for ( int i = 0; i < numAllelesToChoose; i++ )
mostLikelyAllelesHeapByIndex.add(likelihoodSums[i]);
// guaranteed no to have been added at this point thanks for checking on whether reduction was
// needed in the first place.
if (nonRefAltAllelePresent)
mostLikelyAllelesHeapByIndex.add(likelihoodSums[nonRefAltAlleleIndex]);
final ArrayList<Allele> orderedBestAlleles = new ArrayList<>(numAllelesToChoose);
while (!mostLikelyAllelesHeapByIndex.isEmpty())
orderedBestAlleles.add(mostLikelyAllelesHeapByIndex.remove().allele);
return orderedBestAlleles;
}
protected static final int PL_INDEX_OF_HOM_REF = 0;
/**
* Update the likelihood sums with using the variant context genotype likelihoods.
* @param vc source variant context.
* @param likelihoodSums where to update the likelihood sums.
*/
protected abstract void reduceScopeCalculateLikelihoodSums(final VariantContext vc, final LikelihoodSum[] likelihoodSums);
/**
* Transforms the genotypes of the variant context according to the new subset of possible alleles.
*
* @param vc original variant-context.
* @param allelesToUse possible alleles.
* @return never {@code null}, the new set of genotype calls for the reduced scope.
*/
protected abstract GenotypesContext reduceScopeGenotypes(final VariantContext vc, final List<Allele> allelesToUse);
}

View File

@ -0,0 +1,590 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.variant.variantcontext.*;
import java.util.*;
public class GeneralPloidyExactAFCalc extends ExactAFCalc {
static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
private final int ploidy;
private final static boolean VERBOSE = false;
protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) {
super(nSamples, maxAltAlleles, ploidy);
this.ploidy = ploidy;
}
@Override
protected GenotypesContext reduceScopeGenotypes(final VariantContext vc, final List<Allele> allelesToUse) {
return subsetAlleles(vc,allelesToUse,false,ploidy);
}
@Override
public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) {
combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors);
return getResultFromFinalState(vc, log10AlleleFrequencyPriors);
}
/**
* Simple wrapper class to hold values of combined pool likelihoods.
* For fast hashing and fast retrieval, there's a hash map that shadows main list.
*
*/
static class CombinedPoolLikelihoods {
private LinkedList<ExactACset> alleleCountSetList;
private HashMap<ExactACcounts,ExactACset> conformationMap;
private double maxLikelihood;
public CombinedPoolLikelihoods() {
// final int numElements = GenotypeLikelihoods.numLikelihoods();
alleleCountSetList = new LinkedList<>();
conformationMap = new HashMap<>();
maxLikelihood = Double.NEGATIVE_INFINITY;
}
public void add(ExactACset set) {
alleleCountSetList.add(set);
conformationMap.put(set.getACcounts(), set);
final double likelihood = set.getLog10Likelihoods()[0];
if (likelihood > maxLikelihood )
maxLikelihood = likelihood;
}
public boolean hasConformation(int[] ac) {
return conformationMap.containsKey(new ExactACcounts(ac));
}
public double getLikelihoodOfConformation(int[] ac) {
return conformationMap.get(new ExactACcounts(ac)).getLog10Likelihoods()[0];
}
public double getGLOfACZero() {
return alleleCountSetList.get(0).getLog10Likelihoods()[0]; // AC 0 is always at beginning of list
}
public int getLength() {
return alleleCountSetList.size();
}
}
@Override
protected void reduceScopeCalculateLikelihoodSums(final VariantContext vc, final LikelihoodSum[] likelihoodSums) {
final int numOriginalAltAlleles = likelihoodSums.length;
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes(), false);
for ( final double[] likelihoods : GLs ) {
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
final int[] acCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(1 + numOriginalAltAlleles, ploidy, PLindexOfBestGL);
// by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele
for (int k=1; k < acCount.length;k++)
if (acCount[k] > 0 )
likelihoodSums[k-1].sum += acCount[k] * (likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]);
}
}
/**
* Simple non-optimized version that combines GLs from several pools and produces global AF distribution.
* @param GLs Inputs genotypes context with per-pool GLs
* @param numAlleles Number of alternate alleles
* @param ploidyPerPool Number of samples per pool
* @param log10AlleleFrequencyPriors Frequency priors
*/
protected void combineSinglePools(final GenotypesContext GLs,
final int numAlleles,
final int ploidyPerPool,
final double[] log10AlleleFrequencyPriors) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs, true);
int combinedPloidy = 0;
// Combine each pool incrementally - likelihoods will be renormalized at each step
CombinedPoolLikelihoods combinedPoolLikelihoods = new CombinedPoolLikelihoods();
// first element: zero ploidy, e.g. trivial degenerate distribution
final int[] zeroCounts = new int[numAlleles];
final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts));
set.getLog10Likelihoods()[0] = 0.0;
combinedPoolLikelihoods.add(set);
if ( genotypeLikelihoods.size() <= 1 ) {
// no meaningful GLs at all, just set the tracker to non poly values
getStateTracker().reset(); // just mimic-ing call below
getStateTracker().setLog10LikelihoodOfAFzero(0.0);
} else {
for (int p=1; p<genotypeLikelihoods.size(); p++) {
getStateTracker().reset(); // TODO -- why is this here? It makes it hard to track the n evaluation
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p),
combinedPloidy, ploidyPerPool, numAlleles, log10AlleleFrequencyPriors);
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
}
}
}
public CombinedPoolLikelihoods fastCombineMultiallelicPool(final CombinedPoolLikelihoods originalPool,
double[] newGL,
int originalPloidy,
int newGLPloidy,
int numAlleles,
final double[] log10AlleleFrequencyPriors) {
final LinkedList<ExactACset> ACqueue = new LinkedList<>();
// mapping of ExactACset indexes to the objects
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<>();
final CombinedPoolLikelihoods newPool = new CombinedPoolLikelihoods();
// add AC=0 to the queue
final int[] zeroCounts = new int[numAlleles];
final int newPloidy = originalPloidy + newGLPloidy;
zeroCounts[0] = newPloidy;
ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts));
ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.getACcounts(), zeroSet);
// keep processing while we have AC conformations that need to be calculated
while ( !ACqueue.isEmpty() ) {
getStateTracker().incNEvaluations();
// compute log10Likelihoods
final ExactACset ACset = ACqueue.remove();
calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset);
// clean up memory
indexesToACset.remove(ACset.getACcounts());
if ( VERBOSE )
System.out.printf(" *** removing used set=%s%n", ACset.getACcounts());
}
return newPool;
}
// todo - refactor, function almost identical except for log10LofK computation in GeneralPloidyGenotypeLikelihoods
/**
*
* @param set ExactACset holding conformation to be computed
* @param newPool New pool likelihood holder
* @param originalPool Original likelihood holder
* @param newGL New pool GL vector to combine
* @param log10AlleleFrequencyPriors Prior object
* @param originalPloidy Total ploidy of original combined pool
* @param newGLPloidy Ploidy of GL vector
* @param ACqueue Queue of conformations to compute
* @param indexesToACset AC indices of objects in queue
* @return max log likelihood
*/
private double calculateACConformationAndUpdateQueue(final ExactACset set,
final CombinedPoolLikelihoods newPool,
final CombinedPoolLikelihoods originalPool,
final double[] newGL,
final double[] log10AlleleFrequencyPriors,
final int originalPloidy,
final int newGLPloidy,
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
// compute likeihood in "set" of new set based on original likelihoods
final int numAlleles = set.getACcounts().getCounts().length;
final int newPloidy = set.getACsum();
final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy);
// add to new pool
if (!Double.isInfinite(log10LofK))
newPool.add(set);
// TODO -- change false to true this correct line when the implementation of this model is optimized (it's too slow now to handle this fix)
if ( getStateTracker().abort(log10LofK, set.getACcounts(), false) ) {
return log10LofK;
}
// iterate over higher frequencies if possible
// by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
// so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space
final int ACwiggle = set.getACcounts().getCounts()[0];
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
return log10LofK;
// add conformations for other cases
for ( int allele = 1; allele < numAlleles; allele++ ) {
final int[] ACcountsClone = set.getACcounts().getCounts().clone();
ACcountsClone[allele]++;
// is this a valid conformation?
int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
ACcountsClone[0] = newPloidy - altSum;
if (ACcountsClone[0] < 0)
continue;
GeneralPloidyGenotypeLikelihoods.updateACset(ACcountsClone, ACqueue, indexesToACset);
}
return log10LofK;
}
// /**
// * Naive combiner of two multiallelic pools - number of alt alleles must be the same.
// * Math is generalization of biallelic combiner.
// *
// * For vector K representing an allele count conformation,
// * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K)
// * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...])
// * @param originalPool First log-likelihood pool GL vector
// * @param yy Second pool GL vector
// * @param ploidy1 Ploidy of first pool (# of chromosomes in it)
// * @param ploidy2 Ploidy of second pool
// * @param numAlleles Number of alleles
// * @param log10AlleleFrequencyPriors Array of biallelic priors
// * @param resultTracker Af calculation result object
// */
// public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
// final double[] log10AlleleFrequencyPriors,
// final AFCalcResultTracker resultTracker) {
///*
// final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
// final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);
//
// if (dim1 != originalPool.getLength() || dim2 != yy.length)
// throw new ReviewedStingException("BUG: Inconsistent vector length");
//
// if (ploidy2 == 0)
// return;
//
// final int newPloidy = ploidy1 + ploidy2;
//
// // Say L1(K) = Pr(D|AC1=K) * choose(m1,K)
// // and L2(K) = Pr(D|AC2=K) * choose(m2,K)
// GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
// final double[] x = originalPool.getLikelihoodsAsVector(true);
// while(firstIterator.hasNext()) {
// x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector());
// firstIterator.next();
// }
//
// GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
// final double[] y = yy.clone();
// while(secondIterator.hasNext()) {
// y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector());
// secondIterator.next();
// }
//
// // initialize output to -log10(choose(m1+m2,[k1 k2...])
// final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy);
// final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);
//
//
// // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K
// while(outputIterator.hasNext()) {
// final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector()));
// double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result);
//
// originalPool.add(likelihood, set, outputIterator.getLinearIndex());
// outputIterator.next();
// }
//*/
// }
/**
* Compute likelihood of a particular AC conformation and update AFresult object
* @param set Set of AC counts to compute
* @param firstGLs Original pool likelihoods before combining
* @param secondGL New GL vector with additional pool
* @param log10AlleleFrequencyPriors Allele frequency priors
* @param numAlleles Number of alleles (including ref)
* @param ploidy1 Ploidy of original pool (combined)
* @param ploidy2 Ploidy of new pool
* @return log-likehood of requested conformation
*/
private double computeLofK(final ExactACset set,
final CombinedPoolLikelihoods firstGLs,
final double[] secondGL,
final double[] log10AlleleFrequencyPriors,
final int numAlleles, final int ploidy1, final int ploidy2) {
final int newPloidy = ploidy1 + ploidy2;
// sanity check
int totalAltK = set.getACsum();
if (newPloidy != totalAltK)
throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values");
totalAltK -= set.getACcounts().getCounts()[0];
// totalAltK has sum of alt alleles of conformation now
// special case for k = 0 over all k
if ( totalAltK == 0 ) { // all-ref case
final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX];
set.getLog10Likelihoods()[0] = log10Lof0;
getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0);
getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
return log10Lof0;
} else {
// initialize result with denominator
// ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy.
// To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i
int[] currentCount = set.getACcounts().getCounts();
double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount);
// for current conformation, get all possible ways to break vector K into two components G1 and G2
final GeneralPloidyGenotypeLikelihoods.SumIterator innerIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
set.getLog10Likelihoods()[0] = Double.NEGATIVE_INFINITY;
while (innerIterator.hasNext()) {
// check if breaking current conformation into g1 and g2 is feasible.
final int[] acCount2 = innerIterator.getCurrentVector();
final int[] acCount1 = MathUtils.vectorDiff(currentCount, acCount2);
final int idx2 = innerIterator.getLinearIndex();
// see if conformation is valid and if original pool had this conformation
// for conformation to be valid, all elements of g2 have to be <= elements of current AC set
if (isValidConformation(acCount1,ploidy1) && firstGLs.hasConformation(acCount1)) {
final double gl2 = secondGL[idx2];
if (!Double.isInfinite(gl2)) {
final double firstGL = firstGLs.getLikelihoodOfConformation(acCount1);
final double num1 = MathUtils.log10MultinomialCoefficient(ploidy1, acCount1);
final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2);
final double sum = firstGL + gl2 + num1 + num2;
set.getLog10Likelihoods()[0] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[0], sum);
}
}
innerIterator.next();
}
set.getLog10Likelihoods()[0] += denom;
}
double log10LofK = set.getLog10Likelihoods()[0];
// update the MLE if necessary
final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length);
// TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY
getStateTracker().updateMLEifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts);
// apply the priors over each alternate allele
for (final int ACcount : altCounts ) {
if ( ACcount > 0 )
log10LofK += log10AlleleFrequencyPriors[ACcount];
}
// TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY
getStateTracker().updateMAPifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts);
return log10LofK;
}
/**
* Small helper routine - is a particular AC conformationv vector valid? ie are all elements non-negative and sum to ploidy?
* @param set AC conformation vector
* @param ploidy Ploidy of set
* @return Valid conformation
*/
private static boolean isValidConformation(final int[] set, final int ploidy) {
int sum=0;
for (final int ac: set) {
if (ac < 0)
return false;
sum += ac;
}
return (sum == ploidy);
}
/**
* From a given variant context, extract a given subset of alleles, and update genotype context accordingly,
* including updating the PL's, and assign genotypes accordingly
* @param vc variant context with alleles and genotype likelihoods
* @param allelesToUse alleles to subset
* @param assignGenotypes true: assign hard genotypes, false: leave as no-call
* @param ploidy number of chromosomes per sample (pool)
* @return GenotypesContext with new PLs
*/
public GenotypesContext subsetAlleles(final VariantContext vc,
final List<Allele> allelesToUse,
final boolean assignGenotypes,
final int ploidy) {
// the genotypes with PLs
final GenotypesContext oldGTs = vc.getGenotypes();
List<Allele> NO_CALL_ALLELES = new ArrayList<>(ploidy);
for (int k=0; k < ploidy; k++)
NO_CALL_ALLELES.add(Allele.NO_CALL);
// samples
final List<String> sampleIndices = oldGTs.getSampleNamesOrderedByName();
// the new genotypes to create
final GenotypesContext newGTs = GenotypesContext.create();
// we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
final int numNewAltAlleles = allelesToUse.size() - 1;
// create the new genotypes
for ( int k = 0; k < oldGTs.size(); k++ ) {
final Genotype g = oldGTs.get(sampleIndices.get(k));
if ( !g.hasLikelihoods() ) {
newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
continue;
}
// create the new likelihoods array from the alleles we are allowed to use
final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
double[] newLikelihoods;
// Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization
// and subsetting
if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) {
newLikelihoods = originalLikelihoods;
} else {
newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse);
// might need to re-normalize
newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true);
}
// if there is no mass on the (new) likelihoods, then just no-call the sample
if ( MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) {
newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
}
else {
final GenotypeBuilder gb = new GenotypeBuilder(g);
if ( numNewAltAlleles == 0 )
gb.noPL();
else
gb.PL(newLikelihoods);
// if we weren't asked to assign a genotype, then just no-call the sample
if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL )
gb.alleles(NO_CALL_ALLELES);
else
assignGenotype(gb, newLikelihoods, allelesToUse, ploidy);
newGTs.add(gb.make());
}
}
return newGTs;
}
/**
* Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs
*
* @param newLikelihoods the PL array
* @param allelesToUse the list of alleles to choose from (corresponding to the PLs)
* @param numChromosomes Number of chromosomes per pool
*/
private void assignGenotype(final GenotypeBuilder gb,
final double[] newLikelihoods,
final List<Allele> allelesToUse,
final int numChromosomes) {
final int numNewAltAlleles = allelesToUse.size() - 1;
// find the genotype with maximum likelihoods
final int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);
final int[] mlAlleleCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(allelesToUse.size(), numChromosomes, PLindex);
final ArrayList<Double> alleleFreqs = new ArrayList<>();
final ArrayList<Integer> alleleCounts = new ArrayList<>();
for (int k=1; k < mlAlleleCount.length; k++) {
alleleCounts.add(mlAlleleCount[k]);
final double freq = (double)mlAlleleCount[k] / (double)numChromosomes;
alleleFreqs.add(freq);
}
// per-pool logging of AC and AF
gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts);
gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs);
// remove PLs if necessary
if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING)
gb.noPL();
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
// add list of called ML genotypes to alleles list
// TODO - too unwieldy?
int idx = 0;
for (int mlind = 0; mlind < mlAlleleCount.length; mlind++) {
for (int k=0; k < mlAlleleCount[mlind]; k++)
myAlleles.add(idx++,allelesToUse.get(mlind));
}
gb.alleles(myAlleles);
// TODO - deprecated so what is the appropriate method to call?
if ( numNewAltAlleles > 0 )
gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods));
}
}

View File

@ -0,0 +1,496 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.variant.variantcontext.*;
import java.util.*;
/**
* Computes the conditional bi-allelic exact results
*
* Suppose vc contains 2 alt allele: A* with C and T. This function first computes:
*
* (1) P(D | AF_c > 0 && AF_t == *) [i.e., T can be anything]
*
* it then computes the conditional probability on AF_c == 0:
*
* (2) P(D | AF_t > 0 && AF_c == 0)
*
* Thinking about this visually, we have the following likelihood matrix where each cell is
* the P(D | AF_c == i && AF_t == j):
*
* 0 AF_c > 0
* -----------------
* 0 | |
* |--|-------------
* a | |
* f | |
* _ | |
* t | |
* > | |
* 0 | |
*
* What we really want to know how
*
* (3) P(D | AF_c == 0 & AF_t == 0)
*
* compares with
*
* (4) P(D | AF_c > 0 || AF_t > 0)
*
* This is effectively asking for the value in the upper left vs. the sum of all cells.
*
* This class implements the conditional likelihoods summation for any number of alt
* alleles, where each alt allele has its EXACT probability of segregating calculated by
* reducing each alt B into the case XB and computing P(D | AF_b > 0 ) as follows:
*
* Suppose we have for a A/B/C site the following GLs:
*
* AA AB BB AC BC CC
*
* and we want to get the bi-allelic GLs for X/B, where X is everything not B
*
* XX = AA + AC + CC (since X = A or C)
* XB = AB + BC
* BB = BB
*
* After each allele has its probability calculated we compute the joint posterior
* as P(D | AF_* == 0) = prod_i P (D | AF_i == 0), after applying the theta^i
* prior for the ith least likely allele.
*/
public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
/**
* The min. confidence of an allele to be included in the joint posterior.
*/
private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-10);
private final static int[] BIALLELIC_NON_INFORMATIVE_PLS = new int[]{0,0,0};
private final static List<Allele> BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
/**
* Sorts AFCalcResults by their posteriors of AF > 0, so the
*/
private final static class CompareAFCalcResultsByPNonRef implements Comparator<AFCalcResult> {
@Override
public int compare(AFCalcResult o1, AFCalcResult o2) {
return -1 * Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0());
}
}
private final static CompareAFCalcResultsByPNonRef compareAFCalcResultsByPNonRef = new CompareAFCalcResultsByPNonRef();
/**
* The AFCalc model we are using to do the bi-allelic computation
*/
final AFCalc biAlleleExactModel;
protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) {
super(nSamples, maxAltAlleles, ploidy);
biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, ploidy);
}
/**
* Trivial subclass that helps with debugging by keeping track of the supporting information for this joint call
*/
private static class MyAFCalcResult extends AFCalcResult {
/**
* List of the supporting bi-allelic AFCalcResults that went into making this multi-allelic joint call
*/
final List<AFCalcResult> supporting;
private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List<Allele> allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map<Allele, Double> log10pRefByAllele, List<AFCalcResult> supporting) {
super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pRefByAllele);
this.supporting = supporting;
}
}
@Override
public AFCalcResult computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors) {
final List<AFCalcResult> independentResultTrackers = computeAlleleIndependentExact(vc, log10AlleleFrequencyPriors);
if ( independentResultTrackers.size() == 0 )
throw new IllegalStateException("Independent alleles model returned an empty list of results at VC " + vc);
if ( independentResultTrackers.size() == 1 ) {
// fast path for the very common bi-allelic use case
return independentResultTrackers.get(0);
} else {
// we are a multi-allelic, so we need to actually combine the results
final List<AFCalcResult> withMultiAllelicPriors = applyMultiAllelicPriors(independentResultTrackers);
return combineIndependentPNonRefs(vc, withMultiAllelicPriors);
}
}
/**
* Compute the conditional exact AFCalcResult for each allele in vc independently, returning
* the result of each, in order of the alt alleles in VC
*
* @param vc the VariantContext we want to analyze, with at least 1 alt allele
* @param log10AlleleFrequencyPriors the priors
* @return a list of the AFCalcResults for each bi-allelic sub context of vc
*/
@Requires({"vc != null", "log10AlleleFrequencyPriors != null"})
@Ensures("goodIndependentResult(vc, result)")
protected final List<AFCalcResult> computeAlleleIndependentExact(final VariantContext vc,
final double[] log10AlleleFrequencyPriors) {
final List<AFCalcResult> results = new LinkedList<AFCalcResult>();
for ( final VariantContext subvc : makeAlleleConditionalContexts(vc) ) {
final AFCalcResult resultTracker = biAlleleExactModel.getLog10PNonRef(subvc, log10AlleleFrequencyPriors);
results.add(resultTracker);
}
return results;
}
/**
* Helper function to ensure that the computeAlleleIndependentExact is returning reasonable results
*/
private static boolean goodIndependentResult(final VariantContext vc, final List<AFCalcResult> results) {
if ( results.size() != vc.getNAlleles() - 1) return false;
for ( int i = 0; i < results.size(); i++ ) {
if ( results.get(i).getAllelesUsedInGenotyping().size() != 2 )
return false;
if ( ! results.get(i).getAllelesUsedInGenotyping().contains(vc.getAlternateAllele(i)) )
return false;
}
return true;
}
/**
* Returns the bi-allelic variant context for each alt allele in vc with bi-allelic likelihoods, in order
*
* @param vc the variant context to split. Must have n.alt.alleles > 1
* @return a bi-allelic variant context for each alt allele in vc
*/
@Requires({"vc != null", "vc.getNAlleles() > 1"})
@Ensures("result.size() == vc.getNAlleles() - 1")
protected final List<VariantContext> makeAlleleConditionalContexts(final VariantContext vc) {
final int nAltAlleles = vc.getNAlleles() - 1;
if ( nAltAlleles == 1 ) {
// fast path for bi-allelic case.
return Collections.singletonList(vc);
} else {
// go through the work of ripping up the VC into its biallelic components
final List<VariantContext> vcs = new LinkedList<VariantContext>();
for ( int altI = 0; altI < nAltAlleles; altI++ ) {
vcs.add(biallelicCombinedGLs(vc, altI + 1));
}
return vcs;
}
}
/**
* Create a single bi-allelic variant context from rootVC with alt allele with index altAlleleIndex
*
* @param rootVC the root (potentially multi-allelic) variant context
* @param altAlleleIndex index of the alt allele, from 0 == first alt allele
* @return a bi-allelic variant context based on rootVC
*/
@Requires({"rootVC.getNAlleles() > 1", "altAlleleIndex < rootVC.getNAlleles()"})
@Ensures({"result.isBiallelic()"})
protected final VariantContext biallelicCombinedGLs(final VariantContext rootVC, final int altAlleleIndex) {
if ( rootVC.isBiallelic() ) {
return rootVC;
} else {
final int nAlts = rootVC.getNAlleles() - 1;
final List<Genotype> biallelicGenotypes = new ArrayList<Genotype>(rootVC.getNSamples());
for ( final Genotype g : rootVC.getGenotypes() )
biallelicGenotypes.add(combineGLsPrecise(g, altAlleleIndex, nAlts));
final VariantContextBuilder vcb = new VariantContextBuilder(rootVC);
final Allele altAllele = rootVC.getAlternateAllele(altAlleleIndex - 1);
vcb.alleles(Arrays.asList(rootVC.getReference(), altAllele));
vcb.genotypes(biallelicGenotypes);
return vcb.make();
}
}
/**
* Returns a new Genotype with the PLs of the multi-allelic original reduced to a bi-allelic case
*
* This is handled in the following way:
*
* Suppose we have for a A/B/C site the following GLs:
*
* AA AB BB AC BC CC
*
* and we want to get the bi-allelic GLs for X/B, where X is everything not B
*
* XX = AA + AC + CC (since X = A or C)
* XB = AB + BC
* BB = BB
*
* @param original the original multi-allelic genotype
* @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0
* @param nAlts the total number of alt alleles
* @return a new biallelic genotype with appropriate PLs
*/
@Requires({"original.hasLikelihoods()"}) // TODO -- add ploidy == 2 test "original.getPLs() == null || original.getPLs().length == 3"})
@Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"})
@Deprecated
protected Genotype combineGLs(final Genotype original, final int altIndex, final int nAlts ) {
if ( original.isNonInformative() )
return new GenotypeBuilder(original).PL(BIALLELIC_NON_INFORMATIVE_PLS).alleles(BIALLELIC_NOCALL).make();
if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts);
final double[] normalizedPr = MathUtils.normalizeFromLog10(GenotypeLikelihoods.fromPLs(original.getPL()).getAsVector());
final double[] biAllelicPr = new double[3];
for ( int index = 0; index < normalizedPr.length; index++ ) {
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index);
if ( pair.alleleIndex1 == altIndex ) {
if ( pair.alleleIndex2 == altIndex )
// hom-alt case
biAllelicPr[2] = normalizedPr[index];
else
// het-alt case
biAllelicPr[1] += normalizedPr[index];
} else {
if ( pair.alleleIndex2 == altIndex )
// het-alt case
biAllelicPr[1] += normalizedPr[index];
else
// hom-non-alt
biAllelicPr[0] += normalizedPr[index];
}
}
final double[] GLs = new double[3];
for ( int i = 0; i < GLs.length; i++ ) GLs[i] = Math.log10(biAllelicPr[i]);
return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make();
}
private static final double PHRED_2_LOG10_COEFF = -.1;
/**
* Returns a new Genotype with the PLs of the multi-allelic original reduced to a bi-allelic case.
*
* <p>Uses the log-sum-exp trick in order to work well with very low PLs</p>
*
* <p>This is handled in the following way:</p>
*
* <p>Suppose we have for a A/B/C site the following GLs:</p>
*
* <p>AA AB BB AC BC CC</p>
*
* <p>and we want to get the bi-allelic GLs for X/B, where X is everything not B</p>
*
* <p>XX = AA + AC + CC (since X = A or C)<br/>
* XB = AB + BC <br/>
* BB = BB <br/>
* </p>
* <p>
* This implementation use the log sum trick in order to avoid numeric inestability.
* </p>
*
* @param original the original multi-allelic genotype
* @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0
* @param nAlts the total number of alt alleles
* @return a new biallelic genotype with appropriate PLs
*/
@Requires({"original.hasLikelihoods()"})
@Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"})
protected Genotype combineGLsPrecise(final Genotype original, final int altIndex, final int nAlts ) {
if ( original.isNonInformative() )
return new GenotypeBuilder(original).PL(BIALLELIC_NON_INFORMATIVE_PLS).alleles(BIALLELIC_NOCALL).make();
if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts);
final int[] pls = original.getPL();
final int nAlleles = nAlts + 1;
final int plCount = pls.length;
double BB = 0;
final double[] XBvalues = new double[nAlleles - 1];
final double[] XXvalues = new double[plCount - nAlleles];
int xbOffset = 0;
int xxOffset = 0;
for ( int index = 0; index < plCount; index++ ) {
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index);
int i = pair.alleleIndex1;
int j = pair.alleleIndex2;
if (i == j) {
if (i == altIndex) BB = PHRED_2_LOG10_COEFF * pls[index]; else XXvalues[xxOffset++] = PHRED_2_LOG10_COEFF * pls[index];
} else if (i == altIndex || j == altIndex)
XBvalues[xbOffset++] = PHRED_2_LOG10_COEFF * pls[index];
else
XXvalues[xxOffset++] = PHRED_2_LOG10_COEFF * pls[index];
}
final double XB = MathUtils.log10sumLog10(XBvalues);
final double XX = MathUtils.log10sumLog10(XXvalues);
final double[] GLs = new double[] { XX, XB, BB};
return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make();
}
protected final List<AFCalcResult> applyMultiAllelicPriors(final List<AFCalcResult> conditionalPNonRefResults) {
final ArrayList<AFCalcResult> sorted = new ArrayList<AFCalcResult>(conditionalPNonRefResults);
// sort the results, so the most likely allele is first
Collections.sort(sorted, compareAFCalcResultsByPNonRef);
double lastPosteriorGt0 = sorted.get(0).getLog10PosteriorOfAFGT0();
final double log10SingleAllelePriorOfAFGt0 = conditionalPNonRefResults.get(0).getLog10PriorOfAFGT0();
for ( int i = 0; i < sorted.size(); i++ ) {
if ( sorted.get(i).getLog10PosteriorOfAFGT0() > lastPosteriorGt0 )
throw new IllegalStateException("pNonRefResults not sorted: lastPosteriorGt0 " + lastPosteriorGt0 + " but current is " + sorted.get(i).getLog10PosteriorOfAFGT0());
final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0;
final double log10PriorAFEq0 = Math.log10(1 - Math.pow(10, log10PriorAFGt0));
final double[] thetaTONPriors = new double[] { log10PriorAFEq0, log10PriorAFGt0 };
// bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior
sorted.set(i, sorted.get(i).withNewPriors(MathUtils.normalizeFromLog10(thetaTONPriors, true)));
}
return sorted;
}
/**
* Take the independent estimates of pNonRef for each alt allele and combine them into a single result
*
* Given n independent calculations for each of n alternate alleles create a single
* combined AFCalcResult with:
*
* priors for AF == 0 equal to theta^N for the nth least likely allele
* posteriors that reflect the combined chance that any alleles are segregating and corresponding
* likelihoods
* combined MLEs in the order of the alt alleles in vc
*
* @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently
*/
protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc,
final List<AFCalcResult> sortedResultsWithThetaNPriors) {
int nEvaluations = 0;
final int nAltAlleles = sortedResultsWithThetaNPriors.size();
final int[] alleleCountsOfMLE = new int[nAltAlleles];
final double[] log10PriorsOfAC = new double[2];
final Map<Allele, Double> log10pRefByAllele = new HashMap<Allele, Double>(nAltAlleles);
// the sum of the log10 posteriors for AF == 0 and AF > 0 to determine joint probs
double log10PosteriorOfACEq0Sum = 0.0;
double log10PosteriorOfACGt0Sum = 0.0;
boolean anyPoly = false;
for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) {
final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1);
final int altI = vc.getAlleles().indexOf(altAllele) - 1;
// MLE of altI allele is simply the MLE of this allele in altAlleles
alleleCountsOfMLE[altI] = sortedResultWithThetaNPriors.getAlleleCountAtMLE(altAllele);
// the AF > 0 case requires us to store the normalized likelihood for later summation
if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) {
anyPoly = true;
log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0();
log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0();
log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0();
}
log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0();
// bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior
log10pRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0());
// trivial -- update the number of evaluations
nEvaluations += sortedResultWithThetaNPriors.nEvaluations;
}
// If no alleles were polymorphic, make sure we have the proper priors (the defaults) for likelihood calculation
if ( ! anyPoly ) {
log10PriorsOfAC[0] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFEq0();
log10PriorsOfAC[1] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFGT0();
}
// In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C,
// the probability of being poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y). We want to estimate confidently
// log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). This sum is log10PosteriorOfACEq0
//
// note we need to handle the case where the posterior of AF == 0 is 0.0, in which case we
// use the summed log10PosteriorOfACGt0Sum directly. This happens in cases where
// AF > 0 : 0.0 and AF == 0 : -16, and if you use the inverse calculation you get 0.0 and MathUtils.LOG10_P_OF_ZERO
final double log10PosteriorOfACGt0;
if ( log10PosteriorOfACEq0Sum == 0.0 )
log10PosteriorOfACGt0 = log10PosteriorOfACGt0Sum;
else
log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO);
final double[] log10LikelihoodsOfAC = new double[] {
// L + prior = posterior => L = poster - prior
log10PosteriorOfACEq0Sum - log10PriorsOfAC[0],
log10PosteriorOfACGt0 - log10PriorsOfAC[1]
};
return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(),
// necessary to ensure all values < 0
MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true),
// priors incorporate multiple alt alleles, must be normalized
MathUtils.normalizeFromLog10(log10PriorsOfAC, true),
log10pRefByAllele, sortedResultsWithThetaNPriors);
}
}

View File

@ -0,0 +1,538 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.util.*;
/**
* Helper component to manage active region trimming
*
* <p/>
* It receives the user arguments that controls trimming and also performs the trimming region calculation.
*
* @author Valentin Ruano-Rubio &lt;valentin@broadinstitute.org&gt;
*/
class ActiveRegionTrimmer {
/**
* Genome location parser use in order to create and manipulate genomic intervals.
*/
private GenomeLocParser locParser;
/**
* Holds the debug flag. If {@code true} the trimmer will output debugging messages into the log.
*/
private boolean debug;
/**
* Holds the extension to be used based on whether GGA mode is on or off.
*/
private int usableExtension;
/**
* Records whether the trimming intervals are going to be used to emit reference confidence, {@code true},
* or regular HC output {@code false}.
*/
private boolean emitReferenceConfidence;
@Hidden
@Argument(fullName="dontTrimActiveRegions", shortName="dontTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false)
protected boolean dontTrimActiveRegions = false;
/**
* the maximum extent into the full active region extension that we're willing to go in genotyping our events
*/
@Hidden
@Argument(fullName="maxDiscARExtension", shortName="maxDiscARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for discovery", required=false)
protected int discoverExtension = 25;
@Hidden
@Argument(fullName="maxGGAARExtension", shortName="maxGGAARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for GGA mode", required=false)
protected int ggaExtension = 300;
/**
* Include at least this many bases around an event for calling it
*/
@Hidden
@Argument(fullName="paddingAroundIndels", shortName="paddingAroundIndels", doc = "Include at least this many bases around an event for calling indels", required=false)
protected int indelPadding = 150;
@Hidden
@Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false)
protected int snpPadding = 20;
/**
* Holds a reference the trimmer logger.
*/
private final static Logger logger = Logger.getLogger(ActiveRegionTrimmer.class);
/**
* Initializes the trimmer.
*
* <p/>
* This method should be called once and only once before any trimming is performed.
*
*
* @param glp the genome-location-parser to be used when operating with genomic locations.
* @param debug whether to show extra debug log messages.
* @param isGGA whether the trimming region calculator should act as if we are in GGA mode or not.
* @param emitReferenceConfidence indicates whether we plan to use this trimmer to generate trimmed regions
* to be used for emitting reference confidence.
*
* @throws IllegalStateException if this trim calculator has already been initialized.
* @throws IllegalArgumentException if the input location parser is {@code null}.
* @throws UserException.BadArgumentValue if any of the user argument values is invalid.
*/
void initialize(final GenomeLocParser glp, final boolean debug, final boolean isGGA, final boolean emitReferenceConfidence) {
if (locParser != null)
throw new IllegalStateException(getClass().getSimpleName() + " instance initialized twice");
if (glp == null)
throw new IllegalArgumentException("input genome-loc-parser cannot be null");
checkUserArguments();
locParser = glp;
this.debug = debug;
usableExtension = isGGA ? ggaExtension : discoverExtension;
this.emitReferenceConfidence = emitReferenceConfidence;
}
/**
* Checks user trimming argument values
*
* @throws UserException.BadArgumentValue if there is some problem with any of the arguments values.
*/
private void checkUserArguments() {
if ( snpPadding < 0 ) throw new UserException.BadArgumentValue("paddingAroundSNPs","" + snpPadding + "< 0");
if ( indelPadding < 0 ) throw new UserException.BadArgumentValue("paddingAroundIndels","" + indelPadding + "< 0");
if ( discoverExtension < 0) throw new UserException.BadArgumentValue("maxDiscARExtension","" + discoverExtension + "< 0");
if ( ggaExtension < 0) throw new UserException.BadArgumentValue("maxGGAAREExtension","" + ggaExtension + "< 0");
}
/**
* Holds the result of trimming.
*
*
*
*/
public static class Result {
/**
* Indicates whether trimming is required per data and user request.
*/
protected final boolean needsTrimming;
/**
* Holds the input active region.
*/
protected final ActiveRegion originalRegion;
/**
* Holds the smaller range that contain all relevant callable variants in the
* input active region (not considering the extension).
*
*/
protected final GenomeLoc callableSpan;
/**
* Maximum available range for the trimmed variant region.
*/
protected final GenomeLoc maximumSpan;
/**
* The trimmed variant region span including the extension.
*/
protected final GenomeLoc extendedSpan;
/**
* The ideal trimmer variant region span including the extension.
*/
protected final GenomeLoc idealSpan;
/**
* Returns the ideal trimming span.
*
* <p/>
* The ideal span is the one containing all callable variation overlapping the original active region span
* (without extension) and the applicable padding {@link #getPadding()} in both sides.
*
*
* @return never {@code null}.
*/
@SuppressWarnings("unused")
public GenomeLoc getIdealSpan() {
return idealSpan;
}
/**
* Holds the flanking spans that do not contain the callable variants.
* <p/>
* The first element of the pair is the left (up-stream) non-variant flank, whereas the second element is
* the right (down-stream) non-variant flank.
*/
protected final Pair<GenomeLoc,GenomeLoc> nonVariantFlanks;
/**
* Holds the collection of callable events within the variant trimming region.
*/
protected final List<VariantContext> callableEvents;
/**
* Required padding around the variant trimming region.
*/
protected final int padding;
/**
* Returns the required padding around callable variation.
*
* <p/>
* Notice that due to the limiting span of the original active region (including its extension) it
* is possible that the resulting final trimmed variant region span does not satisfies the padding. However
* that should be rare.
*
* @return 0 or greater.
*/
@SuppressWarnings("unused")
public int getPadding() {
return padding;
}
/**
* Holds the maximum extension around the original active region span considered for the trimmed
* variation region.
*/
protected final int usableExtension;
/**
* Returns the maximum extension around the original active region span considered for the trimmed
* variation region.
*
* <p/>
* From time to time, the trimmed region may require a span beyond the input original active region's.
* For example when there is a callable event close ot one of its ends and the required padding makes it
* round beyond that limit.
*
* <p/>
* Notice that due to the limiting span of the original active region (including its extended region) it
* is possible that the resulting final trimmed variant region span goes beyond this extension including more of
* the original active region own extension.
*
* @return 0 or greater.
*/
@SuppressWarnings("unused")
public int getUsableExtension() {
return usableExtension;
}
/**
* Holds variant-containing callable region.
* <p/>
* This is lazy-initialized using {@link #callableSpan}.
*/
protected ActiveRegion callableRegion;
/**
* Non-variant left flank region.
* <p/>
* This is lazy-initialized using
* {@link #nonVariantFlanks}.{@link Pair#getFirst() getFirst()}.
*/
private ActiveRegion leftFlankRegion;
/**
* Non-variant right flank region.
* <p/>
* This is lazy-initialized using
* {@link #nonVariantFlanks}.{@link Pair#getFirst() getSecond()}.
*/
private ActiveRegion rightFlankRegion;
/**
* Whether the variant trimmed region is going to be used for emitting reference confidence records.
*/
private final boolean emitReferenceConfidence;
/**
* Creates a trimming result given all its properties.
*
* @param emitReferenceConfidence whether reference confidence output modes are on.
* @param needsTrimming whether there is any trimming needed at all.
* @param originalRegion the original active region.
* @param padding padding around contained callable variation events.
* @param extension the extension applied to the trimmed variant span.
* @param overlappingEvents contained callable variation events.
* @param nonVariantFlanks pair of non-variant flank spans around the variant containing span.
* @param extendedSpan final trimmed variant span including the extension.
* @param idealSpan the ideal span, that contains.
* @param maximumSpan maximum possible trimmed span based on the input original active region extended span.
* @param callableSpan variant containing span without padding.
*/
protected Result(final boolean emitReferenceConfidence, final boolean needsTrimming, final ActiveRegion originalRegion,
final int padding, final int extension,
final List<VariantContext> overlappingEvents, final Pair<GenomeLoc,GenomeLoc> nonVariantFlanks,
final GenomeLoc extendedSpan,
final GenomeLoc idealSpan,
final GenomeLoc maximumSpan,
final GenomeLoc callableSpan) {
this.emitReferenceConfidence = emitReferenceConfidence;
this.needsTrimming = needsTrimming;
this.originalRegion = originalRegion;
this.nonVariantFlanks = nonVariantFlanks;
this.padding = padding;
this.usableExtension = extension;
this.callableEvents = overlappingEvents;
this.callableSpan = callableSpan;
this.idealSpan = idealSpan;
this.maximumSpan = maximumSpan;
this.extendedSpan = extendedSpan;
if (!extendedSpan.isUnmapped() && !callableSpan.isUnmapped() && !extendedSpan.containsP(callableSpan))
throw new IllegalArgumentException("the extended callable span must include the callable span");
}
/**
* Checks whether there is any variation present in the target region.
*
* @return {@code true} if there is any variant, {@code false} otherwise.
*/
public boolean isVariationPresent() {
return ! callableEvents.isEmpty();
}
/**
* Checks whether the active region needs trimming.
*/
public boolean needsTrimming() {
return needsTrimming;
}
/**
* Returns the trimmed variant containing region
*
* @throws IllegalStateException if there is no variation detected.
*
* @return never {@code null}.
*/
public ActiveRegion getCallableRegion() {
if (callableRegion == null && !extendedSpan.isUnmapped())
//TODO this conditional is a patch to retain the current standard HC run behaviour
//TODO we should simply remove this difference between trimming with or without GVCF
//TODO embracing slight changes in the standard HC output
callableRegion = emitReferenceConfidence ? originalRegion.trim(callableSpan, extendedSpan) : originalRegion.trim(extendedSpan);
else if (extendedSpan.isUnmapped())
throw new IllegalStateException("there is no variation thus no variant region");
return callableRegion;
}
/**
* Checks whether there is a non-empty left flanking non-variant trimmed out region.
* @return {@code true} if there is a non-trivial left flank region, {@code false} otherwise.
*/
public boolean hasLeftFlankingRegion() {
return ! nonVariantFlanks.getFirst().isUnmapped();
}
/**
* Checks whether there is a non-empty right flanking non-variant trimmed out region.
* @return {@code true} if there is a non-trivial right flank region, {@code false} otherwise.
*/
public boolean hasRightFlankingRegion() {
return ! nonVariantFlanks.getSecond().isUnmapped();
}
/**
* Returns the trimmed out left non-variant region.
* <p/>
* Notice that in case of no variation, the whole original region is considered the left flanking region.
*
* @throws IllegalStateException if there is not such as left flanking region.
*/
public ActiveRegion nonVariantLeftFlankRegion() {
if (leftFlankRegion == null && ! nonVariantFlanks.getFirst().isUnmapped())
leftFlankRegion = originalRegion.trim(nonVariantFlanks.getFirst(),originalRegion.getExtension());
else if (nonVariantFlanks.getFirst().isUnmapped())
throw new IllegalStateException("there is no left flank non-variant trimmed out region");
return leftFlankRegion;
}
/**
* Returns the trimmed out right non-variant region.
*/
public ActiveRegion nonVariantRightFlankRegion() {
if (rightFlankRegion == null && ! nonVariantFlanks.getSecond().isUnmapped())
rightFlankRegion = originalRegion.trim(nonVariantFlanks.getSecond(),originalRegion.getExtension());
else if (nonVariantFlanks.getSecond().isUnmapped())
throw new IllegalStateException("there is no right flank non-variant trimmed out region");
return rightFlankRegion;
}
/**
* Creates a result indicating that there was no trimming to be done.
*/
protected static Result noTrimming(final boolean emitReferenceConfidence,
final ActiveRegion targetRegion, final int padding,
final int usableExtension,final List<VariantContext> events) {
final GenomeLoc targetRegionLoc = targetRegion.getLocation();
final Result result = new Result(emitReferenceConfidence,false,targetRegion,padding,usableExtension,events,new Pair<>(GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED),
targetRegionLoc,targetRegionLoc,targetRegionLoc,targetRegionLoc);
result.callableRegion = targetRegion;
return result;
}
/**
* Creates a result indicating that no variation was found.
*/
protected static Result noVariation(final boolean emitReferenceConfidence, final ActiveRegion targetRegion,
final int padding, final int usableExtension) {
final Result result = new Result(emitReferenceConfidence,false,targetRegion,padding,usableExtension,
Collections.<VariantContext>emptyList(),new Pair<>(targetRegion.getLocation(),GenomeLoc.UNMAPPED),
GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED);
result.leftFlankRegion = targetRegion;
return result;
}
}
/**
* Returns a trimming result object from which the variant trimmed region and flanking non-variant sections
* can be recovered latter.
*
* @param originalRegion the genome location range to trim.
* @param allVariantsWithinExtendedRegion list of variants contained in the trimming location. Variants therein
* not overlapping with {@code originalRegion} are simply ignored.
* @return never {@code null}.
*/
public Result trim(final ActiveRegion originalRegion,
final TreeSet<VariantContext> allVariantsWithinExtendedRegion) {
if ( allVariantsWithinExtendedRegion.isEmpty() ) // no variants,
return Result.noVariation(emitReferenceConfidence,originalRegion,snpPadding, usableExtension);
final List<VariantContext> withinActiveRegion = new LinkedList<>();
final GenomeLoc originalRegionRange = originalRegion.getLocation();
boolean foundNonSnp = false;
GenomeLoc variantSpan = null;
for ( final VariantContext vc : allVariantsWithinExtendedRegion ) {
final GenomeLoc vcLoc = locParser.createGenomeLoc(vc);
if ( originalRegionRange.overlapsP(vcLoc) ) {
foundNonSnp = foundNonSnp || ! vc.isSNP();
variantSpan = variantSpan == null ? vcLoc : variantSpan.endpointSpan(vcLoc);
withinActiveRegion.add(vc);
}
}
final int padding = foundNonSnp ? indelPadding : snpPadding;
// we don't actually have anything in the region after skipping out variants that don't overlap
// the region's full location
if ( variantSpan == null )
return Result.noVariation(emitReferenceConfidence,originalRegion,padding, usableExtension);
if ( dontTrimActiveRegions)
return Result.noTrimming(emitReferenceConfidence,originalRegion, padding, usableExtension, withinActiveRegion);
final GenomeLoc maximumSpan = locParser.createPaddedGenomeLoc(originalRegionRange, usableExtension);
final GenomeLoc idealSpan = locParser.createPaddedGenomeLoc(variantSpan, padding);
final GenomeLoc finalSpan = maximumSpan.intersect(idealSpan).union(variantSpan);
// Make double sure that, if we are emitting GVCF we won't call non-variable positions beyond the target active region span.
// In regular call we don't do so so we don't care and we want to maintain behavior, so the conditional.
final GenomeLoc callableSpan = emitReferenceConfidence ? variantSpan.intersect(originalRegionRange) : variantSpan;
final Pair<GenomeLoc,GenomeLoc> nonVariantRegions = nonVariantTargetRegions(originalRegion, callableSpan);
if ( debug ) {
logger.info("events : " + withinActiveRegion);
logger.info("region : " + originalRegion);
logger.info("callableSpan : " + callableSpan);
logger.info("padding : " + padding);
logger.info("idealSpan : " + idealSpan);
logger.info("maximumSpan : " + maximumSpan);
logger.info("finalSpan : " + finalSpan);
}
return new Result(emitReferenceConfidence,true,originalRegion,padding, usableExtension,withinActiveRegion,nonVariantRegions,finalSpan,idealSpan,maximumSpan,variantSpan);
}
/**
* Calculates the list of region to trim away.
* @param targetRegion region for which to generate the flanking regions.
* @param variantSpan the span of the core region containing relevant variation and required padding.
* @return never {@code null}; 0, 1 or 2 element list.
*/
private Pair<GenomeLoc,GenomeLoc> nonVariantTargetRegions(final ActiveRegion targetRegion, final GenomeLoc variantSpan) {
final GenomeLoc targetRegionRange = targetRegion.getLocation();
final int finalStart = variantSpan.getStart();
final int finalStop = variantSpan.getStop();
final int targetStart = targetRegionRange.getStart();
final int targetStop = targetRegionRange.getStop();
final boolean preTrimmingRequired = targetStart < finalStart;
final boolean postTrimmingRequired = targetStop > finalStop;
if (preTrimmingRequired) {
final String contig = targetRegionRange.getContig();
return postTrimmingRequired ? new Pair<>(
locParser.createGenomeLoc(contig, targetStart, finalStart - 1),
locParser.createGenomeLoc(contig, finalStop + 1, targetStop)) :
new Pair<>(locParser.createGenomeLoc(contig, targetStart, finalStart - 1),GenomeLoc.UNMAPPED);
} else if (postTrimmingRequired)
return new Pair<>(GenomeLoc.UNMAPPED,locParser.createGenomeLoc(targetRegionRange.getContig(), finalStop + 1, targetStop));
else
return new Pair<>(GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED);
}
}

View File

@ -0,0 +1,543 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.collections.CountSet;
import org.broadinstitute.sting.utils.haplotype.EventMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.haplotype.HaplotypeSizeAndBaseComparator;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.*;
/**
* Collection of read assembly using several kmerSizes.
*
* <p>
* There could be a different assembly per each kmerSize. In turn, haplotypes are result of one of those
* assemblies.
* </p>
*
* <p>
* Where there is more than one possible kmerSize that generates a haplotype we consider the smaller one.
* </p>
*
* @author Valentin Ruano-Rubio &lt;valentin@broadinstitute.com&gt;
*/
public class AssemblyResultSet {
private final Map<Integer,AssemblyResult> assemblyResultByKmerSize;
private final Set<Haplotype> haplotypes;
private final Map<Haplotype,AssemblyResult> assemblyResultByHaplotype;
private ActiveRegion regionForGenotyping;
private byte[] fullReferenceWithPadding;
private GenomeLoc paddedReferenceLoc;
private boolean variationPresent;
private Haplotype refHaplotype;
private boolean wasTrimmed = false;
private final CountSet kmerSizes;
private TreeSet<VariantContext> variationEvents;
private boolean debug;
private static Logger logger = Logger.getLogger(AssemblyResultSet.class);
/**
* Constructs a new empty assembly result set.
*/
public AssemblyResultSet() {
assemblyResultByKmerSize = new LinkedHashMap<>(4);
haplotypes = new LinkedHashSet<>(10);
assemblyResultByHaplotype = new LinkedHashMap<>(10);
kmerSizes = new CountSet(4);
}
/**
* Change the debug status for this assembly-result-set.
* @param newValue new value for the debug status.
*/
void setDebug(final boolean newValue) {
debug = newValue;
}
/**
* Trims an assembly result set down based on a new set of trimmed haplotypes.
*
* @param trimmedActiveRegion the trimmed down active region.
*
* @throws NullPointerException if any argument in {@code null} or
* if there are {@code null} entries in {@code originalByTrimmedHaplotypes} for trimmed haplotype keys.
* @throws IllegalArgumentException if there is no reference haplotype amongst the trimmed ones.
*
* @return never {@code null}, a new trimmed assembly result set.
*/
public AssemblyResultSet trimTo(final ActiveRegion trimmedActiveRegion) {
final Map<Haplotype,Haplotype> originalByTrimmedHaplotypes = calculateOriginalByTrimmedHaplotypes(trimmedActiveRegion);
if (refHaplotype == null) throw new IllegalStateException();
if (trimmedActiveRegion == null) throw new NullPointerException();
final AssemblyResultSet result = new AssemblyResultSet();
for (final Haplotype trimmed : originalByTrimmedHaplotypes.keySet()) {
final Haplotype original = originalByTrimmedHaplotypes.get(trimmed);
if (original == null)
throw new NullPointerException("all trimmed haplotypes must have an original one");
final AssemblyResult as = assemblyResultByHaplotype.get(original);
if (as == null) result.add(trimmed); else result.add(trimmed, as);
}
result.setRegionForGenotyping(trimmedActiveRegion);
result.setFullReferenceWithPadding(this.fullReferenceWithPadding);
result.setPaddedReferenceLoc(this.paddedReferenceLoc);
if (result.refHaplotype == null)
throw new IllegalStateException("missing reference haplotype in the trimmed set");
result.wasTrimmed = true;
return result;
}
private Map<Haplotype, Haplotype> calculateOriginalByTrimmedHaplotypes(final ActiveRegion trimmedActiveRegion) {
if ( debug ) logger.info("Trimming active region " + getRegionForGenotyping() + " with " + getHaplotypeCount() + " haplotypes");
final List<Haplotype> haplotypeList = getHaplotypeList();
// trim down the haplotypes
final Map<Haplotype,Haplotype> originalByTrimmedHaplotypes = new HashMap<>();
for ( final Haplotype h : haplotypeList ) {
final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc());
if ( trimmed != null ) {
if (originalByTrimmedHaplotypes.containsKey(trimmed)) {
if (trimmed.isReference()) {
originalByTrimmedHaplotypes.remove(trimmed);
originalByTrimmedHaplotypes.put(trimmed, h);
}
} else
originalByTrimmedHaplotypes.put(trimmed,h);
} else if (h.isReference())
throw new IllegalStateException("trimming eliminates the reference haplotype");
else if ( debug ) {
logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() +
" because it starts with or ends with an insertion or deletion when trimmed to " +
trimmedActiveRegion.getExtendedLoc());
}
}
// create the final list of trimmed haplotypes
final List<Haplotype> trimmedHaplotypes = new ArrayList<>(originalByTrimmedHaplotypes.keySet());
// resort the trimmed haplotypes.
Collections.sort(trimmedHaplotypes,new HaplotypeSizeAndBaseComparator());
final Map<Haplotype,Haplotype> sortedOriginalByTrimmedHaplotypes = new LinkedHashMap<>(trimmedHaplotypes.size());
for (final Haplotype trimmed : trimmedHaplotypes)
sortedOriginalByTrimmedHaplotypes.put(trimmed,originalByTrimmedHaplotypes.get(trimmed));
if ( debug ) logger.info("Trimmed region to " + trimmedActiveRegion.getLocation() + " size " +
trimmedActiveRegion.getLocation().size() + " reduced number of haplotypes from " +
haplotypeList.size() + " to only " + trimmedHaplotypes.size());
if ( debug )
for ( final Haplotype remaining: trimmedHaplotypes )
logger.info("Remains: " + remaining + " cigar " + remaining.getCigar());
return sortedOriginalByTrimmedHaplotypes;
}
/**
* Query the reference haplotype in the result set.
* @return {@code null} if none wasn't yet added, otherwise a reference haplotype.
*/
public Haplotype getReferenceHaplotype() {
return refHaplotype;
}
/**
* Checks whether there is any variation present in the assembly result set.
*
* <p>
* This is equivalent to whether there is more than one haplotype.
* </p>
*
* @return {@code true} if there is variation present, {@code false} otherwise.
*/
public boolean isVariationPresent() {
return variationPresent && haplotypes.size() > 1;
}
/**
* Dumps debugging information into a print-writer.
*
* @param pw where to dump the information.
*
* @throws NullPointerException if {@code pw} is {@code null}.
*/
public void debugDump(final PrintWriter pw) {
if (getHaplotypeList().size() == 0) {
return;
}
pw.println("Active Region " + this.regionForGenotyping.getLocation());
pw.println("Extended Act Region " + this.getRegionForGenotyping().getExtendedLoc());
pw.println("Ref haplotype coords " + getHaplotypeList().get(0).getGenomeLocation());
pw.println("Haplotype count " + haplotypes.size());
final Map<Integer,Integer> kmerSizeToCount = new HashMap<>();
for (final Map.Entry<Haplotype,AssemblyResult> e : assemblyResultByHaplotype.entrySet()) {
final AssemblyResult as = e.getValue();
final int kmerSize = as.getGraph().getKmerSize();
if (kmerSizeToCount.containsKey(kmerSize)) {
kmerSizeToCount.put(kmerSize,kmerSizeToCount.get(kmerSize) + 1);
} else {
kmerSizeToCount.put(kmerSize,1);
}
}
pw.println("Kmer sizes count " + kmerSizeToCount.entrySet().size() );
Integer[] kmerSizes = new Integer[kmerSizeToCount.size()];
kmerSizes = kmerSizeToCount.keySet().toArray(kmerSizes);
Arrays.sort(kmerSizes);
pw.println("Kmer sizes values " + Arrays.toString(kmerSizes));
for (int size : kmerSizes) {
pw.println("Kmer size " + size + " count " + kmerSizeToCount.get(size));
}
}
/**
* Adds a haplotype to the result set without indicating a generating assembly result.
*
* <p>
* It is possible to call this method with the same haplotype several times. In that the second and further
* calls won't have any effect (thus returning {@code false}).
* </p>
*
* @param h the haplotype to add to the assembly result set.
*
* @throws NullPointerException if {@code h} is {@code null}
* @throws IllegalArgumentException if {@code h} does not have a genome location.
*
* @return {@code true} if the assembly result set has been modified as a result of this call.
*/
public boolean add(final Haplotype h) {
if (h == null) throw new NullPointerException("input haplotype cannot be null");
if (h.getGenomeLocation() == null)
throw new IllegalArgumentException("the haplotype provided must have a genomic location");
if (haplotypes.contains(h))
return false;
haplotypes.add(h);
updateReferenceHaplotype(h);
return true;
}
/**
* Adds simultaneously a haplotype and the generating assembly-result.
*
* <p>
* Haplotypes and their assembly-result can be added multiple times although just the first call will have
* any effect (return value is {@code true}).
* </p>
*
*
* @param h haplotype to add.
* @param ar assembly-result that is assumed to have given rise to that haplotype.
*
* @throws NullPointerException if {@code h} or {@code ar} is {@code null}.
* @throws IllegalArgumentException if {@code h} has not defined genome location.
*
* @return {@code true} iff this called changes the assembly result set.
*/
public boolean add(final Haplotype h, final AssemblyResult ar) {
if (h == null) throw new NullPointerException("input haplotype cannot be null");
if (ar == null) throw new NullPointerException("input assembly-result cannot be null");
if (h.getGenomeLocation() == null)
throw new IllegalArgumentException("the haplotype provided must have a genomic location");
final boolean assemblyResultAdditionReturn = add(ar);
if (haplotypes.contains(h)) {
final AssemblyResult previousAr = assemblyResultByHaplotype.get(h);
if (previousAr == null) {
assemblyResultByHaplotype.put(h, ar);
return true;
} else if (!previousAr.equals(ar))
throw new IllegalStateException("there is already a different assembly result for the input haplotype");
else
return assemblyResultAdditionReturn;
} else {
haplotypes.add(h);
assemblyResultByHaplotype.put(h,ar);
updateReferenceHaplotype(h);
if (h.isNonReference()) variationPresent = true;
return true;
}
}
/**
* Add a assembly-result object.
*
* @param ar the assembly result to add.
*
* @throws NullPointerException if {@code ar} is {@code null}.
* @throws IllegalStateException if there is an assembly result with the same kmerSize.
* @return {@code true} iff this addition changed the assembly result set.
*/
public boolean add(final AssemblyResult ar) {
if (ar == null)
throw new NullPointerException();
final int kmerSize = ar.getKmerSize();
if (assemblyResultByKmerSize.containsKey(kmerSize)) {
if (!assemblyResultByKmerSize.get(kmerSize).equals(ar))
throw new IllegalStateException("a different assembly result with the same kmerSize was already added");
return false;
} else {
assemblyResultByKmerSize.put(kmerSize, ar);
kmerSizes.add(kmerSize);
return true;
}
}
/**
* Returns the current region for genotyping.
*
* @return might be {@code null}.
*/
public ActiveRegion getRegionForGenotyping() {
return regionForGenotyping;
}
/**
* Sets the region for genotyping.
*
* @param regionForGenotyping the new value.
*/
public void setRegionForGenotyping(final ActiveRegion regionForGenotyping) {
this.regionForGenotyping = regionForGenotyping;
}
/**
* Returns the current full reference with padding.
*
* @return might be {@code null}.
*/
public byte[] getFullReferenceWithPadding() {
return fullReferenceWithPadding;
}
/**
* Sets the full reference with padding base sequence.
*
* @param fullReferenceWithPadding the new value.
*/
public void setFullReferenceWithPadding(final byte[] fullReferenceWithPadding) {
this.fullReferenceWithPadding = fullReferenceWithPadding;
}
/**
* Returns the padded reference location.
*
* @return might be {@code null}
*/
public GenomeLoc getPaddedReferenceLoc() {
return paddedReferenceLoc;
}
/**
* Changes the padded reference location.
* @param paddedReferenceLoc the new value.
*/
public void setPaddedReferenceLoc(final GenomeLoc paddedReferenceLoc) {
this.paddedReferenceLoc = paddedReferenceLoc;
}
/**
* Returns the number of haplotypes in the assembly result set.
* @return {@code 0} or greater.
*/
public int getHaplotypeCount() {
return haplotypes.size();
}
/**
* Returns the haplotypes as a list.
*
* <p>
* The result is unmodifiable.
* </p>
*
* @return never {@code null}, but perhaps a empty list if no haplotype was generated during assembly.
*/
public List<Haplotype> getHaplotypeList() {
return Arrays.asList(haplotypes.toArray(new Haplotype[haplotypes.size()]));
}
/**
* Returns the maximum kmerSize available.
*
* @throws IllegalStateException if no assembly-result was added to the set, thus there is no kmerSize.
*
* @return greater than 0.
*/
public int getMaximumKmerSize() {
if (kmerSizes.size() == 0)
throw new IllegalStateException("there is yet no kmerSize in this assembly result set");
return kmerSizes.max();
}
/**
* Indicates whether there are more than one kmerSize in the set.
*
* @return {@code true} iff there is more than one kmerSize assembly in the set.
*/
public boolean hasMultipleKmerSizes() {
return kmerSizes.size() > 1;
}
/**
* Returns the minimum kmerSize available.
*
* @throws IllegalStateException if no assembly-result was added to the set, thus there is no kmerSize.
*
* @return greater than 0.
*/
public int getMinimumKmerSize() {
if (kmerSizes.size() == 0)
throw new IllegalStateException("there is yet no kmerSize in this assembly result set");
return kmerSizes.min();
}
/**
* Returns a read-threading graph in the assembly set that has a particular kmerSize.
*
* @param kmerSize the requested kmerSize.
*
* @return {@code null} if there is no read-threading-graph amongst assembly results with that kmerSize.
*/
public ReadThreadingGraph getUniqueReadThreadingGraph(final int kmerSize) {
final AssemblyResult assemblyResult = assemblyResultByKmerSize.get(kmerSize);
if (assemblyResult == null) return null;
return assemblyResult.getThreadingGraph();
}
/**
* Checks whether this assembly result set was trimmed.
*
* @return {@code true} iff this assembly result set was trimmed.
*/
public boolean wasTrimmed() {
return wasTrimmed;
}
/**
* Marks the assembly as not having variation even if it has more than one haplotype.
*/
public void resetVariationPresent() {
variationPresent = false;
}
/**
* Dumps debugging information into a logger.
*
* @param logger where to dump the information.
*
* @throws NullPointerException if {@code logger} is {@code null}.
*/
public void debugDump(final Logger logger) {
final StringWriter sw = new StringWriter();
final PrintWriter pw = new PrintWriter(sw);
debugDump(pw);
final String str = sw.toString();
final String[] lines = str.split("\n");
for (final String line : lines) {
if (line.isEmpty()) {
continue;
}
logger.debug(line);
}
}
/**
* Given whether a new haplotype that has been already added to {@link #haplotypes} collection is the
* reference haplotype and updates {@link #refHaplotype} accordingly.
*
* <p>
* This method assumes that the colling code has verified that the haplotype was not already in {@link #haplotypes}
* I.e. that it is really a new one. Otherwise it will result in an exception if it happen to be a reference
* haplotype and this has already be set. This is the case even if the new haplotypes and the current reference
* are equal.
* </p>
*
* @param newHaplotype the new haplotype.
* @throws NullPointerException if {@code newHaplotype} is {@code null}.
* @throws IllegalStateException if there is already a reference haplotype.
*/
private void updateReferenceHaplotype(final Haplotype newHaplotype) {
if (!newHaplotype.isReference()) return;
if (refHaplotype == null)
refHaplotype = newHaplotype;
else // assumes that we have checked wether the haplotype is already in the collection and so is no need to check equality.
throw new IllegalStateException("the assembly-result-set already have a reference haplotype that is different");
}
/**
* Returns a sorted set of variant events that best explain the haplotypes found by the assembly
* across kmerSizes.
*
* <p/>
* The result is sorted incrementally by location.
*
* @return never {@code null}, but perhaps an empty collection.
*/
public TreeSet<VariantContext> getVariationEvents() {
if (variationEvents == null) {
final List<Haplotype> haplotypeList = getHaplotypeList();
EventMap.buildEventMapsForHaplotypes(haplotypeList,fullReferenceWithPadding,paddedReferenceLoc,debug);
variationEvents = EventMap.getAllVariantContexts(haplotypeList);
}
return variationEvents;
}
}

Some files were not shown because too many files have changed in this diff Show More